### Here I will document my method for cleaning my data files, which were taken from Kaggle's dataset “Comments on articles published in the New York Times” (https://www.kaggle.com/aashita/nyt-comments).
    
##### The first section of this code combines my data files into a single large file

In [1]:
import glob
import os
os.chdir("../Data")

import pandas as pd
import glob

interesting_files = glob.glob("../Comments*.csv")
df_list = []

if len(df_list) > 0: 

    for filename in sorted(interesting_files):
        df_list.append(pd.read_csv(filename))
    full_df = pd.concat(df_list)

    full_df.to_csv('allComments.csv', index=False)

interesting_files = glob.glob("Articles*.csv")
df_list = []

if len(df_list) > 0: 
    for filename in sorted(interesting_files):
        df_list.append(pd.read_csv(filename))
    full_df = pd.concat(df_list)

    full_df.to_csv('allArticles.csv', index=False)

### Initial goals: 

##### -Make sure the contents of each field are the correct type and have no missing data (i.e. scrub the 'NaN' from the 'abstract' field)

##### -Make sure that the data comes properly tokenized

##### -Convert all words to lowercase (to avoid confusion between uppercase and lowercase versions of the same word)

In [2]:
# Handle imports, then load the data file

import pandas as pd
import numpy as np

import re
import csv

import nltk
from nltk import sent_tokenize, word_tokenize, pos_tag


sent_token = nltk.sent_tokenize
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

train = pd.read_csv("allComments.csv")
train = train.dropna(how='all')
#train['recommendations'].head(5)
train.head(5)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,approveDate,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,editorsSelection,parentID,...,userLocation,userTitle,userURL,inReplyTo,articleID,sectionName,newDesk,articleWordCount,printPage,typeOfMaterial
0,1491245000.0,This project makes me happy to be a 30+ year T...,22022598.0,22022598.0,<br/>,comment,1491237000.0,1.0,False,0.0,...,"Riverside, CA",,,0.0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2.0,News
1,1491189000.0,Stunning photos and reportage. Infuriating tha...,22017350.0,22017350.0,,comment,1491180000.0,1.0,False,0.0,...,<br/>,,,0.0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2.0,News
2,1491189000.0,Brilliant work from conception to execution. I...,22017334.0,22017334.0,<br/>,comment,1491179000.0,1.0,False,0.0,...,Raleigh NC,,,0.0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2.0,News
3,1491168000.0,NYT reporters should provide a contributor's l...,22015913.0,22015913.0,<br/>,comment,1491150000.0,1.0,False,0.0,...,"Missouri, USA",,,0.0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2.0,News
4,1491168000.0,Could only have been done in print. Stunning.,22015466.0,22015466.0,<br/>,comment,1491147000.0,1.0,False,0.0,...,"Tucson, Arizona",,,0.0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2.0,News


In [3]:
train= train.astype(str)
train.fillna(0)

### Note down which features are strings that need cleaning

strings = [1, 5, 10, 22, 24, 25, 26, 29, 30, 33]

from sklearn.feature_extraction.text import CountVectorizer

wn = nltk.WordNetLemmatizer()

# Define functions for cleaning

def lemmatize_text(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

def clean_articles(doc):
    for index, column in enumerate(doc):
        if index in strings: 
            
            # Remove NaN, uppercase.
            doc[column] = doc[column].str.replace('[^\w\s]','')
            doc[column] = doc[column].str.lower()
            doc[column] = doc[column].replace(np.nan, '', regex=True)
            
            # Tokenize text
            doc[column].apply(nltk.word_tokenize)
            doc[column].apply(lemmatize_text)
            doc[column].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
        else:
            doc[column] = doc[column].astype(str)
            continue
    return doc

clean_comments = clean_articles(train)
#The second command takes awhile to run
clean_comments.head(5)

clean_comments['recommendations'].head(5)

0    2.0
1    1.0
2    3.0
3    7.0
4    5.0
Name: recommendations, dtype: object

In [4]:
print("Any null values left: "), print(clean_comments.isnull().values.any())

Any null values left: 
False


(None, None)

In [5]:
print(len(clean_comments))
clean_comments.nunique()

4999


approveDate              2729
commentBody              4996
commentID                4999
commentSequence          4999
commentTitle                2
commentType                 2
createDate               4928
depth                       2
editorsSelection            2
parentID                  816
parentUserDisplayName     723
permID                   4999
picURL                    494
recommendations           268
recommendedFlag             1
replyCount                 22
reportAbuseFlag             1
sharing                     2
status                      1
timespeople                 1
trusted                     2
updateDate               2860
userDisplayName          3028
userID                   3577
userLocation             1511
userTitle                   1
userURL                     1
inReplyTo                 816
articleID                  13
sectionName                 6
newDesk                    10
articleWordCount           13
printPage                   6
typeOfMate

##### I see that there are no null values remaining, but looking at the dataframe I see that several columns contain nothing but 'nan' strings or otherwise have only one value. I want to drop the commentTitle (contains only '< br/ >' or nan), recommendedFlag, reportAbuseFlag, status, timespeople, userTitle and userURL columns.

In [6]:
clean_comments.drop(columns=['commentTitle', 'recommendedFlag', 'reportAbuseFlag', 'status', 'timespeople', 'userTitle', 'userURL'], axis=1, inplace=True)

clean_comments.head(5)

Unnamed: 0,approveDate,commentBody,commentID,commentSequence,commentType,createDate,depth,editorsSelection,parentID,parentUserDisplayName,...,userDisplayName,userID,userLocation,inReplyTo,articleID,sectionName,newDesk,articleWordCount,printPage,typeOfMaterial
0,1491245186.0,this project makes me happy to be a 30 year ti...,22022598.0,22022598.0,comment,1491237056.0,1.0,False,0.0,,...,rob gayle,46006296.0,riverside ca,0.0,58def1347c459f24986d7c80,unknown,insider,716.0,2.0,news
1,1491188619.0,stunning photos and reportage infuriating that...,22017350.0,22017350.0,comment,1491180489.0,1.0,False,0.0,,...,susan a,29202761.0,br,0.0,58def1347c459f24986d7c80,unknown,insider,716.0,2.0,news
2,1491188617.0,brilliant work from conception to execution iv...,22017334.0,22017334.0,comment,1491179470.0,1.0,False,0.0,,...,meta,63944806.0,raleigh nc,0.0,58def1347c459f24986d7c80,unknown,insider,716.0,2.0,news
3,1491167820.0,nyt reporters should provide a contributors li...,22015913.0,22015913.0,comment,1491150196.0,1.0,False,0.0,,...,tom wyrick,1266184.0,missouri usa,0.0,58def1347c459f24986d7c80,unknown,insider,716.0,2.0,news
4,1491167815.0,could only have been done in print stunning,22015466.0,22015466.0,comment,1491147284.0,1.0,False,0.0,,...,joe sharkey,61121360.0,tucson arizona,0.0,58def1347c459f24986d7c80,unknown,insider,716.0,2.0,news


###### Now let's save the cleaned file.

In [7]:
com_file_name = "cleaned_comment_data.csv"
clean_com_csv = clean_comments.to_csv(com_file_name, encoding='utf-8', index=False)