In [1]:
import pandas as pd
import numpy as np 
import csv
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Source CSV for this analysis (absolute, machine-specific location).
path = '/home/manikya_varshney/Documents/Python/Yale/h01-20200818-10files/h01-20200818-153021.csv'
# low_memory=False: parse the file in one pass instead of in internal chunks.
data = pd.read_csv(filepath_or_buffer=path, low_memory=False)

In [3]:
# (rows, columns) of the raw frame as loaded.
data.shape

(19992, 79)

In [4]:
# Number of rows per value of the `lang` column.
data['lang'].value_counts()

en     11229
es      4317
pt      1518
ja       539
und      468
fr       338
tr       256
de       204
it       190
nl       181
ko       162
hi       153
ar        93
ca        55
tl        48
in        39
pl        38
zh        19
el        18
ru        16
ro        13
ht        10
sv        10
et        10
cs         8
sl         7
da         6
ur         6
cy         5
eu         5
th         5
fi         3
mr         3
bn         3
lv         3
ta         3
hu         2
no         2
ne         1
vi         1
fa         1
my         1
is         1
te         1
uk         1
Name: lang, dtype: int64

# Data Cleaning

##### is_quote --> False 

In [5]:
# Rows whose `is_quote` flag equals False, with the index renumbered from 0.
# NOTE(review): the recorded output of this cell is KeyError: 'is_quote', i.e. the
# loaded frame had no column of that name — confirm the column name.
interim = (
    data
    .loc[data['is_quote'].eq(False)]
    .reset_index(drop=True)
)

KeyError: 'is_quote'

In [None]:
# (rows, columns) of the non-quote subset selected in the previous cell.
interim.shape

In [None]:
# Start the working frame from the non-quote rows: author id and tweet text only.
non_rep = interim.loc[:, ['user_id', 'text']]

##### is_quote --> True 

In [None]:
# Rows whose `is_quote` flag equals True, with the index renumbered from 0.
interim = (
    data
    .loc[data['is_quote'].eq(True)]
    .reset_index(drop=True)
)

In [None]:
# (rows, columns) of the quote-tweet subset selected in the previous cell.
interim.shape

In [None]:
# Reduce the quote tweets to the same two columns kept in `non_rep`.
interim = interim.loc[:, ['user_id', 'text']]

In [None]:
# Give the quote-tweet frame the same column labels as `non_rep` so the two can be stacked.
# Both frames were sliced to ['user_id', 'text'] in earlier cells, so the labels already match.
interim.columns = non_rep.columns

In [None]:
# (rows, columns) of the quote-tweet frame about to be stacked onto `non_rep`.
interim.shape

In [None]:
# Stack the quote tweets under the non-quote tweets with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent and gives the same result on older versions.
non_rep = pd.concat([non_rep, interim], ignore_index=True)

In [None]:
# (rows, columns) after stacking non-quote and quote tweets.
non_rep.shape

In [None]:
# Select the quote tweets again; the following cells pull the quoted tweet's own
# fields (quoted_user_id, quoted_text) out of these rows.
interim = (
    data
    .loc[data['is_quote'].eq(True)]
    .reset_index(drop=True)
)

In [None]:
# (rows, columns) of the quote-tweet subset selected in the previous cell.
interim.shape

In [None]:
# Peek at the quoted tweets' text; only the first rows are shown so the saved
# notebook does not carry a full dump of tweet text.
interim[['quoted_text']].head()

In [None]:
# Keep only the quoted tweet's author id and text.
interim = interim.loc[:, ['quoted_user_id', 'quoted_text']]

In [None]:
# Preview the quoted-tweet frame (first rows only rather than the whole frame).
interim.head()

In [None]:
# Relabel the quoted-tweet columns to the names used by `non_rep`.
interim = interim.rename(
    {'quoted_user_id': 'user_id', 'quoted_text': 'text'},
    axis='columns',
)

In [None]:
# Stack the quoted (original) tweets under the rows collected so far, with a fresh index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported equivalent and gives the same result on older versions.
non_rep = pd.concat([non_rep, interim], ignore_index=True)

In [None]:
# (rows, columns) after also stacking the quoted tweets.
non_rep.shape

In [None]:
# Preview the combined frame (first rows only rather than the whole frame).
non_rep.head()

# Analysis

In [None]:
import pandas as pd
import numpy as np 
import csv
import re
import string
import emoji

import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

In [None]:
import nltk
# Fetch the NLTK resources used below: stop-word list, tokenizer model,
# WordNet and the POS tagger.
for resource in ('stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# English stop words, minus a handful of words that must survive the stop-word filter.
stop_words = set(stopwords.words('english')).difference(
    ['at', 'do', 'your', 'from', 'to', 'out', 'no', 'the']
)

In [None]:
# Keep an untouched copy of the tweet text next to the column that gets cleaned.
non_rep = non_rep.assign(text_duplicate=non_rep['text'])

In [None]:
# Preview the frame with the new text_duplicate column (first rows only).
non_rep.head()

In [None]:
# Lower-case the text so later matching (stop words, duplicates) ignores case.
non_rep['text'] = non_rep.text.str.lower()

In [None]:
#Removing emojis
# Compiled once and reused by every demoji() call.
# The earlier class contained the range U+24C2..U+1F251, which spans most of the
# Basic Multilingual Plane and therefore also deleted CJK ideographs, kana,
# Hangul and other real text. Only emoji/symbol ranges are listed here.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U000024C2"             # circled M
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U00002600-\U000027BF"  # miscellaneous symbols + dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # emoji located in CJK symbol blocks
    "\U0000FE00-\U0000FE0F"  # variation selectors
    "\U00010000-\U0010FFFF"  # everything outside the BMP (kept from the original)
    "]+",
    flags=re.UNICODE,
)


def demoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Every run of characters matched by ``_EMOJI_PATTERN`` is deleted; all
    other characters, including non-Latin letters, are returned unchanged.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without the matched characters.
    """
    return _EMOJI_PATTERN.sub('', text)

# Missing values are blanked first: astype(str) would otherwise turn NaN into the
# literal word "nan", which would then be tokenized like real text.
non_rep['text'] = non_rep['text'].fillna('').astype(str)
# Strip emoji from every tweet.
non_rep['text'] = non_rep['text'].apply(demoji)

In [None]:
#Remove URLs
# Drops http(s) links, tokens starting with "www" and tokens containing ".com".
# The www/.com alternatives used to require a literal leading space, so such a
# token at the very start of a tweet, or after a newline/tab, was left in;
# (?:^|\s) covers those positions as well. The separate "https" alternative was
# unreachable (http\S+ already matches it), and the two ".com" alternatives
# collapse to \S+\.com\S*.
non_rep['text'] = non_rep['text'].str.replace(
    r"http\S+|(?:^|\s)www\S+|(?:^|\s)\S+\.com\S*", "", regex=True
)

In [None]:
# Delete @mentions: an "@" followed by one or more word characters.
non_rep['text'] = non_rep['text'].str.replace(
    pat=r'\@[\w]+',
    repl="",
    regex=True,
)

In [None]:
# Delete every character listed in string.punctuation (ASCII punctuation only).
punctuation_table = str.maketrans("", "", string.punctuation)
non_rep['text'] = non_rep['text'].str.translate(punctuation_table)

In [None]:
#More Cleaning
# Newlines and em-dashes become spaces, then curly quotes and spaces are trimmed
# from both ends in a single pass, so mixed sequences such as ' “hi” ' are fully
# trimmed (the former one-character-at-a-time strips left them behind).
# The former '/[^a-zA-Z0-9 ]/g' replacement was JavaScript regex-literal syntax:
# as a Python pattern it only matches a literal "/", one character outside the
# class, then "/g". That cannot occur once the punctuation cell has removed "/",
# so the step is dropped rather than "fixed", which would strip all non-ASCII text.
non_rep['text'] = (
    non_rep['text']
    .astype(str)
    .str.replace('\n', ' ', regex=False)
    .str.replace('—', ' ', regex=False)
    .str.strip('“”’ ')
)

In [None]:
#Tokenizing
non_rep['text'] = non_rep['text'].astype(str)
# Tokenize column-wise: Series.apply hands each string straight to the tokenizer,
# avoiding the per-row Series that DataFrame.apply(axis=1) builds, and it also
# works on an empty frame.
non_rep['tokenized_text'] = non_rep['text'].apply(nltk.word_tokenize)

# Remove stop words: keep only the tokens that are not in `stop_words`.
non_rep['filtered_text'] = non_rep['tokenized_text'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
# Stemming: reduce every remaining token to its Porter stem.
ps = PorterStemmer()
non_rep['stemmed_text'] = non_rep['filtered_text'].apply(
    lambda tokens: list(map(ps.stem, tokens))
)

In [None]:
#POSTags
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is inspected: J -> adjective, N -> noun,
    V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised initial resolve to noun.
    return wordnet.NOUN

In [None]:
# Lemmatizing: lemmatize each filtered token using the part of speech
# returned by get_wordnet_pos for that token.
lemmatizer = WordNetLemmatizer()
non_rep['lemmatized_text'] = non_rep['filtered_text'].apply(
    lambda tokens: [
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(token))
        for token in tokens
    ]
)

In [None]:
# Join the lemmatized tokens back into one space-separated string per row.
non_rep['final'] = non_rep['lemmatized_text'].apply(" ".join)

In [None]:
#Remove punctuations
# One translate pass deletes ASCII punctuation together with the curly quotes
# ’ “ ”; newlines and em-dashes then become spaces and edge spaces are trimmed.
# This yields the same strings as the former multi-pass version. Dropped from it:
#  * the '/[^a-zA-Z0-9 ]/g' replacement — JavaScript regex-literal syntax that, as
#    a Python pattern, needs a literal "/" and so can never match after "/" has
#    been deleted;
#  * the curly-quote strips, which had nothing left to strip;
#  * the commented-out copy of the same chain for the 'text' column.
final_cleanup_table = str.maketrans("", "", string.punctuation + "’“”")
non_rep['final'] = (
    non_rep['final']
    .str.translate(final_cleanup_table)
    .astype(str)
    .str.replace('\n', ' ', regex=False)
    .str.replace('—', ' ', regex=False)
    .str.strip(' ')
)

In [None]:
# Preview: shape the frame would have with one row per distinct cleaned text.
non_rep.drop_duplicates(subset=['final']).shape

In [None]:
# Keep one row per distinct cleaned text and renumber the index from 0.
non_rep = (
    non_rep
    .drop_duplicates(subset=['final'])
    .reset_index(drop=True)
)

In [None]:
# (rows, columns) after removing rows with duplicate cleaned text.
non_rep.shape

In [None]:
# Preview the de-duplicated frame (first rows only rather than the whole frame).
non_rep.head()

In [None]:
# Export the author id, the untouched original text and the cleaned text.
# Columns are selected by name: the former positional picks [0, 2, 7] only landed
# on these three columns when every preceding cell had run exactly once and in
# order, and would silently export different columns otherwise.
cols = ['user_id', 'text_duplicate', 'final']
data_final = non_rep[cols]
data_final.to_csv('/home/manikya_varshney/Documents/Python/Yale/h01-20200818-10files/final_h01-20200818-153021.csv',index=False)

In [None]:
# Preview the exported frame (first rows only rather than the whole frame).
data_final.head()

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
# Concatenate every cleaned tweet into one corpus string for the word cloud.
temp = ' '.join(data_final['final'])
wordcloud = WordCloud(
    width=800,
    height=500,
    background_color='white',
    min_font_size=10,
).generate(temp)

# Render the cloud on a square figure with the axes hidden and no padding.
fig, ax = plt.subplots(figsize=(8, 8), facecolor=None)
ax.imshow(wordcloud)
ax.axis("off")
fig.tight_layout(pad=0)
plt.show()