# Import Packages

In [1]:
import pandas as pd
import numpy as np 
import csv
import re
import string

import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings("ignore")

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
path = '/home/manikya_varshney/Documents/Python/Yale/final_h01-20200912-101538.csv'
data = pd.read_csv(path, low_memory=False)

In [4]:
data.shape

(6737, 3)

In [5]:
data

Unnamed: 0,id,user_id,extended_tweet_full_text
0,1.304786e+18,9.053900e+07,woke up to see if the justin bieber pandemic w...
1,1.304786e+18,8.497239e+17,@TeresaCCarter2 “Our intention is to make sure...
2,1.304786e+18,1.293830e+18,For More Information contact us. \nMail:- digi...
3,1.304786e+18,1.188902e+18,"UAE reports 1,007 new Covid-19 cases, highest ..."
4,1.304786e+18,2.273830e+08,Trump officials interfered with CDC reports on...
...,...,...,...
6732,1.304427e+18,6.874206e+07,Why did Twitter suddenly reinstate @clif_high?...
6733,1.304671e+18,8.323244e+17,Denna veckas COVID-19 veckorapport från Folkhä...
6734,1.304768e+18,4.446656e+09,Republicans Defend Trump After He Admitted Dow...
6735,1.301853e+18,3.914277e+08,The recession on the back of the Government's ...


In [6]:
data['extended_tweet_full_text']

0       woke up to see if the justin bieber pandemic w...
1       @TeresaCCarter2 “Our intention is to make sure...
2       For More Information contact us. \nMail:- digi...
3       UAE reports 1,007 new Covid-19 cases, highest ...
4       Trump officials interfered with CDC reports on...
                              ...                        
6732    Why did Twitter suddenly reinstate @clif_high?...
6733    Denna veckas COVID-19 veckorapport från Folkhä...
6734    Republicans Defend Trump After He Admitted Dow...
6735    The recession on the back of the Government's ...
6736    65 Catholics across the street only 7 wearing ...
Name: extended_tweet_full_text, Length: 6737, dtype: object

#####  1. Casing (Upper or lower case)
##### 2. Noise Removal (Removal of punctuation, white spaces, special characters, HTML tags)
##### 3. Tokenization (Tweets to tokens i.e. words seprated by spaces)
##### 4. Stopword Removal
##### 5. Text Normalization (Stemming and Lemmatization)

In [7]:
data['extended_tweet_full_text_duplicate'] = data['extended_tweet_full_text']

In [8]:
#Convert to lower case
data['extended_tweet_full_text'] = data['extended_tweet_full_text'].str.lower()

In [9]:
#Remove URLs
data['extended_tweet_full_text'] = data['extended_tweet_full_text'].str.replace(r"http\S+| www\S+| https\S+| \S+\.com\S+| \S+\.com", "")
#data['extended_tweet_full_text'] = data['extended_tweet_full_text'].str.replace(r"https?:\/\/\S* | www\S+ | \S+\.com\S+", "")


In [10]:
#Remove user @
data['extended_tweet_full_text'] = data['extended_tweet_full_text'].str.replace(r'\@[\w]+', "")

In [11]:
#Remove punctuations
data['extended_tweet_full_text'] = data['extended_tweet_full_text'].str.translate(str.maketrans("", "", string.punctuation))

In [12]:
#More Cleaning
data['extended_tweet_full_text']=data['extended_tweet_full_text'].astype(str).str.replace('/[^a-zA-Z0-9 ]/g', '').str.replace('\n','').str.strip('“').str.strip('’').str.lstrip(' ').str.rstrip(' ')

In [13]:
#Tokenizing
data.extended_tweet_full_text = data.extended_tweet_full_text.astype(str)
data['tokenized_extended_tweet_full_text'] = data.apply(lambda row: nltk.word_tokenize(row.extended_tweet_full_text), axis=1)

# remove stopwords
data['filtered_extended_tweet_full_text'] = data['tokenized_extended_tweet_full_text'].apply(lambda x: [word for word in x if word not in stop_words])

In [14]:
data

Unnamed: 0,id,user_id,extended_tweet_full_text,extended_tweet_full_text_duplicate,tokenized_extended_tweet_full_text,filtered_extended_tweet_full_text
0,1.304786e+18,9.053900e+07,woke up to see if the justin bieber pandemic w...,woke up to see if the justin bieber pandemic w...,"[woke, up, to, see, if, the, justin, bieber, p...","[woke, see, justin, bieber, pandemic, going, b..."
1,1.304786e+18,8.497239e+17,“our intention is to make sure that evidence s...,@TeresaCCarter2 “Our intention is to make sure...,"[“, our, intention, is, to, make, sure, that, ...","[“, intention, make, sure, evidence, scienceba..."
2,1.304786e+18,1.293830e+18,for more information contact us mailfollow on ...,For More Information contact us. \nMail:- digi...,"[for, more, information, contact, us, mailfoll...","[information, contact, us, mailfollow, 📷instag..."
3,1.304786e+18,1.188902e+18,uae reports 1007 new covid19 cases highest sin...,"UAE reports 1,007 new Covid-19 cases, highest ...","[uae, reports, 1007, new, covid19, cases, high...","[uae, reports, 1007, new, covid19, cases, high..."
4,1.304786e+18,2.273830e+08,trump officials interfered with cdc reports on...,Trump officials interfered with CDC reports on...,"[trump, officials, interfered, with, cdc, repo...","[trump, officials, interfered, cdc, reports, c..."
...,...,...,...,...,...,...
6732,1.304427e+18,6.874206e+07,why did twitter suddenly reinstate could it be...,Why did Twitter suddenly reinstate @clif_high?...,"[why, did, twitter, suddenly, reinstate, could...","[twitter, suddenly, reinstate, could, science,..."
6733,1.304671e+18,8.323244e+17,denna veckas covid19 veckorapport från folkhäl...,Denna veckas COVID-19 veckorapport från Folkhä...,"[denna, veckas, covid19, veckorapport, från, f...","[denna, veckas, covid19, veckorapport, från, f..."
6734,1.304768e+18,4.446656e+09,republicans defend trump after he admitted dow...,Republicans Defend Trump After He Admitted Dow...,"[republicans, defend, trump, after, he, admitt...","[republicans, defend, trump, admitted, downpla..."
6735,1.301853e+18,3.914277e+08,the recession on the back of the governments h...,The recession on the back of the Government's ...,"[the, recession, on, the, back, of, the, gover...","[recession, back, governments, handling, covid..."


In [15]:
#Stemming
ps = PorterStemmer()
data['stemmed_extended_tweet_full_text'] = data['filtered_extended_tweet_full_text'].apply(lambda x: [ps.stem(y) for y in x])

In [16]:
data

Unnamed: 0,id,user_id,extended_tweet_full_text,extended_tweet_full_text_duplicate,tokenized_extended_tweet_full_text,filtered_extended_tweet_full_text,stemmed_extended_tweet_full_text
0,1.304786e+18,9.053900e+07,woke up to see if the justin bieber pandemic w...,woke up to see if the justin bieber pandemic w...,"[woke, up, to, see, if, the, justin, bieber, p...","[woke, see, justin, bieber, pandemic, going, b...","[woke, see, justin, bieber, pandem, go, back, ..."
1,1.304786e+18,8.497239e+17,“our intention is to make sure that evidence s...,@TeresaCCarter2 “Our intention is to make sure...,"[“, our, intention, is, to, make, sure, that, ...","[“, intention, make, sure, evidence, scienceba...","[“, intent, make, sure, evid, sciencebas, data..."
2,1.304786e+18,1.293830e+18,for more information contact us mailfollow on ...,For More Information contact us. \nMail:- digi...,"[for, more, information, contact, us, mailfoll...","[information, contact, us, mailfollow, 📷instag...","[inform, contact, us, mailfollow, 📷instagramgy..."
3,1.304786e+18,1.188902e+18,uae reports 1007 new covid19 cases highest sin...,"UAE reports 1,007 new Covid-19 cases, highest ...","[uae, reports, 1007, new, covid19, cases, high...","[uae, reports, 1007, new, covid19, cases, high...","[uae, report, 1007, new, covid19, case, highes..."
4,1.304786e+18,2.273830e+08,trump officials interfered with cdc reports on...,Trump officials interfered with CDC reports on...,"[trump, officials, interfered, with, cdc, repo...","[trump, officials, interfered, cdc, reports, c...","[trump, offici, interf, cdc, report, covid19, ..."
...,...,...,...,...,...,...,...
6732,1.304427e+18,6.874206e+07,why did twitter suddenly reinstate could it be...,Why did Twitter suddenly reinstate @clif_high?...,"[why, did, twitter, suddenly, reinstate, could...","[twitter, suddenly, reinstate, could, science,...","[twitter, suddenli, reinstat, could, scienc, v..."
6733,1.304671e+18,8.323244e+17,denna veckas covid19 veckorapport från folkhäl...,Denna veckas COVID-19 veckorapport från Folkhä...,"[denna, veckas, covid19, veckorapport, från, f...","[denna, veckas, covid19, veckorapport, från, f...","[denna, vecka, covid19, veckorapport, från, fo..."
6734,1.304768e+18,4.446656e+09,republicans defend trump after he admitted dow...,Republicans Defend Trump After He Admitted Dow...,"[republicans, defend, trump, after, he, admitt...","[republicans, defend, trump, admitted, downpla...","[republican, defend, trump, admit, downplay, t..."
6735,1.301853e+18,3.914277e+08,the recession on the back of the governments h...,The recession on the back of the Government's ...,"[the, recession, on, the, back, of, the, gover...","[recession, back, governments, handling, covid...","[recess, back, govern, handl, covid, econom, h..."


In [17]:
data

Unnamed: 0,id,user_id,extended_tweet_full_text,extended_tweet_full_text_duplicate,tokenized_extended_tweet_full_text,filtered_extended_tweet_full_text,stemmed_extended_tweet_full_text
0,1.304786e+18,9.053900e+07,woke up to see if the justin bieber pandemic w...,woke up to see if the justin bieber pandemic w...,"[woke, up, to, see, if, the, justin, bieber, p...","[woke, see, justin, bieber, pandemic, going, b...","[woke, see, justin, bieber, pandem, go, back, ..."
1,1.304786e+18,8.497239e+17,“our intention is to make sure that evidence s...,@TeresaCCarter2 “Our intention is to make sure...,"[“, our, intention, is, to, make, sure, that, ...","[“, intention, make, sure, evidence, scienceba...","[“, intent, make, sure, evid, sciencebas, data..."
2,1.304786e+18,1.293830e+18,for more information contact us mailfollow on ...,For More Information contact us. \nMail:- digi...,"[for, more, information, contact, us, mailfoll...","[information, contact, us, mailfollow, 📷instag...","[inform, contact, us, mailfollow, 📷instagramgy..."
3,1.304786e+18,1.188902e+18,uae reports 1007 new covid19 cases highest sin...,"UAE reports 1,007 new Covid-19 cases, highest ...","[uae, reports, 1007, new, covid19, cases, high...","[uae, reports, 1007, new, covid19, cases, high...","[uae, report, 1007, new, covid19, case, highes..."
4,1.304786e+18,2.273830e+08,trump officials interfered with cdc reports on...,Trump officials interfered with CDC reports on...,"[trump, officials, interfered, with, cdc, repo...","[trump, officials, interfered, cdc, reports, c...","[trump, offici, interf, cdc, report, covid19, ..."
...,...,...,...,...,...,...,...
6732,1.304427e+18,6.874206e+07,why did twitter suddenly reinstate could it be...,Why did Twitter suddenly reinstate @clif_high?...,"[why, did, twitter, suddenly, reinstate, could...","[twitter, suddenly, reinstate, could, science,...","[twitter, suddenli, reinstat, could, scienc, v..."
6733,1.304671e+18,8.323244e+17,denna veckas covid19 veckorapport från folkhäl...,Denna veckas COVID-19 veckorapport från Folkhä...,"[denna, veckas, covid19, veckorapport, från, f...","[denna, veckas, covid19, veckorapport, från, f...","[denna, vecka, covid19, veckorapport, från, fo..."
6734,1.304768e+18,4.446656e+09,republicans defend trump after he admitted dow...,Republicans Defend Trump After He Admitted Dow...,"[republicans, defend, trump, after, he, admitt...","[republicans, defend, trump, admitted, downpla...","[republican, defend, trump, admit, downplay, t..."
6735,1.301853e+18,3.914277e+08,the recession on the back of the governments h...,The recession on the back of the Government's ...,"[the, recession, on, the, back, of, the, gover...","[recession, back, governments, handling, covid...","[recess, back, govern, handl, covid, econom, h..."


In [18]:
#POSTags
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [None]:
#Lemmatizing
lemmatizer = WordNetLemmatizer()
data['lemmatized_extended_tweet_full_text'] = data['filtered_extended_tweet_full_text'].apply(lambda x: [lemmatizer.lemmatize(y, get_wordnet_pos(y)) for y in x])

In [None]:
data

In [None]:
#Joining the lemmetized tokens to form string
data['final'] = data['lemmatized_extended_tweet_full_text'].apply(lambda x: " ".join([word for word in x]))

In [None]:
data

In [None]:
cols = [0, 1, 2,3, 8]
data_final = data[data.columns.values[cols]]
data_final.to_csv('final_processed_h01-20200912-101538.csv',index=False)

In [None]:
data_final

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
temp=' '.join(data['final'].tolist())
wordcloud = WordCloud(width = 800, height = 500, background_color ='white', min_font_size = 10).generate(temp)
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0) 
plt.show()