In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import string

import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK stop-word corpus (reported as up-to-date when already cached).
# NOTE(review): later cells call nltk.pos_tag and WordNetLemmatizer, which
# presumably need their own NLTK resources that are not downloaded here --
# confirm a fresh environment can run them.
nltk.download('stopwords')

# English stop words minus eight words that must survive stop-word removal.
stop_words = set(stopwords.words('english')).difference(
    ['at', 'do', 'your', 'from', 'to', 'out', 'no', 'the']
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Absolute location of the pre-processed tweet CSV on the author's machine;
# adjust this path before running the notebook elsewhere.
path = '/home/manikya_varshney/Documents/Python/Yale/All Graphs/final_processed_data/final_processed_april10.csv'

# Load the whole file into a DataFrame; later cells read its 'final' column.
data = pd.read_csv(filepath_or_buffer=path, low_memory=False)

In [3]:
# Raw search terms; the next cell lower-cases, de-punctuates and lemmatizes them.
keywords = [
    'bored',
    'lonely',
    'stress',
    'anxiety',
    'scared',
    'worry',
    'end',
    'cabin fever',
    '#sideeffectsofquarantinelife',
    'tissue paper',
    'toilet paper',
]

##### 1. Casing (Upper or lower case)
##### 2. Noise Removal (Removal of punctuation, white spaces, special characters, HTML tags)
##### 3. Tokenization (Tweets to tokens, i.e. words separated by spaces)
##### 4. Stopword Removal
##### 5. Text Normalization (Stemming and Lemmatization)

In [4]:
# Normalise every keyword in place so it can be compared with the
# pre-processed tweet text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_ALLOWED_CHARS = frozenset(string.ascii_letters + string.digits + ' ')


def _clean_keyword(text):
    """Return ``text`` lower-cased and reduced to ASCII letters, digits and spaces.

    Steps, in order: lower-case; delete ASCII punctuation; turn newlines into
    spaces so words on separate lines stay separate; delete every remaining
    character outside ``[a-zA-Z0-9 ]`` (curly quotes, emoji, ...); trim
    leading and trailing spaces.
    """
    text = text.lower().translate(_PUNCTUATION_TABLE)
    text = text.replace('\n', ' ')
    # str.replace() matches its argument literally, so a regex-style pattern
    # string would never match anything; filter the characters explicitly.
    text = ''.join(ch for ch in text if ch in _ALLOWED_CHARS)
    return text.strip(' ')


# Slice assignment updates the existing list object in place.
keywords[:] = [_clean_keyword(keyword) for keyword in keywords]

#Tokenize
#keywords_tokens = [sub.split() for sub in keywords] 

#Remove stop words
def remove_stopwords(data):
    """Drop stop words from every sentence in ``data``.

    Each string is split on whitespace, tokens present in the module-level
    ``stop_words`` set are discarded, and the survivors are re-joined with
    single spaces.

    Returns a new list holding one filtered string per input string.
    """
    return [
        ' '.join(token for token in sentence.split() if token not in stop_words)
        for sentence in data
    ]

# Strip stop words from the cleaned keywords.
keywords_filtered = remove_stopwords(keywords)

# Stemming: Porter-stem each space-separated token, then glue every phrase
# back together with single spaces.
ps = PorterStemmer()
keywords_stem = [
    " ".join(ps.stem(token) for token in phrase.split(" "))
    for phrase in keywords_filtered
]

#Lemmatizing

#POSTags
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of that tag picks adjective (J), verb (V) or adverb (R).  Nouns and
    every other tag fall back to ``wordnet.NOUN``.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()

# Lemmatize each space-separated token with its own POS tag (one token list
# per keyword), then re-join the tokens so every keyword is a single string.
keywords_lem = [
    [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in phrase.split(" ")]
    for phrase in keywords_filtered
]
keywords_final = [" ".join(tokens) for tokens in keywords_lem]

## Fuzzy Matching

In [5]:
# Coerce every value of the 'final' column to a Python string so the fuzzy
# matcher always receives text.
data['final'] = data['final'].map(str)

# Plain-list copy of the processed text column.
choices = data['final'].tolist()

In [6]:
 def fuzzy_m(row):
     """Attach the best fuzzy keyword match to ``row`` and return it.

     The text in ``row['final']`` is compared against every entry of the
     module-level ``keywords_final`` list with ``fuzz.partial_ratio``.  The
     winning score is stored in ``row['final_score']`` and the winning keyword
     in ``row['final_keyword_match']``; the same ``row`` object is returned.

     NOTE(review): partial_ratio scores substrings, so a short keyword such as
     'end' reaches 100 on text that merely contains 'send' (visible in the
     displayed results) -- confirm this is intended.
     """
     best_keyword, best_score = process.extractOne(
         row['final'], keywords_final, scorer=fuzz.partial_ratio
     )
     # Score first, keyword second: this fixes the order of the new columns.
     row['final_score'] = best_score
     row['final_keyword_match'] = best_keyword
     return row

In [7]:
# Score every row against the keyword list (row-wise apply).
interim_ne = data.apply(fuzzy_m, axis='columns')

In [8]:
# Preview the scored frame, including the new final_score and
# final_keyword_match columns.
interim_ne

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
0,2.879999e+09,@MeetThePress @chucktodd If Trump is doing suc...,trump great job the usa 30 the global covid19 ...,50,lonely
1,1.631563e+09,"@MyCielola, @FIOB_Oficial, @MayanLeague among ...",among indigenous organization denounce discrim...,67,end
2,4.175449e+08,It’s spring break and I’m supposed to be visit...,spring break suppose to visit family tuscaloo...,75,bore
3,1.003550e+09,One of my hardworking pharmacy techs fell ill ...,one hardworking pharmacy tech fell ill today c...,60,worry
4,1.238635e+18,Looking for a quarantine boyfriend. Just text ...,look quarantine boyfriend text good morning go...,100,end
...,...,...,...,...,...
12993,3.821768e+08,Tonight’s virtual town hall with the RNS COVID...,tonight virtual town hall the rn covid19 task...,50,lonely
12994,1.230608e+18,ICYMI: Commissioner @HodgenMainda is featured ...,icymi commissioner feature the new issue the t...,67,end
12995,1.238883e+18,COVID Insurance Update 4/9/2020 https://t.co/j...,covid insurance update 492020 via,50,scar
12996,1.039776e+08,Good thing there are no rush hours these days....,good thing no rush hour day sq issue 1546 fine...,75,scar


In [9]:
# Total number of scored rows, captured before any filtering.
denominator = len(interim_ne)

In [10]:
# Cast the match scores to integers so they can be compared with an integer
# threshold.
interim_ne['final_score'] = interim_ne['final_score'].astype(int)

In [11]:
# Keep only rows whose best keyword match scored exactly 100.  This rebinds
# interim_ne to the filtered frame.
interim_ne = interim_ne.loc[interim_ne['final_score'].eq(100)]

In [12]:
# Preview the rows that survived the score == 100 filter.
interim_ne

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
4,1.238635e+18,Looking for a quarantine boyfriend. Just text ...,look quarantine boyfriend text good morning go...,100,end
20,7.504820e+17,These girls are taking a little Coronavirus br...,girl take little coronavirus break hit the res...,100,end
22,1.907757e+07,I think I hit the #coronavirus jackpot. In jus...,think hit the coronavirus jackpot two trip man...,100,toilet paper
24,1.229759e+18,@TimNoEgo @FrakerMonica What Trump means is th...,trump mean grueling to stretch out watch quara...,100,cabin fever
37,2.885170e+08,Time to curl up with that glass of wine 🍷 or t...,time to curl glass wine cup amp catch the late...,100,end
...,...,...,...,...,...
12860,3.173852e+08,"@JoeBiden You’re right, we should send the ent...",right send the entire bill to china,100,end
12865,3.160511e+06,Big formula companies told me they couldn’t he...,big formula company told help u see send new...,100,end
12878,8.364905e+07,"Also: I deliberately left out Gwenpool, Dr. St...",also deliberately left out gwenpool dr strange...,100,scar
12964,2.427755e+07,JUST IN: 72% of AMERICANS say that they would ...,72 american say would attend game coronavirus ...,100,end


In [13]:
# Number of rows that reached a score of 100.
numerator = len(interim_ne)

In [14]:
# Share of all scored rows whose best keyword match scored 100.
proportion = numerator / denominator
print(proportion)

0.062317279581474073
