In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import string

import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Make sure the NLTK stopword corpus is available locally before it is read.
nltk.download('stopwords')

# Start from NLTK's English stopword list, then take a handful of short words
# back out of it so that remove_stopwords() leaves them in place. Several of
# the search phrases ('work from home', 'to go', 'take out', ...) contain them.
stop_words = set(stopwords.words('english')).difference(
    ['at', 'do', 'your', 'from', 'to', 'out', 'no', 'the']
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Absolute location of the input CSV on the author's machine.
path = '/home/manikya_varshney/Documents/Python/Yale/All Graphs/final_processed_data/final_processed_april10.csv'

# low_memory=False asks pandas to parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
data = pd.read_csv(filepath_or_buffer=path, low_memory=False)

In [3]:
# Raw search phrases; they are normalised (case, punctuation, stopwords,
# lemmatisation) in the following cell before being used for fuzzy matching.
keywords = [
    'school from home',
    'learn',
    'remote',
    'school food service',
    'online shopping',
    'online purchase',
    'online church',
    'delivery',
    'drive thru',
    'to go',
    'take out',
    'Tiktok',
    'Netflix',
    'telework',
    'zoom',
    'telehealth',
    'telemedicine',
    'work from home',
    'wfh',
    'working at home',
    'working remotely',
    'online meeting',
]

##### 1. Casing (Upper or lower case)
##### 2. Noise Removal (Removal of punctuation, white spaces, special characters, HTML tags)
##### 3. Tokenization (Tweets to tokens, i.e. words separated by spaces)
##### 4. Stopword Removal
##### 5. Text Normalization (Stemming and Lemmatization)

In [4]:
# Normalise every keyword in place: lower-case it, drop ASCII punctuation,
# turn newlines into spaces, remove any remaining character that is not an
# ASCII letter, digit or space, and trim surrounding spaces.
#
# The earlier version called str.replace('/[^a-zA-Z0-9 ]/g', ''). str.replace
# treats its first argument as a literal substring, not a regular expression,
# so that call never removed anything. It also stripped the opening curly
# quote twice and never the closing one. The explicit character filter below
# does what that pattern describes and makes the curly-quote stripping
# unnecessary.

# Translation table that deletes every character in string.punctuation.
_DROP_PUNCT = str.maketrans('', '', string.punctuation)
# Characters allowed to survive the final filter.
_KEEP_CHARS = set(string.ascii_letters + string.digits + ' ')

for i in range(len(keywords)):
    cleaned = keywords[i].lower()
    cleaned = cleaned.translate(_DROP_PUNCT)
    # Newlines become spaces *before* filtering so adjacent words stay separate.
    cleaned = cleaned.replace('\n', ' ')
    cleaned = ''.join(ch for ch in cleaned if ch in _KEEP_CHARS)
    keywords[i] = cleaned.strip(' ')

#Tokenize
#keywords_tokens = [sub.split() for sub in keywords] 

#Remove stop words
def remove_stopwords(data):
    """Remove stopwords from each string in ``data``.

    Every string is split on whitespace; tokens that appear in the
    module-level ``stop_words`` set are discarded and the remaining tokens
    are joined back together with single spaces.

    Parameters
    ----------
    data : iterable of str
        Phrases to filter.

    Returns
    -------
    list of str
        One filtered phrase per input phrase, in the same order. A phrase
        made up only of stopwords (or an empty phrase) becomes ``''``.
    """
    return [
        ' '.join(token for token in sentence.split() if token not in stop_words)
        for sentence in data
    ]

# Strip stopwords from the cleaned keyword phrases.
keywords_filtered = remove_stopwords(keywords)

# Stemming: Porter-stem each space-separated word and rebuild the phrases.
# (keywords_stem is not referenced again in the cells shown in this notebook.)
ps = PorterStemmer()
keywords_stem = [
    " ".join(ps.stem(token) for token in phrase.split(" "))
    for phrase in keywords_filtered
]

#Lemmatizing

#POSTags
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``nltk.pos_tag``. The first letter of
    the resulting tag (upper-cased) selects the constant: ``J`` -> adjective,
    ``N`` -> noun, ``V`` -> verb, ``R`` -> adverb. Any other tag falls back
    to noun.
    """
    tagged = nltk.pos_tag([word])
    # tagged is a list of (token, tag) pairs; only the tag's first letter matters.
    first_letter = tagged[0][1][0].upper()

    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both map to noun.
    return wordnet.NOUN

# Lemmatize each word of every stopword-filtered phrase, using the POS chosen
# by get_wordnet_pos() for that word.
lemmatizer = WordNetLemmatizer()
keywords_lem = [
    [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in phrase.split(" ")]
    for phrase in keywords_filtered
]

# Re-join the lemmas into phrases; these are the choices used for fuzzy matching.
keywords_final = [" ".join(lemmas) for lemmas in keywords_lem]

## Fuzzy Matching

In [5]:
# Drop rows whose 'final' text is missing *before* converting to str.
# str(NaN) is the literal text 'nan'; converting first would turn missing
# values into ordinary strings that dropna() can no longer detect, and they
# would then be fuzzy-matched as if they were real text.
data.dropna(subset=['final'], inplace=True)

# Force every remaining value to str so the matcher always receives text.
data['final'] = data['final'].apply(str)
choices = data['final'].tolist()

In [6]:
 def fuzzy_m(row):
     """Attach the best-matching keyword and its score to one row.

     Compares ``row['final']`` against every phrase in ``keywords_final``
     using ``fuzz.partial_ratio`` and keeps the single best match.

     Parameters
     ----------
     row : mapping with a ``'final'`` entry
         Called with one DataFrame row at a time via ``DataFrame.apply(axis=1)``.

     Returns
     -------
     The same ``row`` object, with ``'final_score'`` and
     ``'final_keyword_match'`` set.
     """
     best_keyword, best_score = process.extractOne(
         row['final'], keywords_final, scorer=fuzz.partial_ratio
     )
     # Score is written first so the two new columns keep their original order.
     row['final_score'] = best_score
     row['final_keyword_match'] = best_keyword
     return row

In [None]:
# Treat empty strings in 'final' as missing and drop those rows.
nan_value = float("NaN")

# Assign the result back to the column instead of calling
# data['final'].replace(..., inplace=True). That form mutates a Series
# obtained by column selection: recent pandas versions warn about it, and with
# Copy-on-Write enabled it leaves `data` unchanged.
data['final'] = data['final'].replace("", nan_value)
data.dropna(subset=["final"], inplace=True)

In [7]:
# Score every row against the keyword list; fuzzy_m receives one row at a time.
interim_ada = data.apply(fuzzy_m, axis='columns')

In [8]:
# Show the scored frame; the rendered output below is truncated by pandas.
interim_ada

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
0,2.879999e+09,@MeetThePress @chucktodd If Trump is doing suc...,trump great job the usa 30 the global covid19 ...,60,learn
1,1.631563e+09,"@MyCielola, @FIOB_Oficial, @MayanLeague among ...",among indigenous organization denounce discrim...,50,tiktok
2,4.175449e+08,It’s spring break and I’m supposed to be visit...,spring break suppose to visit family tuscaloo...,60,to go
3,1.003550e+09,One of my hardworking pharmacy techs fell ill ...,one hardworking pharmacy tech fell ill today c...,60,online shopping
4,1.238635e+18,Looking for a quarantine boyfriend. Just text ...,look quarantine boyfriend text good morning go...,80,to go
...,...,...,...,...,...
12993,3.821768e+08,Tonight’s virtual town hall with the RNS COVID...,tonight virtual town hall the rn covid19 task...,64,work from home
12994,1.230608e+18,ICYMI: Commissioner @HodgenMainda is featured ...,icymi commissioner feature the new issue the t...,60,drive thru
12995,1.238883e+18,COVID Insurance Update 4/9/2020 https://t.co/j...,covid insurance update 492020 via,50,take out
12996,1.039776e+08,Good thing there are no rush hours these days....,good thing no rush hour day sq issue 1546 fine...,80,drive thru


In [9]:
# Number of scored rows before filtering down to perfect matches.
denominator = len(interim_ada)

In [10]:
# Make sure the score column is integer-typed before comparing against 100.
interim_ada = interim_ada.astype({'final_score': int})

In [11]:
# Keep only the rows whose best keyword match scored exactly 100.
interim_ada = interim_ada.loc[interim_ada['final_score'].eq(100)]

In [12]:
# Number of rows that survived the score == 100 filter.
numerator = len(interim_ada)

In [13]:
# Show the rows that remain after keeping only score == 100 matches.
interim_ada

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
16,6.074032e+08,The DoorDash delivery people I’ve interacted w...,the doordash delivery people interact past we...,100,delivery
97,4.364808e+07,Ended my 4th week of working from home. I'm #g...,end 4th week work from home im grateful good j...,100,work from home
141,9.206726e+17,Bored in the house 🏡 #coronavirus #QuarantineL...,bore the house coronavirus quarantinelife tiktok,100,tiktok
152,8.399553e+07,FCC Commissioner Brendan Carr (@BrendanCarrFCC...,fcc commissioner brendan carr right want ame...,100,telehealth
153,7.535869e+17,Who is not tired of this @realDonaldTrump yo-y...,tire yoyo decision make process tire to lack l...,100,to go
...,...,...,...,...,...
12887,1.598221e+08,I'm taking advantage of recent event cancellat...,im take advantage recent event cancellation le...,100,learn
12911,3.221354e+07,Q4) Many of us are now participating in self-d...,q4 many u participate selfdirected learn exper...,100,learn
12960,9.765427e+08,"3/ Yesterday, I issued a Mayor’s Order that cr...",3 yesterday issue mayor order creates new pro...,100,learn
12970,2.162159e+08,.@GovParsonMO announced that Missouri schools ...,announce missouri school remain close school y...,100,to go


In [14]:
# Share of scored rows whose best keyword match scored 100.
proportion = numerator / denominator
print(proportion)

0.027619633789813816
