In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import string

import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# English stop words, minus a handful that are deliberately kept.
# Several of the kept words occur inside the keyword phrases defined later
# in this notebook (e.g. "work from home", "to go", "take out", "working at home"),
# so removing them would break those phrases.
stop_words = set(stopwords.words('english')).difference(
    {'at', 'do', 'your', 'from', 'to', 'out', 'no', 'the'}
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Location of the processed CSV that is loaded into `data`.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine without editing it.
path = '/home/manikya_varshney/Documents/Python/Yale/All Graphs/final_processed_data/final_processed_march28.csv'
# low_memory=False parses the file in one pass rather than in chunks, which avoids
# mixed-dtype inference on a column.
data = pd.read_csv(path, low_memory=False)

In [3]:
# Raw keyword phrases to search for in the text; they are normalised
# (lower-cased, cleaned, lemmatised) by later cells before matching.
keywords = [
    'school from home',
    'learn',
    'remote',
    'school food service',
    'online shopping',
    'online purchase',
    'online church',
    'delivery',
    'drive thru',
    'to go',
    'take out',
    'Tiktok',
    'Netflix',
    'telework',
    'zoom',
    'telehealth',
    'telemedicine',
    'work from home',
    'wfh',
    'working at home',
    'working remotely',
    'online meeting',
]

##### 1. Casing (Upper or lower case)
##### 2. Noise Removal (Removal of punctuation, white spaces, special characters, HTML tags)
##### 3. Tokenization (Tweets to tokens, i.e. words separated by spaces)
##### 4. Stopword Removal
##### 5. Text Normalization (Stemming and Lemmatization)

In [4]:
# Normalise every keyword in place:
#   1. lower-case
#   2. turn newlines into spaces
#   3. drop every character that is not an ASCII letter, digit or space
#      (this covers ASCII punctuation as well as curly quotes such as “ ” ’)
#   4. trim surrounding spaces
#
# The previous "more cleaning" step called
#     str.replace('/[^a-zA-Z0-9 ]/g', '')
# which is a JavaScript regex literal. str.replace() matches its argument as
# literal text, so that call never removed anything. It also stripped '“' twice
# and never stripped the closing '”'. The character filter below does what that
# pattern was meant to do.
_KEEP_CHARS = set(string.ascii_lowercase + string.digits + ' ')


def _clean_keyword(keyword):
    """Return `keyword` lower-cased, keeping only ASCII letters, digits and spaces."""
    lowered = keyword.lower().replace('\n', ' ')
    kept = ''.join(ch for ch in lowered if ch in _KEEP_CHARS)
    return kept.strip(' ')


# Slice assignment keeps the same list object, as the original index loops did.
keywords[:] = [_clean_keyword(keyword) for keyword in keywords]

#Tokenize
#keywords_tokens = [sub.split() for sub in keywords] 

#Remove stop words
def remove_stopwords(data):
    """Drop stop words from every sentence in `data`.

    Each sentence is split on whitespace, words present in the module-level
    `stop_words` set are discarded, and the remaining words are re-joined
    with single spaces.

    Parameters
    ----------
    data : iterable of str
        Sentences (or keyword phrases) to filter.

    Returns
    -------
    list of str
        One filtered sentence per input sentence, in the same order.
    """
    return [
        ' '.join(token for token in sentence.split() if token not in stop_words)
        for sentence in data
    ]

# Keyword phrases with stop words removed.
keywords_filtered = remove_stopwords(keywords)

# Stemming: stem each space-separated token of every phrase with the Porter
# stemmer and re-join the tokens into a phrase.
ps = PorterStemmer()
keywords_stem = [
    " ".join(ps.stem(token) for token in phrase.split(" "))
    for phrase in keywords_filtered
]

#Lemmatizing

#POSTags
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with `nltk.pos_tag`; the first letter of the
    resulting tag selects the WordNet constant (J -> adjective, N -> noun,
    V -> verb, R -> adverb). Any other tag falls back to noun.
    """
    penn_tag = nltk.pos_tag([word])[0][1]
    first_letter = penn_tag[0].upper()
    wordnet_by_letter = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if first_letter in wordnet_by_letter:
        return wordnet_by_letter[first_letter]
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()

# Lemmatise each space-separated token of every filtered phrase, using the
# part of speech guessed by get_wordnet_pos; keywords_lem holds one token list
# per phrase.
keywords_lem = [
    [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in phrase.split(" ")]
    for phrase in keywords_filtered
]

# Re-join the lemmatised tokens; these phrases are the choices used for fuzzy matching.
keywords_final = list(map(" ".join, keywords_lem))

## Fuzzy Matching

In [5]:
# Force every entry of the 'final' column to a Python str (missing values become
# the text 'nan'), so each row can be handed to the string matcher.
data['final'] = data['final'].map(str)

# Plain list of the texts. NOTE(review): `choices` is not referenced by any
# later cell visible here.
choices = list(data['final'])

In [6]:
 def fuzzy_m(row):
        """Attach the best fuzzy keyword match to one row.

        The row's 'final' text is compared with every phrase in the
        module-level `keywords_final` list using `fuzz.partial_ratio`, and the
        highest-scoring phrase is kept.

        Parameters
        ----------
        row : pandas.Series
            One row of the data frame; must contain a 'final' entry.

        Returns
        -------
        pandas.Series
            A copy of `row` with two extra entries: 'final_score' (the match
            score) and 'final_keyword_match' (the matched keyword phrase).
        """
        best_keyword, best_score = process.extractOne(row['final'], keywords_final, scorer = fuzz.partial_ratio)
        # Work on a copy: pandas documents that functions passed to
        # DataFrame.apply must not mutate the object they receive.
        scored = row.copy()
        scored['final_score'] = best_score
        scored['final_keyword_match'] = best_keyword
        return scored

In [7]:
# Score every row against the keyword list; axis=1 hands each row to fuzzy_m,
# and the returned rows are assembled into a new frame.
interim_ada = data.apply(fuzzy_m, axis=1)

In [8]:
# Show the scored frame (the value of a cell's last expression is rendered as its output).
interim_ada

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
0,4.333995e+07,It’s about that time of the day that I start s...,time the day start softly sing baby one time ...,57,netflix
1,3.253824e+07,Rutgers NJMS medical students will graduate ea...,rutgers njms medical student graduate early to...,62,delivery
2,2.797009e+09,@sjbxtr So the tornado season has begun.. in t...,the tornado season begin the midst the coronav...,50,take out
3,1.000307e+08,Love in the time of Coronavirus. I will rememb...,love the time coronavirus remember time the bl...,100,delivery
4,2.353966e+07,The St. Regis Mohawk Tribe announces its first...,the st regis mohawk tribe announces first conf...,67,work at home
...,...,...,...,...,...
7919,1.155828e+18,White people think blacks are all thieves. So ...,white people think black thief aking black peo...,67,wfh
7920,1.683363e+07,There’s nothing great about having coronavirus...,nothing great coronavirus guess one the silver...,60,learn
7921,2.889925e+09,Think anyone here has it? https://t.co/pWZWfGM0hD,think anyone,50,tiktok
7922,1.541863e+07,Cheers and claps just filled the Manhattan sky...,cheer clap fill the manhattan skyline to appla...,62,telework


In [9]:
# Number of scored rows; used as the denominator of the proportion.
denominator = len(interim_ada)

In [10]:
# Cast the match score column to an integer dtype.
# NOTE(review): presumably needed because the row-wise apply leaves the column
# with a non-integer dtype — confirm.
interim_ada['final_score'] = interim_ada['final_score'].astype(int)

In [11]:
# Keep only the rows whose best keyword match scored exactly 100.
# Note: this rebinds interim_ada, so re-running the cells above is needed to
# get the unfiltered frame back.
interim_ada = interim_ada.loc[interim_ada['final_score'].eq(100)]

In [12]:
# Number of rows currently in interim_ada; used as the numerator of the proportion.
numerator = len(interim_ada)

In [13]:
# Show the current contents of interim_ada.
interim_ada

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
3,1.000307e+08,Love in the time of Coronavirus. I will rememb...,love the time coronavirus remember time the bl...,100,delivery
32,2.497261e+07,Kaitlyn’s Kitchen is accepting to-go and deliv...,kaitlyn kitchen accept togo delivery order co...,100,delivery
88,7.305010e+17,Everyone who has an iPhone. There is now a scr...,everyone iphone screen tool your phone use eve...,100,telemedicine
212,1.085741e+18,I wish Corona would fucking try me cuz I am re...,wish corona would fuck try cuz ready coronavir...,100,netflix
278,1.882533e+07,"Physical not ""social"" distancing!\nWe need to ...",physical social distance need to share support...,100,remote
...,...,...,...,...,...
7780,1.823623e+07,I just spent over an hour doing my makeup to g...,spent hour makeup to go absolutely nowhere,100,to go
7843,1.196900e+04,"Feb: kids, you can have oat milk and 15 minute...",feb kid oat milk 15 minute planet earth mar g...,100,zoom
7874,1.194783e+08,I was mildly sick for a few days w/ covid symp...,mildly sick day w covid symptom 29 yr old heal...,100,learn
7883,8.189105e+07,"My nephew gave me this laptop, so that's my wo...",nephew give laptop thats work from home setup ...,100,work from home


In [14]:
# Ratio of the two row counts computed above.
proportion = numerator / denominator
print(proportion)

0.030035335689045935
