In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import string

import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK stop-word corpus; the call reports "already up-to-date"
# when the package is present locally.
nltk.download('stopwords')

# English stop words minus a handful of short words that are kept.
# Presumably they are retained because several search phrases contain them
# (e.g. "work from home", "to go", "take out") -- confirm with the author.
stop_words = set(stopwords.words('english')).difference(
    {'at', 'do', 'your', 'from', 'to', 'out', 'no', 'the'}
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Location of the pre-processed tweet CSV that the rest of the notebook scores.
# NOTE(review): absolute path inside one user's home directory -- the notebook
# will not run elsewhere until this is made configurable/relative.
path = '/home/manikya_varshney/Documents/Python/Yale/All Graphs/final_processed_data/final_processed_april01.csv'
# low_memory=False: parse the file in a single pass instead of in chunks, which
# avoids mixed-dtype column inference (see pandas read_csv documentation).
data = pd.read_csv(path, low_memory=False)

In [3]:
# Search phrases that are fuzzy-matched against each tweet further down.
# Order is preserved from the original list; casing is normalised later.
keywords = [
    'school from home',
    'learn',
    'remote',
    'school food service',
    'online shopping',
    'online purchase',
    'online church',
    'delivery',
    'drive thru',
    'to go',
    'take out',
    'Tiktok',
    'Netflix',
    'telework',
    'zoom',
    'telehealth',
    'telemedicine',
    'work from home',
    'wfh',
    'working at home',
    'working remotely',
    'online meeting',
]

##### 1. Casing (Upper or lower case)
##### 2. Noise Removal (Removal of punctuation, white spaces, special characters, HTML tags)
##### 3. Tokenization (Tweets to tokens, i.e. words separated by spaces)
##### 4. Stopword Removal
##### 5. Text Normalization (Stemming and Lemmatization)

In [4]:
# Characters a cleaned keyword may contain: ASCII letters, digits and the space.
_KEYWORD_CHARS = frozenset(string.ascii_letters + string.digits + ' ')


def _clean_keyword(text):
    """Return ``text`` lower-cased and reduced to ASCII letters, digits and spaces.

    Line breaks are turned into spaces first so that words on separate lines
    stay separated. Every remaining character outside the whitelist (ASCII
    punctuation, curly quotes such as “ ” ’, other symbols) is dropped, and
    leading/trailing spaces are trimmed.
    """
    lowered = text.lower().replace('\n', ' ')
    kept = ''.join(ch for ch in lowered if ch in _KEYWORD_CHARS)
    return kept.strip(' ')


# The previous clean-up passed the JavaScript regex literal
# '/[^a-zA-Z0-9 ]/g' to str.replace, which only looks for that exact text and
# therefore removed nothing; the whitelist above does what that pattern
# describes and also covers the punctuation and curly-quote stripping.
# Slice assignment keeps `keywords` the same list object, as the original
# index-based loops did.
keywords[:] = [_clean_keyword(keyword) for keyword in keywords]

#Tokenize
#keywords_tokens = [sub.split() for sub in keywords] 

#Remove stop words
def remove_stopwords(data, stopword_set=None):
    """Drop stop words from every sentence in ``data``.

    Parameters
    ----------
    data : iterable of str
        Sentences to filter. Each one is split on whitespace.
    stopword_set : container of str, optional
        Words to remove. When omitted, the notebook-level ``stop_words`` set
        is used, which is the behaviour existing callers rely on.

    Returns
    -------
    list of str
        One string per input sentence, with the surviving words joined by a
        single space. A sentence made up only of stop words yields ``''``.

    Notes
    -----
    Matching is an exact, case-sensitive membership test on each word.
    """
    if stopword_set is None:
        # Resolved at call time so the function still follows the notebook's
        # current stop-word set.
        stopword_set = stop_words
    return [
        ' '.join(word for word in sentence.split() if word not in stopword_set)
        for sentence in data
    ]

# Keywords with stop words stripped; used for both stemming and lemmatizing.
keywords_filtered = remove_stopwords(keywords)

#Stemming
# Each phrase is split on single spaces, every token is Porter-stemmed, and
# the stems are joined back into one string per phrase.
ps = PorterStemmer()
keywords_stem = [
    " ".join(ps.stem(token) for token in phrase.split(" "))
    for phrase in keywords_filtered
]

#Lemmatizing

#POSTags
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects adjective (J), noun (N), verb (V) or adverb (R).
    Any other tag falls back to noun.
    """
    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    # pos_tag returns [(word, tag)]; only the tag of the single entry matters.
    _, treebank_tag = nltk.pos_tag([word])[0]
    initial = treebank_tag[0].upper()
    return pos_by_initial.get(initial, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()


def _lemmatize_phrase(phrase):
    """Return the lemma of every space-separated token in ``phrase`` as a list."""
    return [
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in phrase.split(" ")
    ]


# One list of lemmas per filtered keyword ...
keywords_lem = [_lemmatize_phrase(phrase) for phrase in keywords_filtered]
# ... re-joined into the phrases that are used for fuzzy matching.
keywords_final = [" ".join(lemmas) for lemmas in keywords_lem]

## Fuzzy Matching

In [5]:
# Convert every value of the 'final' column with str() so each row holds text.
# Presumably this guards the matcher against non-string cells such as NaN --
# TODO confirm.
data['final'] = data['final'].apply(str)
# Plain Python list of the same texts.
# NOTE(review): `choices` is not referenced by any later cell shown here.
choices = data['final'].tolist()

In [6]:
 def fuzzy_m(row):
     """Attach the best-matching keyword and its score to one DataFrame row.

     ``row['final']`` is compared with every entry of ``keywords_final`` using
     ``fuzz.partial_ratio``, and the top result of ``process.extractOne`` is
     kept. The row is updated in place with ``final_score`` and
     ``final_keyword_match`` and then returned, which lets the function be
     passed to ``DataFrame.apply(..., axis=1)``.
     """
     best_match = process.extractOne(
         row['final'], keywords_final, scorer=fuzz.partial_ratio
     )
     # extractOne yields a (choice, score) pair for a list of choices.
     keyword_match, score = best_match
     row['final_score'] = score
     row['final_keyword_match'] = keyword_match
     return row

In [7]:
# Score every row: axis=1 hands fuzzy_m one row at a time, and the returned
# rows (now carrying final_score / final_keyword_match) form the new frame.
# NOTE(review): this is a per-row Python call, likely the slowest cell here.
interim_ada = data.apply(fuzzy_m, axis=1)

In [8]:
# Display the scored frame (all rows, before any filtering on the score).
interim_ada

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
0,28433993,As Chicago households fill out 2020 census dur...,chicago household fill out 2020 census coronav...,50,school from home
1,101043870,Most of the press corps is very busy covering ...,the press corp busy cover the really important...,75,take out
2,16475267,@realDonaldTrump Mar 10–he was very specific: ...,mar 10–he specific google 1700 engineer work ...,75,telework
3,16475267,"@realDonaldTrump , You were caustic &amp; sarc...",caustic amp sarcastic the rollout come even f...,60,to go
4,345397708,"Florida nears 8,000 coronavirus cases, as stat...",florida nears 8000 coronavirus case state repo...,62,telework
...,...,...,...,...,...
16185,751644712570847236,If in 1938 Turkey’s government could produce a...,1938 turkey government could produce health m...,70,telehealth
16186,286628737,Sandy Medford was a friend of my family for de...,sandy medford friend family decade suspect hea...,60,learn
16187,361010205,One month ago Trump claimed the number of Amer...,one month ago trump claimed the number america...,60,to go
16188,481389842,Yes. This. Especially if a substantial proport...,yes especially substantial proportion workforc...,60,to go


In [9]:
# Number of scored rows, taken before interim_ada is narrowed to exact matches.
denominator = len(interim_ada)

In [10]:
# Cast the score column to int so the comparison with 100 below works on
# integer values.
interim_ada['final_score'] = interim_ada['final_score'].astype(int)

In [11]:
# Keep only rows whose best keyword reached a partial_ratio score of 100.
# NOTE(review): this rebinds interim_ada, so re-running the denominator cell
# afterwards would count the filtered rows instead of all rows.
interim_ada = interim_ada.loc[interim_ada['final_score'].eq(100)]

In [12]:
# Number of rows left after keeping only score == 100.
numerator = len(interim_ada)

In [13]:
# Display the rows that survived the score == 100 filter.
interim_ada

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
38,859016395,A new normal for Congress:\n- working from hom...,new normal congress work from home coronavirus...,100,work from home
77,1074530978,"Hello Twitter, I have a rant. I work at Pizza ...",hello twitter rant work at pizza hut throughou...,100,delivery
92,232044555,A1 @a1softball schooling from home (Covid-19) ...,a1 school from home covid19 id auto shop 101 t...,100,school from home
101,743183764130398208,Nothing has done more for a social media appli...,nothing do social medium application covid19 a...,100,tiktok
102,256650910,Question 🤔: After this whole #coronavirus pand...,question whole coronavirus pandemic do think c...,100,work remotely
...,...,...,...,...,...
15867,1096052747315625984,Mar 2020:\n-1st time sleeping in on a sat. w/ ...,mar 2020 1st time sleep sat w no obligation of...,100,remote
15964,915288902,Be careful with your #Amazon orders! See my Fa...,careful your amazon order see facebook post cl...,100,delivery
16133,2849624090,Don't know why they're trying to blame this la...,dont know theyre try to blame lady cancel mard...,100,learn
16138,516835849,This Tik-Tok pandemic could be worse than COVI...,tiktok pandemic could bad covid,100,tiktok


In [14]:
# Share of all scored rows whose best keyword match scored exactly 100.
proportion = numerator / denominator
print(proportion)

0.03057442865966646
