In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import string
import sklearn

import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK stop-word corpus used to build the stop list below.
nltk.download('stopwords')

# English stop words minus a handful of function words that must survive
# filtering: 'at', 'do' and 'your' occur inside the keyword phrases
# ('stay at home', 'do your part'); the rest are kept as well.
stop_words = set(stopwords.words('english')).difference(
    {'at', 'do', 'your', 'from', 'to', 'out', 'no', 'the'}
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Location of the preprocessed tweets CSV. This assignment was commented out,
# which left `path` undefined and made the read below fail with NameError on a
# fresh kernel (Restart & Run All).
# NOTE(review): machine-specific absolute path — point it at your own copy.
path = '/home/manikya_varshney/Documents/Python/Yale/All Graphs/final_processed_data/final_processed_april08.csv'

# low_memory=False makes pandas read the file in one pass instead of chunks.
data = pd.read_csv(path, low_memory=False)

In [3]:
# Display the loaded frame (rendered by the notebook as the cell's last expression).
data

Unnamed: 0,user_id,text_duplicate,final
0,2.612814e+09,@paul_pitlyk @UrKillinMePetey Yes - we usually...,yes usually go together since the covid stuff ...
1,7.790337e+17,@scotyvonnendave @MeatBrothers1 @TheKosh66 @Du...,look great homemade meatball sub cookingisther...
2,1.232162e+18,CDC guidance says some essential workers expos...,cdc guidance say essential worker expose to co...
3,1.232162e+18,Pompeo says State Dept. has repatriated more t...,pompeo say state dept repatriate 50000 u citiz...
4,5.565826e+08,At this point I’m not even wearing a mask or ...,at point even wear mask glove to protect from...
...,...,...,...
14573,2.675937e+07,And of course the theater reaction to PORTALS ...,course the theater reaction to portal bonkers
14574,1.084921e+18,We're pretty bullish on this prediction: the C...,pretty bullish prediction the covid19 crisis t...
14575,2.153201e+09,Has anyone asked Gwyneth Paltrow what we shoul...,anyone ask gwyneth paltrow
14576,3.164441e+09,The line to vote at Riverside High School in M...,the line to vote at riverside high school milw...


In [4]:
# Raw keyword phrases searched for in the tweets. Order matters: it is the
# order in which candidates are handed to the fuzzy matcher later on.
# NOTE(review): '#quarantineacitivites' looks like a misspelling of
# '#quarantineactivities' — confirm before changing, it alters matching.
keywords = [
    # staying home / responsibility
    'stay at home',
    'do your part',
    'Responsible',
    'home',
    'house',
    # cancellations and closures
    'cancel',
    'shutdown',
    'postpone',
    'school closure',
    'Closure',
    'business closure',
    'suspension',
    # quarantine and distancing
    'quarantine',
    'lockdown',
    'social distance',
    'social distancing',
    'self quarantine',
    'isolat',
    '6-feet',
    'distance',
    # hashtags
    '#clubquarantine',
    '#quarantinelife',
    '#quarantineacitivites',
]

##### 1. Casing (Upper or lower case)
##### 2. Noise Removal (Removal of punctuation, white spaces, special characters, HTML tags)
##### 3. Tokenization (Tweets to tokens, i.e. words separated by spaces)
##### 4. Stopword Removal
##### 5. Text Normalization (Stemming and Lemmatization)

In [5]:
# Characters a cleaned keyword may contain: lowercase ASCII letters, digits, space.
_KEYWORD_ALPHABET = set(string.ascii_lowercase + string.digits + ' ')


def _clean_keyword(phrase):
    """Return ``phrase`` lower-cased and reduced to ``[a-z0-9 ]``.

    Steps, in order:
      1. lower-case the phrase;
      2. turn newlines into spaces so words on separate lines stay separated;
      3. drop every character outside ``_KEYWORD_ALPHABET`` — this covers
         ASCII punctuation ('6-feet' -> '6feet', '#tag' -> 'tag') as well as
         curly quotes and any other symbol;
      4. trim leading/trailing spaces.

    The previous code tried step 3 with
    ``str.replace('/[^a-zA-Z0-9 ]/g', '')``. ``str.replace`` matches a literal
    substring, not a regular expression, so that call never removed anything.
    """
    lowered = phrase.lower().replace('\n', ' ')
    kept = ''.join(ch for ch in lowered if ch in _KEYWORD_ALPHABET)
    return kept.strip(' ')


# Normalise every keyword (casing + noise removal). Safe to re-run: cleaning
# an already-clean keyword returns it unchanged.
keywords = [_clean_keyword(phrase) for phrase in keywords]

#Tokenize
#keywords_tokens = [sub.split() for sub in keywords] 

#Remove stop words
def remove_stopwords(data, exclude=None):
    """Remove stop words from each phrase in ``data``.

    Parameters
    ----------
    data : iterable of str
        Phrases to filter. Each phrase is split on whitespace.
    exclude : collection of str, optional
        Words to drop. When omitted, the notebook-level ``stop_words`` set is
        used, which is what existing single-argument calls rely on.

    Returns
    -------
    list of str
        One entry per input phrase, with the surviving words re-joined by a
        single space. A phrase made only of stop words becomes ``''``.
    """
    if exclude is None:
        # Resolved at call time so the default follows the current stop list.
        exclude = stop_words
    return [
        ' '.join(word for word in sentence.split() if word not in exclude)
        for sentence in data
    ]

# Keyword phrases with stop words removed.
keywords_filtered = remove_stopwords(keywords)

# Stemming: Porter-stem each space-separated token, then rebuild the phrase.
ps = PorterStemmer()
keywords_stem = [
    " ".join(ps.stem(token) for token in phrase.split(" "))
    for phrase in keywords_filtered
]

#Lemmatizing

#POSTags
def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own (a one-element token list), and only the
    first letter of the resulting tag is used: J -> adjective, N -> noun,
    V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    pos_tag_label = nltk.pos_tag([word])[0][1]
    initial = pos_tag_label[0].upper()
    wordnet_by_initial = dict(
        J=wordnet.ADJ,
        N=wordnet.NOUN,
        V=wordnet.VERB,
        R=wordnet.ADV,
    )
    if initial in wordnet_by_initial:
        return wordnet_by_initial[initial]
    return wordnet.NOUN

# Lemmatize each token of the stop-word-filtered keywords using its POS tag,
# then join the tokens back into the phrases used for fuzzy matching.
lemmatizer = WordNetLemmatizer()
keywords_lem = [
    [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in phrase.split(" ")]
    for phrase in keywords_filtered
]
keywords_final = list(map(" ".join, keywords_lem))

## Fuzzy Matching

In [6]:
# Force every entry of 'final' to a Python string so the matcher always
# receives text (a missing value would become the string 'nan').
data['final'] = data['final'].map(str)

# Plain-list copy of the cleaned tweet texts.
choices = data['final'].tolist()

In [7]:
def fuzzy_m(row):
    """Attach the best-matching keyword and its score to one tweet row.

    Meant to be used with ``DataFrame.apply(fuzzy_m, axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        A row holding the cleaned tweet text under ``'final'``.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with two extra entries: ``'final_score'`` (the
        ``fuzz.partial_ratio`` score of the best keyword) and
        ``'final_keyword_match'`` (that keyword, taken from ``keywords_final``).

    Notes
    -----
    The stored output shows a tweet containing 'homemade' scoring 100 against
    the keyword 'home', so a score of 100 does not imply a whole-word match.
    """
    # Work on a copy: pandas does not support functions that mutate the object
    # handed to them by DataFrame.apply.
    scored = row.copy()
    keyword_match, score = process.extractOne(
        scored['final'], keywords_final, scorer=fuzz.partial_ratio
    )
    scored['final_score'] = score
    scored['final_keyword_match'] = keyword_match
    return scored

In [8]:
# Run fuzzy_m on every row (axis=1 passes one row at a time); the result
# carries the extra 'final_score' and 'final_keyword_match' columns.
interim_imp = data.apply(fuzzy_m, axis=1)

In [9]:
# Display the scored frame (all rows, before any filtering).
interim_imp

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
0,2.612814e+09,@paul_pitlyk @UrKillinMePetey Yes - we usually...,yes usually go together since the covid stuff ...,50,stay at home
1,7.790337e+17,@scotyvonnendave @MeatBrothers1 @TheKosh66 @Du...,look great homemade meatball sub cookingisther...,100,home
2,1.232162e+18,CDC guidance says some essential workers expos...,cdc guidance say essential worker expose to co...,67,cancel
3,1.232162e+18,Pompeo says State Dept. has repatriated more t...,pompeo say state dept repatriate 50000 u citiz...,75,home
4,5.565826e+08,At this point I’m not even wearing a mask or ...,at point even wear mask glove to protect from...,58,do your part
...,...,...,...,...,...
14573,2.675937e+07,And of course the theater reaction to PORTALS ...,course the theater reaction to portal bonkers,80,house
14574,1.084921e+18,We're pretty bullish on this prediction: the C...,pretty bullish prediction the covid19 crisis t...,100,home
14575,2.153201e+09,Has anyone asked Gwyneth Paltrow what we shoul...,anyone ask gwyneth paltrow,50,home
14576,3.164441e+09,The line to vote at Riverside High School in M...,the line to vote at riverside high school milw...,83,do your part


In [10]:
# Total number of scored tweets. Must be taken before the frame is filtered
# down to perfect matches, because `interim_imp` is re-bound by that step.
denominator = len(interim_imp)

In [11]:
# Make sure the score column is integer-typed before comparing it to 100.
interim_imp = interim_imp.astype({'final_score': int})

In [12]:
# Keep only the tweets whose best keyword scored a perfect 100.
is_perfect_match = interim_imp['final_score'].eq(100)
interim_imp = interim_imp.loc[is_perfect_match]

In [13]:
# Display the rows that survived the score == 100 filter.
interim_imp

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
1,7.790337e+17,@scotyvonnendave @MeatBrothers1 @TheKosh66 @Du...,look great homemade meatball sub cookingisther...,100,home
7,6.758222e+06,Does someone want to give me COVID-19? Then I ...,someone want to give covid19 could isolate sto...,100,isolat
16,7.248031e+17,I think this man works for China since he lies...,think man work china since lie even well do ‘ ...,100,responsible
25,2.987983e+08,Yeah! #Nantucket\nSTAY HOME SAFE LIVES\nCenter...,yeah nantucket stay home safe life center dise...,100,home
31,3.681214e+07,Are you comfortable while you’re at home? I ho...,comfortable at home hope the family safe atla...,100,home
...,...,...,...,...,...
14539,3.834918e+07,CA understands the anxiety resulting from stay...,ca understands the anxiety result from stay at...,100,stay at home
14548,1.104669e+18,I’ve had to take care of my very sick grandmot...,to take care sick grandmother 4 day nowhere to...,100,home
14552,6.859810e+07,Announcing the 2020 NBA2K HS Hoops Charity Tou...,announce the 2020 nba2k h hoop charity tournam...,100,social distance
14560,2.668168e+08,They’ve been on lockdown for 3 weeks. How coul...,lockdown 3 week could many new case even possible,100,lockdown


In [14]:
# Number of tweets left after keeping only score-100 rows.
numerator = len(interim_imp)

In [15]:
# Share of all scored tweets that contain a perfect keyword match.
proportion = numerator / denominator
print(proportion)

0.12587460557003705
