In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import string

import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# NLTK resources used by this notebook: the stop-word list (here), plus WordNet
# for WordNetLemmatizer and the perceptron tagger for nltk.pos_tag (both used
# in the lemmatization step). Fetching all of them up front lets the notebook
# run top-to-bottom on a machine that has never downloaded NLTK data.
nltk.download('stopwords')
# quiet=True so these extra fetches do not add noise to the cell output.
# NOTE(review): recent NLTK releases may look the tagger up under
# 'averaged_perceptron_tagger_eng' -- confirm against the installed version.
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# English stop words minus a handful that are deliberately kept; 'the', for
# example, is part of keyword phrases such as 'slow the spread'.
stop_words = set(stopwords.words('english')) - set(['at', 'do', 'your', 'from', 'to', 'out', 'no', 'the'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Location of the pre-processed tweet CSV.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path = '/home/manikya_varshney/Documents/Python/Yale/All Graphs/Test Data/final_processed_april01.csv'

# low_memory=False makes pandas read the file in one pass instead of chunks.
data = pd.read_csv(filepath_or_buffer=path, low_memory=False)

In [3]:
# Raw keyword phrases to look for in the tweets (normalised in the next cell).
keywords = [
    'Flatten the curve',
    'Slow the spread',
    'slow transmission',
    'protect',
    'save',
    '#stayhomesavelives',
]

##### 1. Casing (Upper or lower case)
##### 2. Noise Removal (Removal of punctuation, white spaces, special characters, HTML tags)
##### 3. Tokenization (Tweets to tokens, i.e. words separated by spaces)
##### 4. Stopword Removal
##### 5. Text Normalization (Stemming and Lemmatization)

In [4]:
# Characters that survive cleaning: ASCII letters, digits and the space.
_KEEP_CHARS = set(string.ascii_letters + string.digits + ' ')


def clean_keyword(text):
    """Normalise one keyword phrase.

    Steps: lower-case, turn newlines into spaces, drop every character that
    is not an ASCII letter, digit or space (this removes punctuation, '#',
    curly quotes, ...), then trim surrounding spaces.

    The character filter replaces the earlier
    ``.replace('/[^a-zA-Z0-9 ]/g', '')`` call: ``str.replace`` matches its
    argument as a literal substring, so that JavaScript-style pattern never
    removed anything.
    """
    lowered = text.lower().replace('\n', ' ')
    return ''.join(ch for ch in lowered if ch in _KEEP_CHARS).strip(' ')


# Lower-casing, punctuation removal and the remaining cleaning in one pass.
keywords = [clean_keyword(keyword) for keyword in keywords]

#Tokenize
#keywords_tokens = [sub.split() for sub in keywords] 

#Remove stop words
def remove_stopwords(data, stop_set=None):
    """Remove stop words from each sentence.

    Parameters
    ----------
    data : iterable of str
        Sentences; each one is split on whitespace.
    stop_set : collection of str, optional
        Words to drop. When omitted, the notebook-level ``stop_words`` set
        is used, which matches the previous behaviour.

    Returns
    -------
    list of str
        One string per input sentence with the remaining words joined by
        single spaces. A sentence made only of stop words becomes ''.
    """
    if stop_set is None:
        # Looked up at call time so the function can be defined before
        # (or re-run independently of) the cell that builds stop_words.
        stop_set = stop_words
    return [
        ' '.join(word for word in sentence.split() if word not in stop_set)
        for sentence in data
    ]

# Keyword phrases with stop words removed.
keywords_filtered = remove_stopwords(keywords)

# Stemming: stem each space-separated token with the Porter stemmer and
# re-join the stems into one string per phrase.
ps = PorterStemmer()
keywords_stem = [
    " ".join(ps.stem(token) for token in phrase.split(" "))
    for phrase in keywords_filtered
]

#Lemmatizing

#POSTags
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects the constant: J -> ADJ, V -> VERB, R -> ADV.
    N, and any other tag, maps to NOUN.
    """
    _token, treebank_tag = nltk.pos_tag([word])[0]
    initial = treebank_tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag fall back to noun.
    return wordnet.NOUN

# Lemmatize every token of every filtered phrase, using the token's own POS
# tag, then re-join the lemmas into the final keyword strings.
lemmatizer = WordNetLemmatizer()
keywords_lem = [
    [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in phrase.split(" ")]
    for phrase in keywords_filtered
]
keywords_final = list(map(" ".join, keywords_lem))

## Fuzzy Matching

In [5]:
# Drop rows with no processed text *before* the string cast: str() would turn
# a missing value into the literal text 'nan', which dropna() can no longer
# detect and which would then be fuzzy-matched as if it were a tweet.
# .copy() gives an independent frame so the column assignment below is safe.
data = data.loc[data['final'].notna()].copy()
data['final'] = data['final'].apply(str)
choices = data['final'].tolist()

In [6]:
 def fuzzy_m(row):
     """Attach the best-matching keyword and its score to one row.

     Compares ``row['final']`` against every entry of ``keywords_final``
     with ``fuzz.partial_ratio`` and keeps the top match.

     Returns a copy of ``row`` with two extra entries: ``final_score`` and
     ``final_keyword_match``. The row handed in by ``DataFrame.apply`` is
     left untouched, because pandas does not support functions that mutate
     the object they are given.
     """
     keyword_match, score = process.extractOne(
         row['final'], keywords_final, scorer=fuzz.partial_ratio
     )
     scored = row.copy()
     scored['final_score'] = score
     scored['final_keyword_match'] = keyword_match
     return scored

In [7]:
# Treat empty processed text as missing, then drop those rows.
# Results are assigned back instead of using inplace=True: calling an in-place
# method on the selection data['final'] is chained assignment and, with pandas
# Copy-on-Write, leaves `data` unchanged.
nan_value = float("NaN")
data['final'] = data['final'].replace("", nan_value)
data = data.dropna(subset=["final"])

In [8]:
# Score every tweet against the keyword list, one row at a time.
interim_purp = data.apply(fuzzy_m, axis="columns")



In [9]:
# Display the scored frame (pandas abbreviates the middle rows with "...").
interim_purp

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
0,2.843399e+07,As Chicago households fill out 2020 census dur...,chicago household fill out 2020 census coronav...,57,protect
1,1.010439e+08,Most of the press corps is very busy covering ...,the press corp busy cover the really important...,57,protect
2,1.647527e+07,@realDonaldTrump Mar 10–he was very specific: ...,mar 10–he specific google 1700 engineer work ...,50,save
3,1.647527e+07,"@realDonaldTrump , You were caustic &amp; sarc...",caustic amp sarcastic the rollout come even f...,50,save
4,3.453977e+08,"Florida nears 8,000 coronavirus cases, as stat...",florida nears 8000 coronavirus case state repo...,50,save
...,...,...,...,...,...
16920,9.832288e+17,#SierraLeone has registered its index case of ...,sierraleone register index case covid19 from d...,71,protect
16921,1.856380e+07,KTAR News reporter @TaylorKinnerup gives a cor...,ktar news reporter give coronavirus update ari...,57,protect
16922,9.186279e+08,Part of a 1938 public health map: https://t.co...,part 1938 public health map,43,protect
16923,1.240031e+18,While the resourcefulness and hustle shown by ...,the resourcefulness hustle show many to produc...,59,flatten the curve


In [10]:
# Total number of scored rows, taken before any filtering on the score.
denominator = len(interim_purp)

In [11]:
# Make sure the score column is integer-typed before comparing against it.
interim_purp = interim_purp.astype({'final_score': int})

In [12]:
# Keep only the rows whose best keyword match scored exactly 100.
interim_purp = interim_purp.loc[interim_purp['final_score'] == 100]

In [13]:
# Display the rows currently held in interim_purp.
interim_purp

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
53,8.232312e+17,@Lyramydog True.\nBut the MILITARY is no longe...,true the military no longer protect the wh no ...,100,protect
272,9.668292e+08,This is infuriating!!! Such bullshit that an a...,infuriate bullshit antiquate policy would uphe...,100,save
302,1.005509e+18,@CDCgov @GovEvers \n\nReminder:\n\n#BrendanDas...,reminder brendandassey innocent help already ...,100,protect
330,9.501876e+08,*Wellness check!*\nFirst— Shout out to all the...,wellness check first shout out to the essentia...,100,stayhomesavelives
433,1.098554e+18,"COVID-19 is enough of a threat itself, and the...",covid19 enough threat the result spread lethal...,100,slow the spread
...,...,...,...,...,...
16790,3.034372e+07,The health order to stay home is extended thro...,the health order to stay home extend may 3 202...,100,slow the spread
16798,1.690826e+09,Hey @MTA @NYCTSubway @FeinbergSarah @NYGovCuom...,hey essential worker deliver much need n95 mas...,100,protect
16855,7.398442e+17,"March 11th. “It will go away; just stay calm, ...",march 11th go away stay calm go away to pro...,100,protect
16901,1.678039e+07,Stay Home. Stay Safe. Save Lives. The state of...,stay home stay safe save life the state michig...,100,save


In [14]:
# Number of rows currently held in interim_purp.
numerator = len(interim_purp)

In [15]:
# Ratio of the two row counts computed above.
proportion = numerator / denominator
print(proportion)

0.02446085672082718
