In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import string

import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK English stopword corpus (a no-op once it is already on disk).
# NOTE(review): only 'stopwords' is downloaded here; the POS tagger and the
# WordNet lemmatizer used further down presumably need their own NLTK data
# to be installed already - confirm on a fresh environment.
nltk.download('stopwords')

# Start from the English stopword list but keep a few short function words,
# e.g. 'the', which is part of phrases such as 'flatten the curve'.
stop_words = set(stopwords.words('english')).difference(
    {'at', 'do', 'your', 'from', 'to', 'out', 'no', 'the'}
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Location of the pre-processed tweet CSV. This is an absolute path on the
# author's machine, so it has to be adjusted before running elsewhere.
path = (
    '/home/manikya_varshney/Documents/Python/Yale/All Graphs/'
    'final_processed_data/final_processed_april10.csv'
)

# low_memory=False makes pandas read the file in one pass instead of chunks.
data = pd.read_csv(path, low_memory=False)

In [3]:
# Raw keyword phrases to look for in the tweets; they are normalised
# (casing, punctuation, stopwords, lemmas) in the cells that follow.
keywords = [
    'Flatten the curve',
    'Slow the spread',
    'slow transmission',
    'protect',
    'save',
    '#stayhomesavelives',
]

##### 1. Casing (Upper or lower case)
##### 2. Noise Removal (Removal of punctuation, white spaces, special characters, HTML tags)
##### 3. Tokenization (Tweets to tokens, i.e. words separated by spaces)
##### 4. Stopword Removal
##### 5. Text Normalization (Stemming and Lemmatization)

In [4]:
# Keyword normalisation: lower-case, newline -> space, drop every character
# that is not an ASCII letter, digit or space, then trim surrounding spaces.
#
# This replaces three separate index loops. The last of them called
# str.replace('/[^a-zA-Z0-9 ]/g', ''), but str.replace searches for that
# exact text rather than treating it as a pattern, so it never removed
# anything. The whitelist below applies the character class that pattern
# spelled out; it also covers what the string.punctuation pass and the
# curly-quote strips were removing.
allowed_chars = set(string.ascii_letters + string.digits + ' ')


def clean_keyword(keyword):
    """Return ``keyword`` lower-cased and reduced to ASCII letters, digits and spaces.

    Newlines are turned into spaces before filtering so that words on
    separate lines stay separated; leading and trailing spaces are trimmed
    afterwards.
    """
    flattened = keyword.lower().replace('\n', ' ')
    kept = ''.join(ch for ch in flattened if ch in allowed_chars)
    return kept.strip(' ')


keywords = [clean_keyword(keyword) for keyword in keywords]

#Tokenize
#keywords_tokens = [sub.split() for sub in keywords] 

#Remove stop words
def remove_stopwords(data):
    """Drop stopwords from every phrase in ``data``.

    Each phrase is split on whitespace, tokens found in the module-level
    ``stop_words`` set are discarded, and the survivors are re-joined with
    single spaces. Returns a new list with one string per input phrase
    (an empty string when every token was a stopword).
    """
    return [
        ' '.join(token for token in phrase.split() if token not in stop_words)
        for phrase in data
    ]

# Keyword phrases with stopwords removed; input to both stemming and
# lemmatization.
keywords_filtered = remove_stopwords(keywords)

#Stemming
# Porter-stem every space-separated token and rebuild each phrase as a
# single string.
ps = PorterStemmer()
keywords_stem = [
    " ".join(ps.stem(token) for token in phrase.split(" "))
    for phrase in keywords_filtered
]

#Lemmatizing

#POSTags
def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects adjective (J), noun (N), verb (V) or
    adverb (R). Any other tag falls back to noun.
    """
    # pos_tag returns [(word, tag)]; only the tag of the single entry matters.
    tag_for_word = nltk.pos_tag([word])[0][1]
    initial = tag_for_word[0].upper()
    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return wordnet_by_initial.get(initial, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()

# Lemmatize each space-separated token using the POS guessed for that token
# in isolation; keywords_lem holds one list of lemmas per phrase.
keywords_lem = [
    [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in phrase.split(" ")]
    for phrase in keywords_filtered
]

# Re-join the lemmas into the phrases that the fuzzy matcher compares against.
keywords_final = [" ".join(lemmas) for lemmas in keywords_lem]

## Fuzzy Matching

In [5]:
# Force every entry of the 'final' column to str so the fuzzy matcher always
# receives text. Note that a missing value becomes the literal text 'nan'.
data['final'] = data['final'].map(str)

# Plain-list copy of the column.
choices = data['final'].tolist()

In [6]:
 def fuzzy_m(row):
     """Attach the best-matching keyword and its score to one row.

     ``row['final']`` is compared against every phrase in the module-level
     ``keywords_final`` using ``fuzz.partial_ratio``; the best phrase and its
     score are stored under ``'final_keyword_match'`` and ``'final_score'``.

     Returns a copy of ``row`` with those two entries added. The row handed
     in by ``DataFrame.apply`` is left untouched, because pandas does not
     support functions that mutate the object passed to them.
     """
     keyword_match, score = process.extractOne(
         row['final'], keywords_final, scorer=fuzz.partial_ratio
     )
     scored_row = row.copy()
     scored_row['final_score'] = score
     scored_row['final_keyword_match'] = keyword_match
     return scored_row

In [None]:
# Drop rows whose processed text is the empty string: turn "" into NaN, then
# remove the rows where 'final' is NaN.
#
# Both steps assign their result explicitly. The previous
# data['final'].replace(..., inplace=True) was an in-place call on a column
# selected from the frame; recent pandas warns about that pattern and, with
# Copy-on-Write enabled, it no longer changes `data` at all.
#
# NOTE(review): the earlier str conversion of 'final' turns originally
# missing values into the text 'nan', so those rows are not removed here -
# confirm whether that is intended.
nan_value = float("NaN")
data['final'] = data['final'].replace("", nan_value)
data = data.dropna(subset=["final"])

In [7]:
# Run fuzzy_m on every row (axis=1); the result has the original columns plus
# 'final_score' and 'final_keyword_match'.
interim_purp = data.apply(fuzzy_m, axis=1)

In [8]:
# Show the scored frame (best keyword and score for every row).
interim_purp

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
0,25610446.0,@NYGovCuomo for President!\n\n#TrumpIsTheWORST...,president trumpistheworstpresidentever trumppr...,50,save
1,25610446.0,How the government delayed coronavirus testing...,the government delayed coronavirus test covid1...,50,save
2,189942678.0,@ajkatztv @weijia @tvnewser Weijia is the best...,weijia the best at cover the white house come ...,59,flatten the curve
3,189942678.0,Stay with @CBSNews on-air and online with cont...,stay onair online continue live coverage the c...,59,flatten the curve
4,378143915.0,We are developing muscle memory for the next v...,develop muscle memory the next virus covid 19 ...,53,slow the spread
...,...,...,...,...,...
14440,1.2450977338888847e+18,Caption this https://t.co/lN8jGxOm9J,caption,43,flatten the curve
14441,30233254.0,“We have to make sure that all of our equipmen...,to make sure equipment vehicle cot facility in...,53,slow the spread
14442,22656149.0,There’s something bothering Bronx Borough Pres...,something bother bronx borough president want ...,53,slow the spread
14443,1.2479186312180326e+18,#SocialDistanacing #Boostmobile @HOTROD276 htt...,socialdistanacing boostmobile,41,slow transmission


In [9]:
# Total number of scored rows. This must be captured before interim_purp is
# overwritten with the perfect-match subset further down.
denominator = interim_purp.shape[0]

In [10]:
# Cast the scores to int so the equality check against 100 compares integers.
interim_purp['final_score'] = interim_purp['final_score'].astype(int)

In [11]:
# Keep only rows whose partial-ratio score is a perfect 100. This overwrites
# interim_purp, so the row count used as the denominator has to be taken
# before this cell runs.
interim_purp = interim_purp.loc[interim_purp['final_score'].eq(100)]

In [12]:
# Show the rows that matched a keyword with a score of 100.
interim_purp

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
5,733939050.0,Stay vigilant and we can flatten the curve! #F...,stay vigilant flatten the curve flattenthecurv...,100,flatten the curve
151,41378381.0,"45's economy was a mirage, forced by massive U...",45s economy mirage force massive u debt amp mi...,100,protect
188,46039595.0,The City of Huntington Beach closes beachside ...,the city huntington beach close beachside mete...,100,slow the spread
247,33333620.0,Healthcare workers are taping photos of themse...,healthcare worker tap photo to protective gear...,100,protect
253,399058061.0,Grocery stores ramping up safety to protect cu...,grocery store ramp safety to protect customer ...,100,protect
...,...,...,...,...,...
14201,25073877.0,FLATTENING OF THE CURVE!,flatten the curve,100,flatten the curve
14224,10688432.0,While Gov. Laura Kelly has worked tirelessly f...,gov laura kelly work tirelessly week to protec...,100,protect
14232,26113063.0,Friday April 10 1pm EDT Join @ElizabethNgonzi ...,friday april 10 1pm edt join the flexibility t...,100,save
14274,2798659231.0,"""@ACOG has put amazing things in place to prot...",put amaze thing place to protect every pregnan...,100,protect


In [13]:
# Number of rows left after the final_score == 100 filter.
numerator = interim_purp.shape[0]

In [14]:
# Share of all scored rows that contain one of the keywords with a perfect
# partial-ratio score.
proportion = numerator / denominator
print(proportion)

0.021737625475943233
