In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import string
import sklearn

import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK stop-word corpus if it is not available locally yet.
# NOTE(review): only 'stopwords' is downloaded here, while later cells call
# nltk.pos_tag and WordNetLemmatizer -- presumably those need their own NLTK
# resources on a fresh machine; confirm.
nltk.download('stopwords')

# English stop words, minus a handful of words that must NOT be filtered out.
# Some of them ('at', 'do', 'your') occur inside the keyword phrases defined
# further down, e.g. 'stay at home' and 'do your part'.
stop_words = set(stopwords.words('english')).difference(
    {'at', 'do', 'your', 'from', 'to', 'out', 'no', 'the'}
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Location of the pre-processed tweet CSV.
# NOTE(review): this is an absolute, user-specific path, so the notebook only
# runs on a machine with the same directory layout -- consider a configurable
# data directory.
path = '/home/manikya_varshney/Documents/Python/Yale/All Graphs/final_processed_data/final_processed_april01.csv'
# low_memory=False asks pandas to parse the file in one go rather than in
# chunks, which avoids mixed-dtype inference within a column.
data = pd.read_csv(path, low_memory=False)

In [3]:
# Show the loaded frame as the cell output.
data

Unnamed: 0,user_id,text_duplicate,final
0,28433993,As Chicago households fill out 2020 census dur...,chicago household fill out 2020 census coronav...
1,101043870,Most of the press corps is very busy covering ...,the press corp busy cover the really important...
2,16475267,@realDonaldTrump Mar 10–he was very specific: ...,mar 10–he specific google 1700 engineer work ...
3,16475267,"@realDonaldTrump , You were caustic &amp; sarc...",caustic amp sarcastic the rollout come even f...
4,345397708,"Florida nears 8,000 coronavirus cases, as stat...",florida nears 8000 coronavirus case state repo...
...,...,...,...
16185,751644712570847236,If in 1938 Turkey’s government could produce a...,1938 turkey government could produce health m...
16186,286628737,Sandy Medford was a friend of my family for de...,sandy medford friend family decade suspect hea...
16187,361010205,One month ago Trump claimed the number of Amer...,one month ago trump claimed the number america...
16188,481389842,Yes. This. Especially if a substantial proport...,yes especially substantial proportion workforc...


In [4]:
# Raw search phrases that signal stay-at-home / distancing behaviour.
# They are written as found (mixed case, '#', '-') and are normalised
# (lower-casing, punctuation removal, stop-word removal, lemmatisation) by the
# cells that follow.
# NOTE(review): 'isolat' looks like a deliberate prefix so that it can
# partially match longer words such as "isolation" -- confirm.
keywords = ['stay at home', 'do your part', 'Responsible',
            'home', 'house', 'cancel', 'shutdown', 'postpone',
            'school closure', 'Closure', 'business closure',
            'suspension', 'quarantine', 'lockdown', 'social distance',
            'social distancing', 'self quarantine', 'isolat', '6-feet',
            # Spelling fixed: the hashtag was previously '#quarantineacitivites',
            # which can never fully match the correctly spelled hashtag.
            'distance', '#clubquarantine', '#quarantinelife', '#quarantineactivities']

##### 1. Casing (Upper or lower case)
##### 2. Noise Removal (Removal of punctuation, white spaces, special characters, HTML tags)
##### 3. Tokenization (tweets to tokens, i.e. words separated by spaces)
##### 4. Stopword Removal
##### 5. Text Normalization (Stemming and Lemmatization)

In [5]:
# Characters a cleaned keyword may contain: lower-case ASCII letters, digits
# and the space that separates the words of a phrase.
_ALLOWED_KEYWORD_CHARS = set(string.ascii_lowercase + string.digits + ' ')


def clean_keyword(keyword):
    """Normalise one raw keyword phrase.

    Steps, in order: lower-case, drop ASCII punctuation, turn newlines into
    spaces, drop every remaining character that is not a lower-case ASCII
    letter, a digit or a space (this also removes curly quotes), and finally
    trim leading/trailing spaces.

    The previous implementation called
    ``str.replace('/[^a-zA-Z0-9 ]/g', '')``. ``str.replace`` looks for that
    text literally (it is a JavaScript-style regex, not a pattern Python
    understands), so the step never removed anything. It also stripped the
    opening curly quote twice and never the closing one.

    Parameters
    ----------
    keyword : str
        Raw keyword phrase.

    Returns
    -------
    str
        The cleaned phrase.
    """
    lowered = keyword.lower()
    no_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))
    # Newlines become spaces *before* the whitelist filter, otherwise the
    # filter would delete them and glue neighbouring words together.
    single_line = no_punctuation.replace('\n', ' ')
    whitelisted = ''.join(ch for ch in single_line if ch in _ALLOWED_KEYWORD_CHARS)
    return whitelisted.strip(' ')


# Clean the list in place so that `keywords` keeps referring to the same list.
for i, raw_keyword in enumerate(keywords):
    keywords[i] = clean_keyword(raw_keyword)

#Tokenize
#keywords_tokens = [sub.split() for sub in keywords] 

#Remove stop words
def remove_stopwords(data):
    """Drop stop words from every sentence.

    Parameters
    ----------
    data : iterable of str
        Sentences; each one is split on whitespace into words.

    Returns
    -------
    list of str
        One string per input sentence, holding the words that are not in the
        module-level ``stop_words`` set, re-joined with single spaces. A
        sentence with no remaining words yields an empty string.
    """
    return [
        ' '.join(token for token in text.split() if token not in stop_words)
        for text in data
    ]

# Strip stop words from the cleaned keyword phrases.
keywords_filtered = remove_stopwords(keywords)

# Stemming: Porter-stem every space-separated word and re-join each phrase.
# NOTE(review): keywords_stem is not referenced again in the visible cells;
# the fuzzy matching further down uses the lemmatised keywords instead.
ps = PorterStemmer()
keywords_stem = [
    " ".join(ps.stem(token) for token in phrase.split(" "))
    for phrase in keywords_filtered
]

#Lemmetizing

#POSTags
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``. The upper-cased first
    letter of the resulting tag selects adjective (J), noun (N), verb (V) or
    adverb (R); any other tag falls back to noun.
    """
    _token, pos_label = nltk.pos_tag([word])[0]
    initial = pos_label[0].upper()
    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return wordnet_by_initial.get(initial, wordnet.NOUN)

# Lemmatise each space-separated word of every filtered keyword phrase, using
# the POS guessed by get_wordnet_pos for that word.
lemmatizer = WordNetLemmatizer()
keywords_lem = []
for phrase in keywords_filtered:
    lemmas = [
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in phrase.split(" ")
    ]
    keywords_lem.append(lemmas)

# Final keyword phrases used for matching: the lemmas re-joined with spaces.
keywords_final = [" ".join(lemmas) for lemmas in keywords_lem]

## Fuzzy Matching

In [6]:
# Force every entry of the cleaned-text column to str; any non-string entry
# is replaced by its str() form.
data['final'] = data['final'].map(str)
# Plain Python list of the cleaned tweets.
# NOTE(review): `choices` is not used by the visible cells below.
choices = list(data['final'])

In [7]:
def fuzzy_m(row):
    """Attach the best-matching keyword and its score to one tweet row.

    Parameters
    ----------
    row : pandas.Series
        One row of the tweet frame; its ``'final'`` entry is the text that is
        matched against the phrases in the module-level ``keywords_final``.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with two extra entries: ``'final_score'`` (the
        ``fuzz.partial_ratio`` score of the best keyword) and
        ``'final_keyword_match'`` (that keyword).
    """
    keyword_match, score = process.extractOne(
        row['final'], keywords_final, scorer=fuzz.partial_ratio
    )
    # Work on a copy: DataFrame.apply does not support functions that mutate
    # the object they are handed, so the incoming row is left untouched.
    scored = row.copy()
    scored['final_score'] = score
    scored['final_keyword_match'] = keyword_match
    return scored

In [8]:
# Score every tweet: axis=1 hands each row to fuzzy_m, whose returned rows
# carry the additional 'final_score' and 'final_keyword_match' entries.
interim_imp = data.apply(fuzzy_m, axis=1)

In [9]:
# Show the scored frame as the cell output.
interim_imp

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
0,28433993,As Chicago households fill out 2020 census dur...,chicago household fill out 2020 census coronav...,100,house
1,101043870,Most of the press corps is very busy covering ...,the press corp busy cover the really important...,100,home
2,16475267,@realDonaldTrump Mar 10–he was very specific: ...,mar 10–he specific google 1700 engineer work ...,60,house
3,16475267,"@realDonaldTrump , You were caustic &amp; sarc...",caustic amp sarcastic the rollout come even f...,75,home
4,345397708,"Florida nears 8,000 coronavirus cases, as stat...",florida nears 8000 coronavirus case state repo...,53,social distance
...,...,...,...,...,...
16185,751644712570847236,If in 1938 Turkey’s government could produce a...,1938 turkey government could produce health m...,75,home
16186,286628737,Sandy Medford was a friend of my family for de...,sandy medford friend family decade suspect hea...,100,home
16187,361010205,One month ago Trump claimed the number of Amer...,one month ago trump claimed the number america...,60,house
16188,481389842,Yes. This. Especially if a substantial proport...,yes especially substantial proportion workforc...,62,postpone


In [10]:
# Total number of tweets. Counted on `data` rather than `interim_imp`:
# `interim_imp` is later rebound to the score-100 subset, so re-running this
# cell after that filter would silently turn the proportion into 1.0.
# The row-wise apply that builds `interim_imp` keeps one row per tweet, so the
# count is the same on a top-to-bottom run.
denominator = len(data)

In [11]:
# Make the score column integer-typed before it is compared with 100 in the
# filter that follows.
interim_imp['final_score'] = interim_imp['final_score'].astype(int)

In [12]:
# Keep only the tweets whose best keyword scored exactly 100.
# NOTE(review): this rebinds `interim_imp`, so cells above that read it will
# see the filtered frame if they are re-run afterwards.
interim_imp = interim_imp.loc[interim_imp['final_score'].eq(100)]

In [13]:
# Show the tweets that remain after the score == 100 filter.
interim_imp

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
0,28433993,As Chicago households fill out 2020 census dur...,chicago household fill out 2020 census coronav...,100,house
1,101043870,Most of the press corps is very busy covering ...,the press corp busy cover the really important...,100,home
19,95027069,How are we at #jpcarpet spending our #quaranti...,at jpcarpet spending quarantine building close...,100,home
27,29292924,Happy #NationalPoetryMonth - new pandemic poet...,happy nationalpoetrymonth new pandemic poetry ...,100,quarantine
30,1245498747351687171,I know breakups during this Coronavirus Isolat...,know breakup coronavirus isolation hit hard th...,100,isolat
...,...,...,...,...,...
16160,113667535,"MI has been home for me for last 5 years , i p...",mi home last 5 year pray amd declare the goodn...,100,home
16164,5413092,Portraiture #COVID #QuarantineLife style. #hom...,portraiture covid quarantinelife style homemus...,100,home
16178,1625824098,This right here is the problem. We have no #Co...,right the problem no consistency government t...,100,responsible
16182,33677778,"CNN anchor @ChrisCuomo, brother of @NYGovCuomo...",cnn anchor brother test positive coronavirus c...,100,quarantine


In [14]:
# Number of tweets left after the score == 100 filter.
numerator = len(interim_imp)

In [15]:
# Share of all tweets whose best keyword match scored 100.
proportion = numerator / denominator
print(proportion)

0.15589870290302657
