In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import string

import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Fetch every NLTK data package this notebook relies on so that it runs from a
# fresh environment: the stopword list used just below, plus the WordNet corpus
# and the POS tagger model needed by the lemmatization step further down.
# NOTE(review): recent NLTK releases name the tagger package
# 'averaged_perceptron_tagger_eng' - confirm against the installed version.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# English stopwords minus a handful of short words that are deliberately kept
# (for example 'no', which is part of the keyword 'no cars').
stop_words = set(stopwords.words('english')) - {'at', 'do', 'your', 'from', 'to', 'out', 'no', 'the'}

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Location of the processed CSV. This is an absolute path on the author's
# machine, so it must be adjusted before running the notebook elsewhere.
path = '/home/manikya_varshney/Documents/Python/Yale/All Graphs/final_processed_april01.csv'

# low_memory=False: parse the file in a single pass rather than in chunks,
# which avoids mixed-dtype inference within a column.
data = pd.read_csv(filepath_or_buffer=path, low_memory=False)

In [3]:
# Raw keyword phrases, one per line. They are normalised in the following
# cells before being fuzzy-matched against the dataset.
keywords = [
    'social functions',
    'gathering',
    'empty streets',
    'interaction',
    'large',
    'no cars',
    'non-essential',
    'travel',
    'unnecessary',
    'crowd',
]

##### 1. Casing (Upper or lower case)
##### 2. Noise Removal (Removal of punctuation, white spaces, special characters, HTML tags)
##### 3. Tokenization (Tweets to tokens, i.e. words separated by spaces)
##### 4. Stopword Removal
##### 5. Text Normalization (Stemming and Lemmatization)

In [4]:
#Convert to lower
keywords = [keyword.lower() for keyword in keywords]

#Remove punctuations (ASCII punctuation only, as listed in string.punctuation)
punctuation_table = str.maketrans('', '', string.punctuation)
keywords = [keyword.translate(punctuation_table) for keyword in keywords]

#More cleaning
# Newlines become spaces, then every character that is not an ASCII letter,
# digit or space is dropped (this also covers curly quotes such as “ ” ’), and
# finally the surrounding spaces are trimmed.
# The filtering is done explicitly because str.replace() matches literal text
# only: a regex-looking argument such as '/[^a-zA-Z0-9 ]/g' never matches
# anything and therefore removes nothing.
allowed_chars = set(string.ascii_letters + string.digits + ' ')
keywords = [
    ''.join(ch for ch in keyword.replace('\n', ' ') if ch in allowed_chars).strip(' ')
    for keyword in keywords
]

#Tokenize
#keywords_tokens = [sub.split() for sub in keywords] 

#Remove stop words
def remove_stopwords(data, stopword_set=None):
    """Remove stopwords from every sentence in `data`.

    Args:
        data: iterable of strings; each string is split on whitespace.
        stopword_set: optional collection of words to drop. When omitted,
            the module-level `stop_words` set is used, so existing
            single-argument calls behave as before.

    Returns:
        A list with one string per input sentence, made of the surviving
        words joined by single spaces (an empty string if none survive).
    """
    if stopword_set is None:
        # Fall back to the notebook-wide stopword set defined in the first cell.
        stopword_set = stop_words
    return [
        ' '.join(word for word in sentence.split() if word not in stopword_set)
        for sentence in data
    ]

# Drop stopwords from the cleaned keyword phrases.
keywords_filtered = remove_stopwords(keywords)

#Stemming
# Porter-stem each space-separated token and join the stems back into a phrase.
ps = PorterStemmer()
keywords_stem = [
    " ".join(ps.stem(token) for token in phrase.split(" "))
    for phrase in keywords_filtered
]

#Lemmatizing

#POSTags
def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to lemmatize() for `word`.

    The word is tagged on its own with nltk.pos_tag. The upper-cased first
    letter of that tag selects adjective (J), noun (N), verb (V) or adverb (R);
    any other letter falls back to noun.
    """
    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    # pos_tag returns a list of (token, tag) pairs; only one token is passed in.
    _token, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()
    if initial in pos_by_initial:
        return pos_by_initial[initial]
    return wordnet.NOUN

# Lemmatize each token using the POS that get_wordnet_pos infers for it, then
# join the lemmas back into phrases; keywords_final is the list used for matching.
# split() with no argument is used instead of split(" ") so that an empty
# phrase or repeated spaces never produce empty-string tokens, which would
# otherwise be handed to the POS tagger and the lemmatizer.
lemmatizer = WordNetLemmatizer()
keywords_lem = [
    [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in phrase.split()]
    for phrase in keywords_filtered
]
keywords_final = [" ".join(tokens) for tokens in keywords_lem]

## Fuzzy Matching

In [5]:
# Coerce every value of the 'final' column to a Python string (str() is
# applied element-wise, so missing values become their string form too).
data['final'] = data['final'].map(str)

# Plain-list copy of the stringified column.
choices = data['final'].tolist()

# Testing

In [6]:
 def fuzzy_m(row):
        """Attach the best-matching keyword and its score to one row.

        The row's 'final' text is compared with every entry of the
        module-level `keywords_final` list using fuzz.partial_ratio; the top
        result is written to the 'final_score' and 'final_keyword_match'
        fields and the (mutated) row is returned.
        """
        text = row['final']
        best_keyword, best_score = process.extractOne(
            text, keywords_final, scorer=fuzz.partial_ratio
        )
        # Score is assigned first so the new columns keep their original order.
        row['final_score'] = best_score
        row['final_keyword_match'] = best_keyword
        return row

In [7]:
# Run fuzzy_m on every row (axis='columns' is the same as axis=1: the function
# receives one row at a time) and collect the augmented rows into a new frame.
interim_sd = data.apply(fuzzy_m, axis='columns')

In [8]:
# Display the scored frame, including the new 'final_score' and
# 'final_keyword_match' columns, as the cell's rich output.
interim_sd

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
0,28433993,As Chicago households fill out 2020 census dur...,chicago household fill out 2020 census coronav...,55,interaction
1,101043870,Most of the press corps is very busy covering ...,the press corp busy cover the really important...,67,travel
2,16475267,@realDonaldTrump Mar 10–he was very specific: ...,mar 10–he specific google 1700 engineer work ...,60,large
3,16475267,"@realDonaldTrump , You were caustic &amp; sarc...",caustic amp sarcastic the rollout come even f...,73,interaction
4,345397708,"Florida nears 8,000 coronavirus cases, as stat...",florida nears 8000 coronavirus case state repo...,60,crowd
...,...,...,...,...,...
16185,751644712570847236,If in 1938 Turkey’s government could produce a...,1938 turkey government could produce health m...,60,large
16186,286628737,Sandy Medford was a friend of my family for de...,sandy medford friend family decade suspect hea...,50,no car
16187,361010205,One month ago Trump claimed the number of Amer...,one month ago trump claimed the number america...,60,large
16188,481389842,Yes. This. Especially if a substantial proport...,yes especially substantial proportion workforc...,60,large


In [9]:
# Cast the match score column to integers; all other columns are left untouched.
interim_sd = interim_sd.astype({'final_score': int})

In [10]:
# Show only the rows whose best keyword match scored exactly 100.
interim_sd.loc[interim_sd['final_score'].eq(100)]

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
154,318592768,"Governor DeSantis, STOP caving to politics..li...",governor desantis stop cave to politicslisten ...,100,large
197,31676696,Coronavirus has led to sweeping travel restric...,coronavirus lead to sweep travel restriction a...,100,travel
276,250891272,"‼️ Tomorrow at 8:45 am., I will be interview b...",‼ tomorrow at 845 interview 10 covid19 virus i...,100,large
366,3354387689,Cities that have stay-at-home orders should al...,city stayathome order also schedule allow the ...,100,nonessential
411,285182559,Inoculum dose is very important. Small exposur...,inoculum dose important small exposure v large...,100,large
...,...,...,...,...,...
15898,950335999,There’s a mandated stay at home order in my ci...,mandate stay at home order city influx covid ...,100,travel
15978,20119986,REMEMBER: As recently as THIS MONTH the Orange...,remember recently month the orange idiot lamen...,100,unnecessary
16050,721484105850822656,Where was your congressperson when the #WuhanV...,your congressperson the wuhanvirus make to ame...,100,travel
16063,15769010,I keep reading about spring breakers coming ba...,keep reading spring breaker come back infect r...,100,travel


In [11]:
#interim_sd[interim_sd['final_score'] == 100].to_csv('interim_sd.csv',index=False)