In [1]:
import re
import string

import nltk
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Make sure the NLTK stop word corpus is available locally.
# NOTE(review): later cells also call nltk.pos_tag and WordNetLemmatizer, which
# need their own NLTK resources; nothing here downloads them - confirm they are
# installed before a fresh run.
nltk.download('stopwords')

# English stop words, minus a handful of short words that are deliberately kept
# in the text (they are excluded from the removal set below).
stop_words = set(stopwords.words('english')).difference(
    ['at', 'do', 'your', 'from', 'to', 'out', 'no', 'the']
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Location of the processed tweet export.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
path = '/home/manikya_varshney/Documents/Python/Yale/All Graphs/final_processed_april01.csv'

# Load the whole CSV into a DataFrame; low_memory=False is forwarded to pandas
# (presumably to avoid chunked dtype inference - confirm it is still needed).
data = pd.read_csv(filepath_or_buffer=path, low_memory=False)

In [3]:
# Keyword phrases that the following cells clean, lemmatize and fuzzy-match
# against the 'final' text column.
keywords = [
    'bored',
    'lonely',
    'stress',
    'anxiety',
    'scared',
    'worry',
    'end',
    'cabin fever',
    '#sideeffectsofquarantinelife',
    'tissue paper',
    'toilet paper',
]

##### 1. Casing (Upper or lower case)
##### 2. Noise Removal (Removal of punctuation, white spaces, special characters, HTML tags)
##### 3. Tokenization (Tweets to tokens, i.e. words separated by spaces)
##### 4. Stopword Removal
##### 5. Text Normalization (Stemming and Lemmatization)

In [4]:
def clean_keyword(text):
    """Normalise one keyword phrase before stop word removal and lemmatizing.

    The phrase is lower-cased, every whitespace character (including newlines)
    is turned into a space, every character that is not an ASCII letter, digit
    or space is deleted (this covers all of string.punctuation as well as
    curly quotes), and runs of spaces are collapsed and trimmed.

    Args:
        text: raw keyword phrase.

    Returns:
        The cleaned phrase as a str (possibly empty).
    """
    text = re.sub(r'\s', ' ', text.lower())
    # str.replace() only matches literal text, so a JavaScript-style
    # '/[^a-zA-Z0-9 ]/g' argument passed to it never removes anything;
    # pattern-based removal has to go through re.sub.
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    # Deleting characters can leave doubled spaces behind; split/join collapses
    # them and strips both ends in one step.
    return ' '.join(text.split())


# Casing + noise removal in a single pass over the keyword list.
keywords = [clean_keyword(keyword) for keyword in keywords]

#Tokenize
#keywords_tokens = [sub.split() for sub in keywords] 

#Remove stop words
def remove_stopwords(data, stopword_set=None):
    """Drop stop words from every sentence in ``data``.

    Args:
        data: iterable of strings; each one is split on whitespace.
        stopword_set: optional container of words to remove. When omitted,
            the module-level ``stop_words`` set is used, so existing
            single-argument calls behave as before.

    Returns:
        A list with one string per input sentence, holding the surviving
        words joined by single spaces (an empty string if none survive).
    """
    if stopword_set is None:
        # Looked up at call time so the function follows the notebook's
        # current stop word configuration.
        stopword_set = stop_words
    return [
        ' '.join(word for word in sentence.split() if word not in stopword_set)
        for sentence in data
    ]

# Stop word removal over the cleaned keyword phrases.
keywords_filtered = remove_stopwords(keywords)

# Stemming: run every space-separated word of each filtered phrase through the
# Porter stemmer and glue the stems back into one string per phrase.
ps = PorterStemmer()
keywords_stem = [
    " ".join(ps.stem(token) for token in phrase.split(" "))
    for phrase in keywords_filtered
]

#Lemmatizing

#POSTags
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of the resulting tag selects the constant (J -> ADJ, N -> NOUN,
    V -> VERB, R -> ADV). Any other first letter falls back to ``wordnet.NOUN``.
    """
    tagged = nltk.pos_tag([word])
    _, pos_label = tagged[0]
    initial = pos_label[0].upper()

    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if initial in pos_by_initial:
        return pos_by_initial[initial]
    return wordnet.NOUN

# Lemmatize each space-separated word of every filtered phrase, using the POS
# looked up for that single word, then rebuild one string per phrase.
lemmatizer = WordNetLemmatizer()
keywords_lem = [
    [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in phrase.split(" ")]
    for phrase in keywords_filtered
]
# keywords_final is the list the fuzzy matcher searches.
keywords_final = list(map(" ".join, keywords_lem))

## Fuzzy Matching

In [5]:
# Force every value of the 'final' column to str so the matcher always
# receives text, then keep a plain-list copy of the column.
data['final'] = data['final'].map(str)
choices = data['final'].tolist()

# Testing

In [6]:
 def fuzzy_m(row):
     """Add the best-matching keyword and its score to one DataFrame row.

     The row's 'final' text is compared against ``keywords_final`` with
     ``process.extractOne`` using ``fuzz.partial_ratio`` as the scorer. The
     result is stored in two new fields, 'final_score' and
     'final_keyword_match', and the same row object is returned.

     NOTE(review): in the displayed results the short keyword 'end' scores 100
     on many rows - confirm that this is the intended matching behaviour.
     """
     best_keyword, best_score = process.extractOne(
         row['final'], keywords_final, scorer=fuzz.partial_ratio
     )
     # Score is assigned before the keyword so the new fields keep that order.
     row['final_score'], row['final_keyword_match'] = best_score, best_keyword
     return row

In [7]:
# Row-wise pass: fuzzy_m receives each row and returns it with the match score
# and matched keyword attached.
interim_ne = data.apply(fuzzy_m, axis='columns')

In [8]:
# Display the frame, now carrying the final_score and final_keyword_match columns.
interim_ne

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
0,28433993,As Chicago households fill out 2020 census dur...,chicago household fill out 2020 census coronav...,75,bore
1,101043870,Most of the press corps is very busy covering ...,the press corp busy cover the really important...,50,bore
2,16475267,@realDonaldTrump Mar 10–he was very specific: ...,mar 10–he specific google 1700 engineer work ...,100,end
3,16475267,"@realDonaldTrump , You were caustic &amp; sarc...",caustic amp sarcastic the rollout come even f...,75,scar
4,345397708,"Florida nears 8,000 coronavirus cases, as stat...",florida nears 8000 coronavirus case state repo...,67,end
...,...,...,...,...,...
16185,751644712570847236,If in 1938 Turkey’s government could produce a...,1938 turkey government could produce health m...,50,bore
16186,286628737,Sandy Medford was a friend of my family for de...,sandy medford friend family decade suspect hea...,100,end
16187,361010205,One month ago Trump claimed the number of Amer...,one month ago trump claimed the number america...,67,end
16188,481389842,Yes. This. Especially if a substantial proport...,yes especially substantial proportion workforc...,67,end


In [9]:
# Cast the score column to int; the following cell filters it with an exact == 100 comparison.
interim_ne['final_score'] = interim_ne['final_score'].astype(int)

In [10]:
# Rows whose best keyword match scored exactly 100.
interim_ne.loc[interim_ne['final_score'].eq(100)]

Unnamed: 0,user_id,text_duplicate,final,final_score,final_keyword_match
2,16475267,@realDonaldTrump Mar 10–he was very specific: ...,mar 10–he specific google 1700 engineer work ...,100,end
10,394499101,AOC slams Cuomo's decision to suspend New York...,aoc slam cuomos decision to suspend new yorker...,100,end
11,394499101,AOC slams Cuomo's decision to suspend New York...,aoc slam cuomos decision to suspend new yorker...,100,end
19,95027069,How are we at #jpcarpet spending our #quaranti...,at jpcarpet spending quarantine building close...,100,end
42,15201225,"NEW BLOG! ""10 Self-Care Tips during a Stay-At-...",new blog 10 selfcare tip stayathome order frie...,100,end
...,...,...,...,...,...
16116,2233488978,"“For weeks, experts and advocates have been ra...",week expert advocate raise alarm the coronavir...,100,stress
16137,867183369976766465,What?! The People of @Australia @COVID_Austral...,the people allow to defend amp family from ph...,100,end
16156,396214595,Trump sent workers without protection or train...,trump sent worker without protection training ...,100,end
16168,566883318,Helps me to see it this way. #Mitchplease\nCOV...,help to see way mitchplease covid19 cdc warn j...,100,end


In [11]:
#interim_ne[interim_ne['final_score'] == 100].to_csv('interim_ne.csv',index=False)