In [1]:
import re
import string

import nltk
import pandas as pd
import sklearn
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Make sure the NLTK stop-word corpus is available before it is read below.
nltk.download('stopwords')

# English stop words, minus a handful that must survive stop-word removal.
# 'at', 'do' and 'your' occur inside the search phrases "stay at home" and
# "do your part"; the other exclusions are presumably kept for the tweet
# text itself -- TODO confirm.
# NOTE(review): pos_tag and WordNetLemmatizer are used further down but only
# the 'stopwords' corpus is downloaded here -- confirm the tagger and wordnet
# data are installed on a fresh machine.
stop_words = set(stopwords.words('english')).difference(
    {'at', 'do', 'your', 'from', 'to', 'out', 'no', 'the'}
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Input CSV of tweets. NOTE(review): this is an absolute path inside one
# user's home directory, so the notebook only runs where that file exists.
path = '/home/manikya_varshney/Documents/Python/Yale/All Facets/final(Analysis)_h01-20200912-101538.csv'

# low_memory=False parses the file in one go instead of in chunks, which
# avoids mixed-dtype columns caused by chunk-wise type inference.
data = pd.read_csv(filepath_or_buffer=path, low_memory=False)

In [3]:
# Raw search terms, exactly as supplied; they are lower-cased and cleaned in
# the next cell. Order is preserved because later cells index by position.
keywords = [
    'stay at home',
    'do your part',
    'Responsible',
    'home',
    'house',
    'cancel',
    'shutdown',
    'postpone',
    'school closure',
    'Closure',
    'business closure',
    'suspension',
    'quarantine',
    'lockdown',
    'social distance',
    'social distancing',
    'self quarantine',
    # Deliberately truncated: the partial match below hits e.g. "isolation".
    'isolat',
    '6-feet',
    'distance',
    '#clubquarantine',
    '#quarantinelife',
    # NOTE(review): 'acitivites' looks like a misspelling of 'activities' --
    # confirm before changing, since it affects the reported keyword match.
    '#quarantineacitivites',
]

##### 1. Casing (Upper or lower case)
##### 2. Noise Removal (Removal of punctuation, white spaces, special characters, HTML tags)
##### 3. Tokenization (Tweets to tokens, i.e. words separated by spaces)
##### 4. Stopword Removal
##### 5. Text Normalization (Stemming and Lemmatization)

In [4]:
# Keyword normalisation: casing + noise removal in a single pass.
#
# The previous version called str.replace('/[^a-zA-Z0-9 ]/g', '').  That is a
# JavaScript regex literal; str.replace treats it as a plain substring, so it
# never matched and the intended "drop every special character" step was a
# no-op.  It also stripped the opening curly quote twice and never the
# closing one.  A real regular expression covers both cases (and everything
# in string.punctuation, which is why no separate translate() step is needed).
NON_ALNUM_SPACE = re.compile(r'[^a-zA-Z0-9 ]')


def clean_keyword(keyword):
    """Return ``keyword`` lower-cased and reduced to ASCII letters, digits and spaces.

    Steps:
      1. lower-case the text;
      2. turn newlines into spaces so words on separate lines stay separated;
      3. delete every character that is not ``a-z``, ``A-Z``, ``0-9`` or a space
         (punctuation, ``#``, curly quotes, other symbols);
      4. trim leading/trailing spaces.

    Examples: ``'6-feet' -> '6feet'``, ``'#ClubQuarantine' -> 'clubquarantine'``.
    """
    text = keyword.lower().replace('\n', ' ')
    return NON_ALNUM_SPACE.sub('', text).strip(' ')


# Slice-assign so the existing ``keywords`` list object is updated in place,
# exactly as the original index loops did.
keywords[:] = [clean_keyword(keyword) for keyword in keywords]

#Tokenize
#keywords_tokens = [sub.split() for sub in keywords] 

#Remove stop words
def remove_stopwords(data):
    """Strip stop words from every sentence in ``data``.

    Each sentence is split on whitespace, words present in the module-level
    ``stop_words`` collection are discarded, and the remaining words are
    re-joined with single spaces.

    Parameters
    ----------
    data : iterable of str
        Sentences (here: cleaned keyword phrases) to filter.

    Returns
    -------
    list of str
        One filtered sentence per input sentence, in the same order. A
        sentence made up only of stop words becomes an empty string.
    """
    return [
        ' '.join(token for token in phrase.split() if token not in stop_words)
        for phrase in data
    ]

# Keywords with stop words removed; input to both stemming and lemmatizing.
keywords_filtered = remove_stopwords(keywords)

#Stemming
# Porter-stem every space-separated token and glue each phrase back together.
# Note: the lemmatized list (keywords_final) is what the fuzzy matching cells
# below consume; keywords_stem is not referenced by them.
ps = PorterStemmer()
keywords_stem = [
    " ".join(ps.stem(token) for token in phrase.split(" "))
    for phrase in keywords_filtered
]

#Lemmatizing

#POSTags
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects the WordNet constant that
    ``WordNetLemmatizer.lemmatize`` accepts:

    * ``J`` -> adjective
    * ``N`` -> noun
    * ``V`` -> verb
    * ``R`` -> adverb

    Any other tag falls back to noun.
    """
    # pos_tag returns [(word, tag)]; only the tag of the single entry matters.
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()
    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return wordnet_by_initial.get(initial, wordnet.NOUN)

# Lemmatize each token of each stop-word-free keyword, using the token's own
# part of speech so that verbs/adjectives are reduced correctly.
lemmatizer = WordNetLemmatizer()
keywords_lem = [
    [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in phrase.split(" ")]
    for phrase in keywords_filtered
]

# Final keyword phrases used as the choices for fuzzy matching below.
keywords_final = [" ".join(lemmas) for lemmas in keywords_lem]

## Fuzzy Matching

In [5]:
# Force every value of the 'final' text column to str so the fuzzy matcher
# never receives a non-string (e.g. a float NaN from an empty CSV cell).
data['final'] = data['final'].apply(str)

# Plain Python list of the tweet texts.
choices = list(data['final'])

In [6]:
 def fuzzy_m(row):
        """Attach the best-matching keyword and its score to one tweet row.

        The row's 'final' text is compared against every phrase in the
        module-level ``keywords_final`` list with ``fuzz.partial_ratio``;
        ``process.extractOne`` returns the best ``(keyword, score)`` pair.

        Parameters
        ----------
        row : pandas.Series
            One row of the tweet frame; must contain a 'final' entry.

        Returns
        -------
        pandas.Series
            A copy of ``row`` with two extra entries: 'final_score' (the
            match score) and 'final_keyword_match' (the matched keyword).
        """
        # Work on a copy: DataFrame.apply does not support functions that
        # mutate the object they are handed, and adding entries to the
        # passed-in row is exactly that kind of mutation.
        scored = row.copy()
        keyword_match, score = process.extractOne(scored['final'], keywords_final, scorer = fuzz.partial_ratio)
        scored['final_score'] = score
        scored['final_keyword_match'] = keyword_match
        return scored

In [7]:
# Score every tweet row against the keyword list (one fuzzy_m call per row).
interim_imp = data.apply(fuzzy_m, axis='columns')

In [8]:
# Preview the scored frame: every tweet with its best keyword and score.
interim_imp

Unnamed: 0,id,user_id,extended_tweet_full_text,extended_tweet_full_text_duplicate,final,final_score,final_keyword_match
0,1.304786e+18,9.053900e+07,woke up to see if the justin bieber pandemic w...,woke up to see if the justin bieber pandemic w...,woke to see the justin bieber pandemic go back...,60,6feet
1,1.304786e+18,8.497239e+17,our intention is to make sure that evidence sc...,@TeresaCCarter2 “Our intention is to make sure...,intention to make sure evidence sciencebased ...,50,stay at home
2,1.304786e+18,1.293830e+18,for more information contact us mail follow ...,For More Information contact us. \nMail:- digi...,information contact u mail follow instagram gy...,62,distance
3,1.304786e+18,1.188902e+18,uae reports 1007 new covid19 cases highest sin...,"UAE reports 1,007 new Covid-19 cases, highest ...",uae report 1007 new covid19 case high since ou...,57,closure
4,1.304786e+18,2.273830e+08,trump officials interfered with cdc reports on...,Trump officials interfered with CDC reports on...,trump official interfere cdc report covid19 po...,60,social distance
...,...,...,...,...,...,...,...
6732,1.304427e+18,6.874206e+07,why did twitter suddenly reinstate could it ...,Why did Twitter suddenly reinstate @clif_high?...,twitter suddenly reinstate could science valid...,62,distance
6733,1.304671e+18,8.323244e+17,denna veckas covid19 veckorapport från folkhäl...,Denna veckas COVID-19 veckorapport från Folkhä...,denna veckas covid19 veckorapport från folkhäl...,60,suspension
6734,1.304768e+18,4.446656e+09,republicans defend trump after he admitted dow...,Republicans Defend Trump After He Admitted Dow...,republican defend trump admit downplay true th...,67,cancel
6735,1.301853e+18,3.914277e+08,the recession on the back of the governments h...,The recession on the back of the Government's ...,the recession the back the government handle c...,53,social distance


In [9]:
# Make sure the score column is integer-typed before comparing it to 100.
interim_imp = interim_imp.astype({'final_score': int})

In [10]:
# Keep only tweets whose best keyword scored 100, i.e. the keyword occurs
# verbatim somewhere in the cleaned tweet text.
interim_imp = interim_imp.loc[interim_imp['final_score'] == 100]

In [11]:
# Preview the tweets that survived the score == 100 filter.
interim_imp

Unnamed: 0,id,user_id,extended_tweet_full_text,extended_tweet_full_text_duplicate,final,final_score,final_keyword_match
9,1.304786e+18,5.825632e+07,coronavirus isolation rulebreakers in england ...,Coronavirus: isolation rule-breakers in Englan...,coronavirus isolation rulebreakers england may...,100,isolat
16,1.304786e+18,4.200751e+07,lockdown day 173 see my new daily photo on bl...,"""Lockdown Day 173"" https://t.co/MdPkYSfNo2 Se...",lockdown day 173 see new daily photo blipfoto,100,lockdown
21,1.304786e+18,8.185423e+07,an amesbury woman has been named a lockdown he...,An Amesbury woman has been named a 'lockdown h...,amesbury woman name lockdown hero to great wes...,100,lockdown
30,1.304786e+18,4.231834e+09,recommended read whats more the coronavirus i...,"Recommended read - What's more, the coronaviru...",recommend read whats the coronavirus sweep acr...,100,lockdown
33,1.304786e+18,1.296229e+18,truthfully i took it out earlier in quarantine...,truthfully I took it out earlier in quarantine...,truthfully take out earlier quarantine consid...,100,quarantine
...,...,...,...,...,...,...,...
6677,1.304546e+18,2.648717e+07,politicizing the pandemic slams the left’s ca...,Politicizing the Pandemic: @AlexBerenson slams...,politicize the pandemic slam the left call se...,100,lockdown
6685,1.263393e+18,7.940104e+08,the communities who are going to be scared aft...,The communities who are going to be scared aft...,the community go to scar lockdown lift the one...,100,lockdown
6710,1.304685e+18,3.178464e+09,tanoa had the cops called on them after being ...,Tanoa had the cops called on them after being ...,tanoa the cop call give the green light to ope...,100,shutdown
6718,1.304026e+18,1.427513e+07,dont jump to conclusions about the oxford vacc...,Don't jump to conclusions about the Oxford vac...,dont jump to conclusion the oxford vaccine tri...,100,suspension


In [13]:
# Write the perfect-score tweets to disk. The mask is re-applied here so the
# export stays correct even if the filtering cell above was skipped.
interim_imp.loc[interim_imp['final_score'] == 100].to_csv('interim_imp.csv', index=False)