In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import string

import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Fetch every NLTK resource this notebook relies on. Previously only
# 'stopwords' was downloaded, so on a machine without cached NLTK data the
# later WordNetLemmatizer / nltk.pos_tag calls stopped with a LookupError.
# nltk.download() reports "already up-to-date" for packages that are present.
# NOTE(review): newer NLTK releases appear to name the tagger package
# 'averaged_perceptron_tagger_eng' - adjust if pos_tag still reports a
# missing resource.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# English stop words, minus a handful of function words that must survive
# filtering; several of them (at, from, to, out) occur inside the search
# keywords, e.g. "work from home", "to go", "take out".
stop_words = set(stopwords.words('english')) - {'at', 'do', 'your', 'from', 'to', 'out', 'no', 'the'}

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/manikya_varshney/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Location of the CSV that is analysed below.
# NOTE(review): this is an absolute, user-specific path, so the cell only runs
# on a machine that has this exact directory layout - consider a configurable
# data directory.
path = '/home/manikya_varshney/Documents/Python/Yale/final_processed_h01-20200912-101538.csv'
# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, which avoids per-chunk (mixed) dtype inference.
data = pd.read_csv(path, low_memory=False)

In [3]:
# Search phrases that are fuzzy-matched against the tweet text further down.
# The order is significant only in that it is preserved through the cleaning
# steps; casing is normalised later.
keywords = [
    'school from home',
    'learn',
    'remote',
    'school food service',
    'online shopping',
    'online purchase',
    'online church',
    'delivery',
    'drive thru',
    'to go',
    'take out',
    'Tiktok',
    'Netflix',
    'telework',
    'zoom',
    'telehealth',
    'telemedicine',
    'work from home',
    'wfh',
    'working at home',
    'working remotely',
    'online meeting',
]

##### 1. Casing (Upper or lower case)
##### 2. Noise Removal (Removal of punctuation, white spaces, special characters, HTML tags)
##### 3. Tokenization (Tweets to tokens, i.e. words separated by spaces)
##### 4. Stopword Removal
##### 5. Text Normalization (Stemming and Lemmatization)

In [4]:
# Normalise every keyword in place:
#   1. lower-case it,
#   2. turn newlines into spaces,
#   3. drop every character that is not an ASCII letter, digit or space
#      (this covers punctuation and curly quotes wherever they occur),
#   4. trim surrounding spaces.
#
# The earlier version passed the JavaScript regex literal
# '/[^a-zA-Z0-9 ]/g' to str.replace(), which only matches that literal text,
# so the "more cleaning" step never removed anything; it also stripped the
# opening curly quote twice and never the closing one.
allowed_chars = set(string.ascii_letters + string.digits + ' ')

for i, raw_keyword in enumerate(keywords):
    text = raw_keyword.lower().replace('\n', ' ')
    text = ''.join(ch for ch in text if ch in allowed_chars)
    keywords[i] = text.strip(' ')

#Tokenize
#keywords_tokens = [sub.split() for sub in keywords] 

#Remove stop words
def remove_stopwords(data):
    """Strip stop words from every sentence in ``data``.

    Each sentence is split on whitespace, tokens that are members of the
    module-level ``stop_words`` collection are discarded (the comparison is
    case-sensitive), and the surviving tokens are re-joined with single
    spaces.

    Returns a list holding one filtered string per input sentence, in the
    original order; a sentence made up solely of stop words becomes ''.
    """
    return [
        ' '.join(token for token in sentence.split() if token not in stop_words)
        for sentence in data
    ]

# Drop stop words from the cleaned keywords.
keywords_filtered = remove_stopwords(keywords)

#Stemming
# Each keyword is split on single spaces, every token is Porter-stemmed and
# the stems are joined back into one string per keyword.
# NOTE(review): keywords_stem is not referenced again in the cells shown here.
ps = PorterStemmer()
keywords_stem = [
    " ".join(ps.stem(token) for token in phrase.split(" "))
    for phrase in keywords_filtered
]

#Lemmatizing

#POSTags
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased
    first letter of that tag selects adjective (J), noun (N), verb (V) or
    adverb (R). Any other tag falls back to noun.
    """
    initial = nltk.pos_tag([word])[0][1][0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag map to noun.
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()


def _lemmatize_phrase(phrase):
    """Lemmatise each space-separated token of ``phrase`` using its own POS tag."""
    return [
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in phrase.split(" ")
    ]


# keywords_lem keeps the per-keyword token lists; keywords_final is the
# space-joined form used for matching.
keywords_lem = [_lemmatize_phrase(phrase) for phrase in keywords_filtered]
keywords_final = [" ".join(tokens) for tokens in keywords_lem]

## Fuzzy Matching

In [5]:
# Make sure every entry of the 'final' column is a Python str before matching
# (str() is applied element-wise, so a NaN would become the text 'nan').
data['final'] = data['final'].map(str)
# Plain list of the column's strings.
choices = list(data['final'])

# Testing 

In [6]:
 def fuzzy_m(row):
     """Attach the best-matching keyword and its score to one DataFrame row.

     The text in ``row['final']`` is compared against every entry of the
     module-level ``keywords_final`` list using ``fuzz.partial_ratio``; the
     top-scoring keyword and its score are stored under
     'final_keyword_match' and 'final_score'.

     Parameters
     ----------
     row : pandas.Series
         One row, as handed over by ``DataFrame.apply(..., axis=1)``; it must
         contain a 'final' entry.

     Returns
     -------
     pandas.Series
         A copy of ``row`` extended with 'final_score' and
         'final_keyword_match' (in that order).
     """
     # Work on a copy: pandas documents that functions passed to apply()
     # should not mutate the object they receive.
     scored = row.copy()
     keyword_match, score = process.extractOne(
         scored['final'], keywords_final, scorer=fuzz.partial_ratio
     )
     scored['final_score'] = score
     scored['final_keyword_match'] = keyword_match
     return scored

In [7]:
# Score every row against the keyword list; axis=1 hands each row to fuzzy_m
# as a Series and the returned rows are assembled into a new DataFrame.
interim_ada = data.apply(fuzzy_m, axis=1)

In [8]:
# Display the scored frame (the rendering is truncated to the first and last rows).
interim_ada

Unnamed: 0,id,user_id,extended_tweet_full_text,extended_tweet_full_text_duplicate,final,final_score,final_keyword_match
0,1.304786e+18,9.053900e+07,woke up to see if the justin bieber pandemic w...,woke up to see if the justin bieber pandemic w...,woke to see the justin bieber pandemic go back...,62,take out
1,1.304786e+18,8.497239e+17,our intention is to make sure that evidence sc...,@TeresaCCarter2 “Our intention is to make sure...,intention to make sure evidence sciencebased ...,62,take out
2,1.304786e+18,1.293830e+18,for more information contact us mail follow ...,For More Information contact us. \nMail:- digi...,information contact u mail follow instagram gy...,60,to go
3,1.304786e+18,1.188902e+18,uae reports 1007 new covid19 cases highest sin...,"UAE reports 1,007 new Covid-19 cases, highest ...",uae report 1007 new covid19 case high since ou...,67,remote
4,1.304786e+18,2.273830e+08,trump officials interfered with cdc reports on...,Trump officials interfered with CDC reports on...,trump official interfere cdc report covid19 po...,50,remote
...,...,...,...,...,...,...,...
6732,1.304427e+18,6.874206e+07,why did twitter suddenly reinstate could it ...,Why did Twitter suddenly reinstate @clif_high?...,twitter suddenly reinstate could science valid...,75,take out
6733,1.304671e+18,8.323244e+17,denna veckas covid19 veckorapport från folkhäl...,Denna veckas COVID-19 veckorapport från Folkhä...,denna veckas covid19 veckorapport från folkhäl...,50,remote
6734,1.304768e+18,4.446656e+09,republicans defend trump after he admitted dow...,Republicans Defend Trump After He Admitted Dow...,republican defend trump admit downplay true th...,60,learn
6735,1.301853e+18,3.914277e+08,the recession on the back of the governments h...,The recession on the back of the Government's ...,the recession the back the government handle c...,67,wfh


In [9]:
# Store the match score as an integer column before it is filtered on.
interim_ada['final_score'] = interim_ada['final_score'].astype(int)

In [10]:
# Show only the rows whose final_score is exactly 100.
interim_ada.loc[interim_ada['final_score'].eq(100)]

Unnamed: 0,id,user_id,extended_tweet_full_text,extended_tweet_full_text_duplicate,final,final_score,final_keyword_match
33,1.304786e+18,1.296229e+18,truthfully i took it out earlier in quarantine...,truthfully I took it out earlier in quarantine...,truthfully take out earlier quarantine consid...,100,take out
47,1.304786e+18,1.494314e+09,please be kind to teachers in the next few wee...,Please be kind to teachers in the next few wee...,please kind to teacher the next week teacher t...,100,remote
70,1.304786e+18,1.061882e+18,closing next week had to move out for a few da...,"Closing next week, had to move out for a few d...",closing next week to move out day order quiet ...,100,zoom
71,1.304786e+18,8.289752e+17,due to the pandemic children are not going out...,@ScottPresler @MaryMargOlohan Due to the pande...,due to the pandemic child go outside pedophile...,100,netflix
110,1.304786e+18,3.015735e+08,i doubt it up here anyway there has been plent...,"@hired_merc I doubt it, up here anyway. There ...",doubt anyway plenty player test positive take ...,100,take out
...,...,...,...,...,...,...,...
6470,1.293707e+18,2.649893e+09,just learned my grandma has covid and is being...,Just learned my grandma has COVID and is being...,learn grandma covid transfer out assist care c...,100,learn
6493,1.304736e+18,9.888590e+08,first ever zoom meeting of with their beloved...,First ever zoom meeting of @AAPMumbai with the...,first ever zoom meeting beloved leader say aap...,100,zoom
6637,1.304362e+18,2.507388e+07,congratulations to jpmorgan chase for ordering...,Congratulations to JPMorgan Chase for ordering...,congratulation to jpmorgan chase order everyon...,100,work from home
6719,1.235309e+18,8.189486e+17,reviewing the coronavirus supplemental appropr...,Reviewing the coronavirus supplemental appropr...,review the coronavirus supplemental appropriat...,100,to go


In [11]:
# Write the rows whose final_score is exactly 100 to a CSV in the working
# directory, leaving out the index column.
interim_ada.loc[interim_ada['final_score'].eq(100)].to_csv('interim_ada.csv', index=False)