===========================================


Title: 5.2 Exercises


Author: Chad Wood


Date: 24 Jan 2022


Modified By: Chad Wood


Description: This program demonstrates cleaning, normalizing, and comparing text data with various NLP techniques.


=========================================== 

In [27]:
import normalizer as norm
import pandas as pd

tweets = pd.read_csv('data/twitter_sample.csv')

### 1. Using the file, twitter_sample.csv file, which can be found in the "data" directory in the Week 5 GitHub repository: Clean the “Tweet Content” column by removing non-text data and stop words.

In [51]:
tweets_content = norm.Normalizer(tweets['Tweet Content'])
tweets['Tweet Content'] = tweets_content.normalize(strip_html=True, remove_special_chars=True, 
                                                   remove_digits=True, remove_stopwords=True)

tweets[['Tweet Content', 'Tweet Type']].head()

Unnamed: 0,Tweet Content,Tweet Type
0,Pets change lives become part families Thats m...,Tweet
1,Another spot morethanmedicine bus bristol week...,Tweet
2,great team HealthSourceOH Local morethanmedici...,ReTweet
3,great team HealthSourceOH Local morethanmedici...,ReTweet
4,great team HealthSourceOH Local morethanmedici...,ReTweet


### 2. Filtering only tweets (not re-tweets) use your class from part one of this exercise to build BOW and TF-IDF Vectorizer representations of the text; print your results. Don't over-think this, leverage what the author does in the text.

In [98]:
from sklearn.feature_extraction.text import CountVectorizer

# Filters tweets from retweets
tweets_filtered = tweets['Tweet Content'].loc[tweets['Tweet Type'] == 'Tweet']

# Gets bag of words features
cv = CountVectorizer(min_df=0., max_df=1.)
cv_X = cv.fit_transform(tweets_filtered)
cv_names = cv.get_feature_names()

pd.DataFrame(cv_X.toarray(), columns=cv_names).head()

Unnamed: 0,academic,acep,act,activity,advice,advocating,affected,aflac,ag_em,againthe,...,yesterday,yet,youll,youre,zoetis,zones,zoonosen,zoonoses,zu,zwierzt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [100]:
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

# Initializes transformer instance
tfid = TfidfTransformer(norm='l2', use_idf=True)
# Fit to data, followed by transform count matrix
tfid_X = tfid.fit_transform(cv_X)

pd.DataFrame(np.round(tfid_X.toarray(), 2), columns=cv_names).head()

Unnamed: 0,academic,acep,act,activity,advice,advocating,affected,aflac,ag_em,againthe,...,yesterday,yet,youll,youre,zoetis,zones,zoonosen,zoonoses,zu,zwierzt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3. Find one or more documents (each tweet is a document) that are similar to each other using Cosine Similarity; print your results. (NOTE: the lower the Cosine Similarity, the more likely the documents are similar.

In [159]:
from sklearn.metrics.pairwise import cosine_similarity

# Builds cosine_similarity dataframe
similarity_X = cosine_similarity(tfid_X)
similarity_df = pd.DataFrame(similarity_X)

In [158]:
# Removes instances where index meets column at same document (resulting in ~1)
similarity_df[similarity_df > 0.999] = np.NaN

# Creates ranked df containing {column: index} for highest value in rows
ranked_simularity = similarity_df.idxmax().reset_index()
ranked_simularity.columns=['Doc_x', 'Doc_y']

# Adds values in as new column for convenience
ranked_simularity['value'] = ranked_simularity.apply(lambda x: similarity_df.loc[x['Doc_x'], x['Doc_y']], axis=1)

# Adds the matching tweets
ranked_simularity['tweet_x'] = ranked_simularity.apply(lambda x: tweets['Tweet Content'].loc[x['Doc_x']], axis=1)
ranked_simularity['tweet_y'] = ranked_simularity.apply(lambda x: tweets['Tweet Content'].loc[x['Doc_y']], axis=1)

ranked_simularity.sort_values(by=['value'], ascending=False)

Unnamed: 0,Doc_x,Doc_y,value,tweet_x,tweet_y
76,76,77,0.671577,world lost incredible educator Mrs Ventura tau...,world lost incredible educator Mrs Ventura tau...
77,77,76,0.671577,world lost incredible educator Mrs Ventura tau...,world lost incredible educator Mrs Ventura tau...
18,18,24,0.609183,Czy wiesz e bez lekw dla zwierzt potrzeba wice...,study showed without animalmedicines would nee...
24,24,18,0.609183,study showed without animalmedicines would nee...,Czy wiesz e bez lekw dla zwierzt potrzeba wice...
26,26,48,0.532554,delivered th Special Aflac Duck yesterday prou...,Episode Life Residency Handsome Discussion joi...
...,...,...,...,...,...
73,73,46,0.077824,Impfen schtzt Der beste Weg zu mehr Tiergesund...,Episode Life Residency Handsome Discussion joi...
9,9,13,0.069634,FIX Want preview AG_EM story check back Monday...,Medtronic Engage patients even smile morethanm...
25,25,26,0.064783,Genetic information provided scant incremental...,delivered th Special Aflac Duck yesterday prou...
4,4,22,0.009435,great team HealthSourceOH Local morethanmedici...,nearly years PennyBrohnUK link standard medica...
