In [1]:
import nltk
from nltk.corpus import webtext
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import twitter_samples

import spacy

import numpy as np
from numpy.linalg import norm

import pandas as pd

import string

import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

import pickle

In [2]:
# Task 2: Find the tweets most similar to a chosen tweet, using TF-IDF vectors
# and cosine similarity.

# Load the pre-tokenised tweets (each entry is a list of tokens, as the printed
# examples below show).
# NOTE(security): pickle.load can execute arbitrary code while unpickling; only
# load a "tweets_lem.data" file that comes from a trusted source.
with open("tweets_lem.data", "rb") as filehandle:
    tweets_lem = pickle.load(filehandle)
tweets_raw = twitter_samples.strings("tweets.20150430-223406.json")

# The raw strings are used further down purely as row labels for the similarity
# scores, so both collections must describe the same tweets in the same order.
# NOTE(review): only the length can be checked here; the ordering is assumed to
# match whatever produced tweets_lem.data -- confirm against that step.
assert len(tweets_raw) == len(tweets_lem), (
    f"tweets_raw ({len(tweets_raw)}) and tweets_lem ({len(tweets_lem)}) are misaligned"
)

# The documents are already token lists, so the analyzer is the identity
# function: CountVectorizer then only counts the tokens it is handed.
cv = CountVectorizer(analyzer=lambda x: x)
word_count_vector = cv.fit_transform(tweets_lem)
feature_names = cv.get_feature_names_out()
print(word_count_vector.shape)

show = 9
# get count vector for one of the documents
show_doc_vector = word_count_vector[show]

# print the ten highest token counts for that document
df = pd.DataFrame(show_doc_vector.T.todense(), index=feature_names, columns=["count"])
print(tweets_lem[show])
print(df.sort_values(by=["count"], ascending=False)[:10])

# Learn the inverse document frequencies from the count matrix.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

# print the lowest and highest idf values
# (low idf = token occurs in many tweets, high idf = token is rare)
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=feature_names, columns=["idf"])
print(df_idf.sort_values(by=['idf'])[:10])
print(df_idf.sort_values(by=['idf'])[-10:])

# tf-idf scores for every tweet (sparse matrix, one row per tweet)
tf_idf_vector = tfidf_transformer.transform(word_count_vector)

show = 0
# get tfidf vector for first document
show_doc_vector = tf_idf_vector[show]

# print the twenty highest tf-idf scores for that document
df = pd.DataFrame(show_doc_vector.T.todense(), index=feature_names, columns=["tfidf"])
print(tweets_lem[show])
print(df.sort_values(by=["tfidf"], ascending=False)[:20])

# Similarity of the query tweet to every tweet in the corpus.
# Only this single row is needed, so compare the one query row against the whole
# matrix instead of building the full pairwise matrix: with the 20000 tweets
# reported above, the dense all-pairs result would be 20000 x 20000 float64
# values (about 3.2 GB). To inspect another tweet, change `index` and re-run
# these lines.
index = 1000
# Slice (rather than integer-index) so the query stays a 2-D, single-row matrix.
query_similarities = cosine_similarity(tf_idf_vector[index:index + 1], tf_idf_vector).ravel()
df = pd.DataFrame(query_similarities, index=tweets_raw, columns=["similarity"])
# '#' keeps each tweet's original position so it survives the sort below.
df['#'] = np.arange(len(df))
df.sort_values(by=["similarity"], ascending=False)[:20]

(20000, 19259)
['lolz', 'trickle', 'wealth', 'never', 'trickling', 'past', 'wallet', 'greed', 'always', 'win', '$', '$', '$', 'greedy', 'https://t.co/x7deopbs97']
           count
$              3
wallet         1
greed          1
win            1
trickle        1
trickling      1
wealth         1
always         1
lolz           1
greedy         1
               idf
tory      2.254790
miliband  2.332614
snp       2.642269
ed        2.998096
#bbcqt    3.035283
labour    3.093590
cameron   3.124071
farage    3.413008
david     3.489567
ukip      3.493796
                      idf
econonomically   10.21039
econs            10.21039
eden             10.21039
edgy             10.21039
edit             10.21039
editor-in-chief  10.21039
editorship       10.21039
edm's            10.21039
eejits           10.21039
󾌡                10.21039
['@kirkkus', 'indirect', 'cost', 'uk', 'eu', 'estimated', 'costing', 'britain', '£', '170', 'billion', 'per', 'year', '#betteroffout', '#ukip']
           

Unnamed: 0,similarity,#
"RT @SkyNews: THE TIMES FRONT PAGE: ""Miliband savaged for ‘lies’ over spending"" #skypapers http://t.co/mOBNQwE1oM",1.0,19186
"RT @SkyNews: THE TIMES FRONT PAGE: ""Miliband savaged for ‘lies’ over spending"" #skypapers http://t.co/mOBNQwE1oM",1.0,2069
"RT @SkyNews: THE TIMES FRONT PAGE: ""Miliband savaged for ‘lies’ over spending"" #skypapers http://t.co/mOBNQwE1oM",1.0,10519
"RT @SkyNews: THE TIMES FRONT PAGE: ""Miliband savaged for ‘lies’ over spending"" #skypapers http://t.co/mOBNQwE1oM",1.0,1000
"RT @SkyNews: THE TIMES FRONT PAGE: ""Miliband savaged for ‘lies’ over spending"" #skypapers http://t.co/mOBNQwE1oM",1.0,346
"RT @SkyNews: THE TIMES FRONT PAGE: ""Miliband savaged for ‘lies’ over spending"" #skypapers http://t.co/mOBNQwE1oM",1.0,1688
"RT @SkyNews: THE TIMES FRONT PAGE: ""Miliband savaged for ‘lies’ over spending"" #skypapers http://t.co/mOBNQwE1oM",1.0,15432
Tomorrow's front page: Miliband savaged for ‘lies’ over spending #tomorrowspaperstoday http://t.co/LKNk3wIdtg,0.537929,562
Tomorrow's front page: Miliband savaged for ‘lies’ over spending #tomorrowspaperstoday http://t.co/NovVm5V6uG,0.521048,5611
RT @suttonnick: Friday's Times front page:\nMiliband savaged for ‘lies’ over spending\n#tomorrowspaperstoday #bbcpapers http://t.co/8mqIg3z7MO,0.511705,1555
