In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from math import sqrt
import sys, os
from contextlib import contextmanager
import matplotlib as mpl
import seaborn as sns
import sklearn
import string
import nltk
import regex
# nltk.download('punkt')
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
from nltk.corpus import stopwords
from collections import Counter
import math

In [17]:
data = pd.read_csv('data_lab3/elonmusk_tweets.csv')
data.head(5)

Unnamed: 0,id,created_at,text
0,849636868052275200,2017-04-05 14:56:29,b'And so the robots spared humanity ... https:...
1,848988730585096192,2017-04-03 20:01:01,"b""@ForIn2020 @waltmossberg @mims @defcon_5 Exa..."
2,848943072423497728,2017-04-03 16:59:35,"b'@waltmossberg @mims @defcon_5 Et tu, Walt?'"
3,848935705057280001,2017-04-03 16:30:19,b'Stormy weather in Shortville ...'
4,848416049573658624,2017-04-02 06:05:23,"b""@DaveLeeBBC @verge Coal is dying due to nat ..."


In [18]:
def normalize(document):
    # TODO: remove punctuation
    text = "".join([ch for ch in document if ch not in string.punctuation])
    
    # TODO: tokenize text
    tokens = word_tokenize(text)
    
    # TODO: Stemming
    stemmer = PorterStemmer()
    ret = " ".join([stemmer.stem(word.lower()) for word in tokens])
    return ret

def remove_binary_format(sentence):
    sentence = regex.sub('b', '', sentence)
    return sentence[:-1]

def unicodetoascii(text):

    TEXT = (text.
            replace('\\xe2\\x80\\x99', "'").
            replace('\\xc3\\xa9', 'e').
            replace('\\xe2\\x80\\x90', '-').
            replace('\\xe2\\x80\\x91', '-').
            replace('\\xe2\\x80\\x92', '-').
            replace('\\xe2\\x80\\x93', '-').
            replace('\\xe2\\x80\\x94', '-').
            replace('\\xe2\\x80\\x94', '-').
            replace('\\xe2\\x80\\x98', "'").
            replace('\\xe2\\x80\\x9b', "'").
            replace('\\xe2\\x80\\x9c', '"').
            replace('\\xe2\\x80\\x9c', '"').
            replace('\\xe2\\x80\\x9d', '"').
            replace('\\xe2\\x80\\x9e', '"').
            replace('\\xe2\\x80\\x9f', '"').
            replace('\\xe2\\x80\\xa6', '...').#
            replace('\\xe2\\x80\\xb2', "'").
            replace('\\xe2\\x80\\xb3', "'").
            replace('\\xe2\\x80\\xb4', "'").
            replace('\\xe2\\x80\\xb5', "'").
            replace('\\xe2\\x80\\xb6', "'").
            replace('\\xe2\\x80\\xb7', "'").
            replace('\\xe2\\x81\\xba', "+").
            replace('\\xe2\\x81\\xbb', "-").
            replace('\\xe2\\x81\\xbc', "=").
            replace('\\xe2\\x81\\xbd', "(").
            replace('\\xe2\\x81\\xbe', ")")
            )
    return TEXT

original_documents = [remove_binary_format(x) for x in data['text']]
original_documents = [unicodetoascii(x) for x in original_documents]
documents = [normalize(d).split() for d in original_documents]
documents[1]

['forin2020',
 'waltmosserg',
 'mim',
 'defcon5',
 'exactli',
 'tesla',
 'is',
 'asurdli',
 'overvalu',
 'if',
 'ase',
 'on',
 'the',
 'past',
 'ut',
 'that',
 'irr',
 'httpstcoqqctqkzgml']

In [19]:
# Flatten all the documents
flat_list = [word for doc in documents for word in doc]

# TODO: remove stop words from the vocabulary
words = [word for word in flat_list if word not in stopwords.words('english')]

# TODO: we take the 500 most common words only
counts = Counter(words)
vocabulary = counts.most_common(500)
print([x for x in vocabulary if x[0] == 'tesla'])
vocabulary = [x[0] for x in vocabulary]
assert len(vocabulary) == 500

# vocabulary.sort()
vocabulary[:10]

[('tesla', 345)]


['rt', 'tesla', 'e', 'model', 'spacex', 'thi', 'ut', 'amp', 'car', 'launch']

In [20]:
def idf(vocabulary, documents):
    # TODO: compute IDF, storing values in a dictionary
    idf = {}
    num_documents = len(documents)
    for i, term in enumerate(vocabulary):
        idf[term] = math.log(num_documents / sum(term in document for document in documents), 2)
    return idf

idf = idf(vocabulary, documents)
[idf[key] for key in vocabulary[:5]]

[2.53318980048808,
 3.0515768264327203,
 3.3735049213200825,
 3.5844508160054223,
 3.621763974473478]

In [21]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

tfidf = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 1), min_df = 1, stop_words = 'english', max_features = 500)

features = tfidf.fit(original_documents)
corpus_tf_idf = tfidf.transform(original_documents)

sum_words = corpus_tf_idf.sum(axis = 0)
words_freq = [(word, sum_words[0, idx]) for word, idx in tfidf.vocabulary_.items()]
print(sorted(words_freq, key = lambda x: x[1], reverse = True)[:5])
print('tesla', corpus_tf_idf[1, features.vocabulary_['tesla']])

[('http', 168.29994203008445), ('https', 155.52405485904552), ('rt', 118.97241600928525), ('tesla', 97.72727000593895), ('model', 75.84605089484148)]
tesla 0.39793416635892637


In [22]:
def vectorize(document, vocabulary, idf):
    vector = [0] * len(vocabulary)
    counts = Counter(document)
    for i, term in enumerate(vocabulary):
        vector[i] = idf[term] * counts[term]
    return vector

document_vectors = [vectorize(s, vocabulary, idf) for s in documents]

In [23]:
def cosine_similarity(v1, v2):
    # TODO: compute cosine similarity
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(v1)):
        x = v1[i]; y = v2[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    if sumxy == 0:
        result = 0
    else:
        result = sumxy/math.sqrt(sumxx * sumyy)
    return result

def search_vec(query, k, vocabulary, stemmer, document_vectors, original_documents):
    q = query.split()
    q = [stemmer.stem(w) for w in q]
    query_vector = vectorize(q, vocabulary, idf)
    
    # TODO: rank the documents by cosine similarity
    scores = [[cosine_similarity(query_vector, document_vectors[d]), d] for d in range(len(document_vectors))]
    scores.sort(key = lambda x: -x[0])
    
    print('Top-{0} documents'.format(k))
    for i in range(k):
        print(i, original_documents[scores[i][1]])
        
query = "tesla nasa"
stemmer = PorterStemmer()
search_vec(query, 5, vocabulary, stemmer, document_vectors, original_documents)

Top-5 documents
0 '@ashwin7002 @NASA @faa @AFPAA We have not ruled that out.
1 'RT @NASA: Updated @SpaceX #Dragon #ISS rendezvous times: NASA TV coverage egins Sunday at 3:30amET: http://t.co/qrm0Dz4jPE. Grapple at  ...
2 '@NASA launched a rocket into the northern lights http://t.co/tR2cSeMV
3 "Deeply appreciate @NASA's faith in @SpaceX. We will do whatever it takes to make NASA and the American people proud.
4 'RT @SpaceX: Success! Congrats @NASA on @MarsCuriosity!


In [24]:
query = "tesla nasa"
new_features = tfidf.transform([query])
cosine_similarities = linear_kernel(new_features, corpus_tf_idf).flatten()
related_docs_indices = cosine_similarities.argsort()[::-1]

topk = 5
print('Top-{0} documents'.format(topk))
for i in range(topk):
    print(i, original_documents[related_docs_indices[i]])

Top-5 documents
0 '@ashwin7002 @NASA @faa @AFPAA We have not ruled that out.
1 "SpaceX could not do this without NASA. Can't express enough appreciation. https://t.co/uQpI60zAV7
2 '@NASA launched a rocket into the northern lights http://t.co/tR2cSeMV
3 "Deeply appreciate @NASA's faith in @SpaceX. We will do whatever it takes to make NASA and the American people proud.
4 'RT @NASA: Updated @SpaceX #Dragon #ISS rendezvous times: NASA TV coverage egins Sunday at 3:30amET: http://t.co/qrm0Dz4jPE. Grapple at  ...
