1. Text Normalization

In [69]:
import nltk
import numpy as np
import pandas as pd

In [70]:
# Load the tweet dump; the path is relative to the notebook's working directory.
df = pd.read_csv("elonmusk_tweets.csv")

In [71]:
# (rows, columns) of the loaded table.
df.shape

(2819, 3)

In [72]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2819 entries, 0 to 2818
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          2819 non-null   int64 
 1   created_at  2819 non-null   object
 2   text        2819 non-null   object
dtypes: int64(1), object(2)
memory usage: 66.2+ KB


In [73]:
# Only `id` is numeric, so these summary statistics are computed over tweet
# identifiers and carry little analytical meaning.
df.describe()

Unnamed: 0,id
count,2819.0
mean,5.804848e+17
std,2.186404e+17
min,15434730000.0
25%,3.506818e+17
50%,6.569719e+17
75%,7.704732e+17
max,8.496369e+17


In [74]:
import re
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ThisPc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [75]:
def normalize(document):
    """Return `document` with punctuation removed, lower-cased and Porter-stemmed.

    The result is one string of stemmed tokens separated by single spaces.
    Because punctuation is stripped before tokenizing, a URL collapses into a
    single token (e.g. 'httpstcov7jujqwfcv').
    """
    # Drop every character listed in string.punctuation before tokenizing.
    unpunctuated = document.translate(str.maketrans("", "", string.punctuation))

    porter = PorterStemmer()
    stems = []
    for token in word_tokenize(unpunctuated):
        stems.append(porter.stem(token.lower()))

    return " ".join(stems)

# Trim surrounding whitespace from each tweet, then normalize it and split the
# resulting string back into a list of tokens.
original_documents = [tweet.strip() for tweet in df['text']]
documents = [normalize(tweet).split() for tweet in original_documents]
documents[0]

['band', 'so', 'the', 'robot', 'spare', 'human', 'httpstcov7jujqwfcv']

In [76]:
import re

# Emoticon pattern. It is written for re.VERBOSE, so the layout whitespace and
# the inline "# ..." notes inside the string are ignored by the regex engine.
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO-]? # Nose (optional)
        [D)\]\(\]/pP] # Mouth
    )"""

# Token patterns in priority order: they are joined with '|' below, so an
# earlier alternative wins when several could match at the same position.
regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w'_-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*,]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]

# A single capturing group wraps all alternatives (every inner group is
# non-capturing), so findall() yields each whole token as a plain string.
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
# Anchored variant: matches only when the entire string is one emoticon.
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)


def tokenize(s):
    """Split `s` into a list of token strings using the compiled `tokens_re`.

    Whitespace matches none of the alternatives and is therefore dropped.
    """
    matched_tokens = tokens_re.findall(s)
    return matched_tokens


def preprocess(s, lowercase=False):
    """Tokenize `s`, optionally lower-casing every token that is not an emoticon.

    With `lowercase=False` (the default) the tokens are returned unchanged.
    Emoticons keep their case so that ones such as ':D' survive lower-casing.
    """
    tokens = tokenize(s)
    if not lowercase:
        return tokens

    lowered = []
    for token in tokens:
        is_emoticon = bool(emoticon_re.search(token))
        lowered.append(token if is_emoticon else token.lower())
    return lowered


# Rebuild the corpus with the regex tokenizer; case is preserved because
# preprocess() is called with its default lowercase=False.
# NOTE(review): tokens such as 'b', '\\' and 'xe2' in the recorded output suggest
# the CSV `text` column holds bytes-literal reprs (b'...') rather than decoded
# text — consider decoding at load time; confirm against the raw file.
original_documents = [tweet.strip() for tweet in df['text']]
documents = [preprocess(tweet) for tweet in original_documents]

documents[1]  # Show the preprocessed document at index 1

['b',
 '"',
 '@ForIn2020',
 '@waltmossberg',
 '@mims',
 '@defcon_5',
 'Exactly',
 '.',
 'Tesla',
 'is',
 'absurdly',
 'overvalued',
 'if',
 'based',
 'on',
 'the',
 'past',
 ',',
 'but',
 "that's",
 'irr',
 '\\',
 'xe2',
 '\\',
 'x80',
 '\\',
 'xa6',
 'https://t.co/qQcTqkzgMl',
 '"']

2. Implement TF-IDF

In [77]:
import pandas as pd
from collections import Counter
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import math

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ThisPc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [88]:
def tf(vocabulary, documents):
    """Build the raw term-frequency matrix as a list of lists.

    Row i corresponds to documents[i] and column j to vocabulary[j]; each cell
    holds how many times that term occurs in that document (0 when absent).
    """
    rows = []
    for tokens in documents:
        occurrences = Counter(tokens)
        rows.append([occurrences[term] for term in vocabulary])
    return rows

# NOTE(review): `vocabulary` is not defined in any visible cell; it has to exist
# in the kernel before this cell runs.
# The result gets its own name: rebinding `tf` to the matrix would shadow the
# function, and running this cell a second time would then fail with
# "'list' object is not callable".
tf_matrix = tf(vocabulary, documents)

# Show the terms of document 1 that have a non-zero count, with those counts.
doc1_counts = np.array(tf_matrix[1])
present = np.where(doc1_counts > 0)
np.array(vocabulary)[present], doc1_counts[present]

(array(['.', 'b', '\\', '"', ',', 'xe2', 'x80', 'Tesla', 'xa6', "that's",
        'Exactly'], dtype='<U16'),
 array([1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1]))

In [89]:
def idf(vocabulary, documents):
    """Compute the base-2 inverse document frequency of each vocabulary term.

    idf(term) = log2(N / df(term)), where N is the number of documents and
    df(term) is the number of documents that contain the term at least once.

    Args:
        vocabulary: iterable of terms to score.
        documents: sequence of tokenized documents, each an iterable of
            hashable tokens.

    Returns:
        dict mapping each term to its IDF. A term that occurs in no document
        (including the case of an empty corpus) gets 0.0 instead of raising
        ZeroDivisionError.
    """
    num_documents = len(documents)
    # Count each document once per distinct token in a single pass over the
    # corpus, rather than rescanning every document for every vocabulary term.
    document_frequency = Counter(
        token for document in documents for token in set(document)
    )

    idf_values = {}
    for term in vocabulary:
        count = document_frequency[term]
        # N / 0 is undefined; such a term has zero term frequency everywhere,
        # so 0.0 is a neutral weight for it.
        idf_values[term] = math.log(num_documents / count, 2) if count else 0.0
    return idf_values
# Score the whole vocabulary, then preview the IDF of its first five terms.
idf_values = idf(vocabulary, documents)
[idf_values[term] for term in vocabulary[:5]]

[0.959130577668125,
 0.7493007890060756,
 1.1177820471225408,
 3.0990239888351803,
 2.384152165519591]

3. Compare the results with scikit-learn's reference implementation

In [87]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Reference TF-IDF from scikit-learn: unigrams only, English stop words
# removed, vocabulary capped at 500 terms.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1, stop_words='english', max_features=500)

# fit() returns the vectorizer itself, so `features` is just another name for
# `tfidf`; the name is kept because the lookup below uses it.
features = tfidf.fit(original_documents)
corpus_tf_idf = tfidf.transform(original_documents)

# Column sums give each term's total TF-IDF weight over the whole corpus.
sum_words = corpus_tf_idf.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in tfidf.vocabulary_.items()]
print(sorted(words_freq, key=lambda x: x[1], reverse=True)[:5])
# Weight of the term 'tesla' in document 1 (the printed label previously read 'testla').
print('tesla', corpus_tf_idf[1, features.vocabulary_['tesla']])

[('http', 163.54366542841234), ('https', 151.85039944652075), ('rt', 112.61998731390989), ('tesla', 95.96401470715628), ('xe2', 88.20944486346477)]
testla 0.3495243100660956


In [81]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.metrics.pairwise import linear_kernel
# NOTE(review): this cell repeats the preceding scikit-learn comparison cell
# statement for statement; one of the two can probably be removed.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1, stop_words='english', max_features=500)

# fit() returns the vectorizer itself, so `features` is just another name for
# `tfidf`; the name is kept because the lookup below uses it.
features = tfidf.fit(original_documents)
corpus_tf_idf = tfidf.transform(original_documents)

# Column sums give each term's total TF-IDF weight over the whole corpus.
sum_words = corpus_tf_idf.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in tfidf.vocabulary_.items()]
print(sorted(words_freq, key=lambda x: x[1], reverse=True)[:5])
# Weight of the term 'tesla' in document 1 (the printed label previously read 'testla').
print('tesla', corpus_tf_idf[1, features.vocabulary_['tesla']])

[('http', 163.54366542841234), ('https', 151.85039944652075), ('rt', 112.61998731390989), ('tesla', 95.96401470715628), ('xe2', 88.20944486346477)]
testla 0.3495243100660956


4. Apply TF-IDF for information retrieval

In [92]:
# NOTE(review): `query` is not defined in any visible cell; it must be set to
# the search string before this cell runs.
new_features = tfidf.transform([query])

# TfidfVectorizer L2-normalizes its rows by default, so the plain dot product
# computed by linear_kernel is the cosine similarity.
cosine_similarities = linear_kernel(new_features, corpus_tf_idf).flatten()
related_docs_indices = cosine_similarities.argsort()[::-1]  # most similar first

topk = 5
print('Top-{0} documents'.format(topk))
# Slice rather than index through range(topk), so a corpus with fewer than
# `topk` documents cannot raise IndexError.
for i, doc_index in enumerate(related_docs_indices[:topk]):
    print(i, original_documents[doc_index])

Top-5 documents
0 b'@ashwin7002 @NASA @faa @AFPAA We have not ruled that out.'
1 b"SpaceX could not do this without NASA. Can't express enough appreciation. https://t.co/uQpI60zAV7"
2 b'@NASA launched a rocket into the northern lights http://t.co/tR2cSeMV'
3 b'Whatever happens today, we could not have done it without @NASA, but errors are ours alone and me most of all.'
4 b'RT @NASA: Updated @SpaceX #Dragon #ISS rendezvous times: NASA TV coverage begins Sunday at 3:30amET: http://t.co/qrm0Dz4jPE. Grapple at  ...'
