In [98]:
# https://github.com/parrt/msds692/blob/master/notes/tfidf.ipynb
import pandas as pd
import numpy as np


def n_gram(word_list, n):
	"""Return every contiguous run of ``n`` items from ``word_list``.

	Parameters
	----------
	word_list : sequence
		Tokens to slide the window over; must support ``len`` and slicing.
	n : int
		Window size.

	Returns
	-------
	list or None
		The windows in order of appearance, each one being the slice
		``word_list[i:i + n]``. An empty list when ``n`` is larger than
		``len(word_list)``. ``None`` when ``word_list`` is empty/``None`` or
		when ``n`` is ``None``, zero or negative.
	"""
	# A zero or negative window has no meaningful n-grams. Without the ``n < 0``
	# check a negative n would make the range below longer than the list and
	# yield truncated/empty slices instead of n-grams.
	if not word_list or not n or n < 0:
		return None

	return [word_list[idx:idx + n] for idx in range(len(word_list) - n + 1)]

In [99]:
d1 = "the new new york times"  # "new" appears twice on purpose
d2 = "the new york post"
d3 = "the los angeles times"

# Whitespace-tokenize every document string into a list of words.
docstrs = [d1, d2, d3]
docs = list(map(str.split, docstrs))
N = len(docstrs)
docs

[['the', 'new', 'new', 'york', 'times'],
 ['the', 'new', 'york', 'post'],
 ['the', 'los', 'angeles', 'times']]

# Get all unique tokens

In [100]:
def get_all_tokens(docs):
	"""Collect the vocabulary of a tokenized corpus.

	``docs`` is an iterable of token sequences. The result is a sorted list
	that contains each distinct token exactly once.
	"""
	vocabulary = {token for tokens in docs for token in tokens}
	return sorted(vocabulary)

In [101]:
# Sorted vocabulary of the corpus; its order fixes the column order of the bag-of-words matrix.
unique_tokens = get_all_tokens(docs=docs)

# Get bag-of-words matrix

In [102]:
def bow(unique_tokens, docs):
	"""Build a bag-of-words count matrix.

	Parameters
	----------
	unique_tokens : sequence
		Vocabulary; position ``j`` in this sequence is column ``j`` of the result.
	docs : sequence of sequences
		Tokenized documents; document ``i`` is row ``i`` of the result.

	Returns
	-------
	numpy.ndarray or None
		Float array of shape ``(len(docs), len(unique_tokens))`` where entry
		``[i, j]`` is how many times token ``j`` occurs in document ``i``.
		``None`` when either argument is empty or ``None``.

	Raises
	------
	ValueError
		If a document contains a token that is not in ``unique_tokens``.
	"""
	if not unique_tokens or not docs:
		return None

	# Map each token to its column once, instead of calling
	# ``unique_tokens.index(token)`` (a linear scan) for every token occurrence.
	# ``setdefault`` keeps the first position of a repeated token, which is the
	# position ``list.index`` would have returned.
	column_of = {}
	for col_idx, token in enumerate(unique_tokens):
		column_of.setdefault(token, col_idx)

	bow_matrix = np.zeros((len(docs), len(unique_tokens)))
	for row_idx, sentence in enumerate(docs):
		for token in sentence:
			if token not in column_of:
				# Keep the exception type that ``list.index`` raised for unknown tokens.
				raise ValueError("{!r} is not in unique_tokens".format(token))
			bow_matrix[row_idx, column_of[token]] += 1
	return bow_matrix

In [103]:
# Count matrix: one row per document, one column per vocabulary token.
bow_matrix = bow(unique_tokens=unique_tokens, docs=docs)
doc_label = ["doc" + str(doc_no) for doc_no in range(1, len(docs) + 1)]

# `tf` is the transposed table (tokens as rows, documents as columns);
# the value displayed by this cell is the untransposed documents-by-tokens table.
tf = pd.DataFrame(bow_matrix, index=doc_label, columns=unique_tokens).transpose()
pd.DataFrame(bow_matrix, index=doc_label, columns=unique_tokens)


Unnamed: 0,angeles,los,new,post,the,times,york
doc1,0.0,0.0,2.0,0.0,1.0,1.0,1.0
doc2,0.0,0.0,1.0,1.0,1.0,0.0,1.0
doc3,1.0,1.0,0.0,0.0,1.0,1.0,0.0


In [104]:
# Raw counts with tokens as rows and documents as columns.
tf

Unnamed: 0,doc1,doc2,doc3
angeles,0.0,0.0,1.0
los,0.0,0.0,1.0
new,2.0,1.0,0.0
post,0.0,1.0,0.0
the,1.0,1.0,1.0
times,1.0,0.0,1.0
york,1.0,1.0,0.0


In [105]:
# Term frequency for doc1: each token's count divided by the document's total token count.
tf["doc1"].div(tf["doc1"].sum())

angeles    0.0
los        0.0
new        0.4
post       0.0
the        0.2
times      0.2
york       0.2
Name: doc1, dtype: float64

# Get term frequency for all documents

In [106]:
# Term frequency = a token's count in a document / total tokens in that document.
# Loop over the document labels instead of repeating one line per document, so
# the cell keeps working when documents are added to or removed from `docs`.
# Column "d<k>_tf" holds the term frequencies of document "doc<k>".
for doc_number, doc_name in enumerate(doc_label, start=1):
	doc_counts = tf[doc_name]
	tf["d{}_tf".format(doc_number)] = doc_counts / doc_counts.sum()
tf

Unnamed: 0,doc1,doc2,doc3,d1_tf,d2_tf,d3_tf
angeles,0.0,0.0,1.0,0.0,0.0,0.25
los,0.0,0.0,1.0,0.0,0.0,0.25
new,2.0,1.0,0.0,0.4,0.25,0.0
post,0.0,1.0,0.0,0.0,0.25,0.0
the,1.0,1.0,1.0,0.2,0.25,0.25
times,1.0,0.0,1.0,0.2,0.0,0.25
york,1.0,1.0,0.0,0.2,0.25,0.0


# Get document frequency

In [107]:
# Start again from the raw token-by-document counts (not `tf`, which by now
# also carries the d*_tf columns).
tf_dc = pd.DataFrame(bow_matrix, index=doc_label, columns=unique_tokens).transpose()
tf_dc

Unnamed: 0,doc1,doc2,doc3
angeles,0.0,0.0,1.0
los,0.0,0.0,1.0
new,2.0,1.0,0.0
post,0.0,1.0,0.0
the,1.0,1.0,1.0
times,1.0,0.0,1.0
york,1.0,1.0,0.0


In [108]:
# Binarize: 1 when the token occurs in the document at least once, otherwise 0.
tf_dc = tf_dc.ge(1).mul(1)
tf_dc

Unnamed: 0,doc1,doc2,doc3
angeles,0,0,1
los,0,0,1
new,1,1,0
post,0,1,0
the,1,1,1
times,1,0,1
york,1,1,0


In [109]:
# Document frequency: the fraction of documents that contain each token,
# stored as a one-column frame named "df".
df = tf_dc.sum(axis=1).div(len(docs)).to_frame(name="df")
df

Unnamed: 0,df
angeles,0.333333
los,0.333333
new,0.666667
post,0.333333
the,1.0
times,0.666667
york,0.666667


In [110]:
# Counts plus the per-document term-frequency columns, shown again before combining with `df`.
tf

Unnamed: 0,doc1,doc2,doc3,d1_tf,d2_tf,d3_tf
angeles,0.0,0.0,1.0,0.0,0.0,0.25
los,0.0,0.0,1.0,0.0,0.0,0.25
new,2.0,1.0,0.0,0.4,0.25,0.0
post,0.0,1.0,0.0,0.0,0.25,0.0
the,1.0,1.0,1.0,0.2,0.25,0.25
times,1.0,0.0,1.0,0.2,0.0,0.25
york,1.0,1.0,0.0,0.2,0.25,0.0


In [111]:
# Place the document frequency next to the counts and term frequencies,
# aligned on the shared token index.
tf1 = pd.concat(objs=[tf, df], axis="columns")
tf1

Unnamed: 0,doc1,doc2,doc3,d1_tf,d2_tf,d3_tf,df
angeles,0.0,0.0,1.0,0.0,0.0,0.25,0.333333
los,0.0,0.0,1.0,0.0,0.0,0.25,0.333333
new,2.0,1.0,0.0,0.4,0.25,0.0,0.666667
post,0.0,1.0,0.0,0.0,0.25,0.0,0.333333
the,1.0,1.0,1.0,0.2,0.25,0.25,1.0
times,1.0,0.0,1.0,0.2,0.0,0.25,0.666667
york,1.0,1.0,0.0,0.2,0.25,0.0,0.666667


In [None]:
# NOTE(review): `tokenize_and_stem` and `synopses` are not defined in any cell
# visible above, and this cell has no execution count — presumably carried over
# from another notebook; confirm where they come from before running.
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer parameters: 1- to 3-gram features, English stop words, a custom
# tokenizer, and document-frequency cut-offs (min_df=0.2, max_df=0.8).
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

# %time is an IPython line magic that reports how long the statement took.
%time tfidf_matrix = tfidf_vectorizer.fit_transform(synopses) #fit the vectorizer to synopses

# Print the shape of the fitted matrix.
print(tfidf_matrix.shape)