In [1]:
import os
import sqlite3
import pandas as pd
import numpy as np
import nltk
import pickle 
from sklearn.feature_extraction.text import TfidfVectorizer # Tfidf implementation
import warnings
warnings.filterwarnings('ignore')

# Loading Dataset

In [2]:
# Load the preprocessed reviews table from the local SQLite file, if it exists.
if os.path.isfile('final.sqlite'):
    conn = sqlite3.connect('final.sqlite')
    try:
        # Pull the entire `reviews` table into a DataFrame.
        final = pd.read_sql_query('select * from reviews', conn)
    finally:
        # Always release the database handle, even when the query raises.
        conn.close()
else:
    # NOTE: this branch does not define `final`; the cells that read it
    # need the SQLite file to exist first.
    print('Please run Text Preprocessing code file')

# TF-IDF

In [3]:
# Fit a TF-IDF model over unigrams and bigrams of the cleaned review text.
tf_idf_vect = TfidfVectorizer(ngram_range=(1, 2))
cleaned_reviews = final['CleanedText'].values
final_tf_idf = tf_idf_vect.fit_transform(cleaned_reviews)

# The result is a sparse matrix: one row per entry of `CleanedText`,
# one column per unigram/bigram term, each cell holding a tf-idf weight.
print("The type of text Tfidf vectorizer : ", type(final_tf_idf))
print("The shape of text Tfidf vectorizer : ", final_tf_idf.shape)
print("Number of unique words including both unigrams and bigrams: ", final_tf_idf.shape[1])

The type of text Tfidf vectorizer :  <class 'scipy.sparse.csr.csr_matrix'>
The shape of text Tfidf vectorizer :  (87773, 1126245)
Number of unique words including both unigrams and bigrams:  1126245


In [4]:
# Vocabulary terms, ordered to match the columns of the TF-IDF matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(tf_idf_vect, 'get_feature_names_out'):
    # Convert the returned ndarray to a plain list so slicing/printing
    # looks the same as with the legacy API.
    features = tf_idf_vect.get_feature_names_out().tolist()
else:
    features = tf_idf_vect.get_feature_names()
print("some sample features(unique words in the corpus)\n", features[100:110])

some sample features(unique words in the corpus)
 ['abandon yet', 'abandon zico', 'abat', 'abat expens', 'abat steep', 'abb', 'abb perform', 'abbay', 'abbay flavigni', 'abberlin']


In [5]:
# printing top n tfidf features
def top_tfidf_feats(row, features, top_n=25):
    '''Return the `top_n` largest tf-idf values in `row` with their feature names.

    Parameters
    ----------
    row : 1-D array-like of numbers
        Tf-idf weights for a single document; position i corresponds to
        `features[i]`.
    features : sequence of str
        Feature names, indexable by the integer positions of `row`.
    top_n : int, default 25
        Maximum number of (feature, value) pairs to return.

    Returns
    -------
    pandas.DataFrame
        Columns `feature` and `tfidf`, sorted by descending tf-idf value.
        The frame is empty (but still has both columns) when nothing is
        selected, e.g. `top_n=0` or an empty `row`.
    '''
    # argsort is ascending, so reverse it and keep the first `top_n` positions.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor (instead of assigning
    # `df.columns` afterwards) also works when `top_feats` is empty.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

In [6]:
# Densify the sparse row at index 1 into a 1-D array, then rank its terms.
review_row_dense = final_tf_idf[1, :].toarray()[0]
toptfidf = top_tfidf_feats(review_row_dense, features, 25)

In [7]:
# Show the ranked (feature, tfidf) table as the cell's output.
toptfidf

Unnamed: 0,feature,tfidf
0,jumbo,0.33168
1,vendor intend,0.221679
2,error vendor,0.221679
3,intend repres,0.221679
4,sure error,0.221679
5,jumbo salt,0.221679
6,label jumbo,0.221679
7,unsalt sure,0.221679
8,size unsalt,0.221679
9,product jumbo,0.213989
