In [38]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from langdetect import detect
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Question 2 - Clustering

In [2]:
# Load the reviews file, keeping CSV columns 1 through 9 (column 0 is skipped).
dt = pd.read_csv('Reviews.csv', usecols=list(range(1, 10)))

In [3]:
# Preview the first rows of the loaded frame.
dt.head()

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the frame's columns.
dt.describe()

Unnamed: 0,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,568454.0,568454.0,568454.0,568454.0
mean,1.743817,2.22881,4.183199,1296257000.0
std,7.636513,8.28974,1.310436,48043310.0
min,0.0,0.0,1.0,939340800.0
25%,0.0,0.0,4.0,1271290000.0
50%,0.0,1.0,5.0,1311120000.0
75%,2.0,2.0,5.0,1332720000.0
max,866.0,923.0,5.0,1351210000.0


## Clean up data

In [5]:
# Count the missing values in each column.
dt.isna().sum()

ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

There are very few null values. Let's fill them with empty strings:

In [6]:
# Replace missing profile names and summaries with empty strings.
# Calling fillna(..., inplace=True) on a column selection (dt['col']) acts on
# an intermediate object: pandas warns about this chained pattern and, with
# copy-on-write enabled, `dt` itself is left unchanged. Filling at the frame
# level with a per-column mapping updates exactly the intended columns.
dt = dt.fillna({'ProfileName': '', 'Summary': ''})

In [7]:
# Re-count the missing values per column to confirm the fill.
dt.isna().sum()

ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64

## Tokenize the text 

In [8]:
# Collapse the review-level frame to one row per product: all review texts of a
# product are joined with single spaces, and likewise all of its summaries
# (summaries are passed through str() before joining).
new_df = (
    dt.groupby('ProductId')
    .agg(
        reviews=('Text', lambda texts: " ".join(list(texts))),
        summary=('Summary', lambda summaries: " ".join(map(str, summaries))),
    )
    .reset_index()
)

# Plain-list views of the three columns, kept under their original names.
product_id = new_df['ProductId'].tolist()
reviews = new_df['reviews'].tolist()
summary = new_df['summary'].tolist()

In [11]:
# Lazily-filled cache of the NLTK helpers used by clean_text, so repeated calls
# do not re-read the stop-word corpus or rebuild the tokenizer and stemmer.
_CLEAN_TEXT_TOOLS = {}


def clean_text(text):
    """Tokenize ``text`` and return its stemmed content words.

    Tokens are runs of word characters and apostrophes. A token is kept only
    if its lower-cased form is not an English stop word and the token is
    purely alphabetic (so numbers and tokens containing digits, underscores
    or apostrophes are dropped). Kept tokens are Porter-stemmed and
    lower-cased.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        The stemmed, lower-cased tokens in their original order.
    """
    if not _CLEAN_TEXT_TOOLS:
        # Built on first use rather than at definition time, so defining the
        # function never touches the NLTK corpus.
        _CLEAN_TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        # Raw string: "\w" in a normal string literal is an invalid escape.
        _CLEAN_TEXT_TOOLS['tokenizer'] = RegexpTokenizer(r"[\w']+")
        _CLEAN_TEXT_TOOLS['stemmer'] = PorterStemmer()

    stop_words = _CLEAN_TEXT_TOOLS['stop_words']
    stem = _CLEAN_TEXT_TOOLS['stemmer'].stem

    good_words = []
    for word in _CLEAN_TEXT_TOOLS['tokenizer'].tokenize(text):
        # Stop-word check is case-insensitive; isalpha() filters out numbers.
        if word.lower() not in stop_words and word.isalpha():
            good_words.append(stem(word).lower())

    return good_words

In [12]:
%%time
# use the interest column..
new_df["reviews"] = [clean_text(x) for x in list(new_df["reviews"])]
new_df.head()

Wall time: 8min 13s


Unnamed: 0,ProductId,reviews,summary
0,0006641040,"[day, person, say, chicken, soup, probabl, go,...",Read it once. Read it twice. Reading Chicken S...
1,141278509X,"[product, archer, farm, best, drink, mix, ever...",The best drink mix
2,2734888454,"[dog, love, chicken, product, china, wont, buy...",made in china Dog Lover Delites
3,2841233731,"[book, easi, read, ingredi, avail, store, unli...",Great recipe book for my babycook
4,7310172001,"[product, health, snack, pup, made, beef, live...",very good Dogs Love These! Fast shipment Dogs ...


In [37]:
# Preview the first rows of the per-product frame.
new_df.head()

Unnamed: 0,ProductId,reviews,summary
0,0006641040,"[day, person, say, chicken, soup, probabl, go,...",Read it once. Read it twice. Reading Chicken S...
1,141278509X,"[product, archer, farm, best, drink, mix, ever...",The best drink mix
2,2734888454,"[dog, love, chicken, product, china, wont, buy...",made in china Dog Lover Delites
3,2841233731,"[book, easi, read, ingredi, avail, store, unli...",Great recipe book for my babycook
4,7310172001,"[product, health, snack, pup, made, beef, live...",very good Dogs Love These! Fast shipment Dogs ...


In [54]:
%%time

tfidf = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x, min_df = 0.007)
tfidf_matrix = tfidf.fit_transform(new_df["reviews"])

Wall time: 8.08 s


In [55]:
%%time
# Number of distinct terms in the fitted TF-IDF vocabulary.
len(tfidf.vocabulary_)

Wall time: 0 ns


2467

In [57]:
%%time
# Wrap the sparse TF-IDF matrix in a sparse-backed DataFrame.
dn = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix)

Wall time: 183 ms


In [34]:
# Display the sparse TF-IDF frame.
# NOTE(review): the saved output of this cell has execution count 34, i.e. it
# predates the current vectorizer fit (count 54), and shows 6718 columns while
# the fitted vocabulary has 2467 terms. Re-run the cell to refresh the output.
dn

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6708,6709,6710,6711,6712,6713,6714,6715,6716,6717
0,0.0,0.0,0.0,0.0,0.0,0.012593,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.006539,0.0,0.0,0.0,0.0,...,0.0,0.0,0.002824,0.003352,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74253,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
74254,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
74255,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
74256,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
# Reduce the TF-IDF features to 400 latent components.
# TruncatedSVD's default solver is randomized, so the seed is fixed to make the
# components and the explained-variance figures reproducible across runs.
svd = TruncatedSVD(n_components=400, random_state=42)
svd.fit(dn)
# Per-component share of variance, then the cumulative share as a percentage.
print(svd.explained_variance_ratio_)
print(svd.explained_variance_ratio_.sum()*100)

[0.01297698 0.02538535 0.01840879 0.01474235 0.0111978  0.00991263
 0.00918543 0.00734789 0.00659676 0.00627754 0.00587669 0.00568613
 0.00510507 0.00481932 0.00457349 0.00451413 0.00435478 0.00428394
 0.00406694 0.00393502 0.00392526 0.00377064 0.00372454 0.00369229
 0.00358547 0.00351555 0.00335953 0.00329535 0.00324139 0.00321252
 0.00314822 0.00310565 0.00302313 0.0029435  0.00290514 0.00286603
 0.00274217 0.00273052 0.00265358 0.00263868 0.00257105 0.00255888
 0.0025204  0.00250617 0.00247551 0.00244224 0.00240432 0.00239653
 0.0023504  0.00231252 0.00230032 0.00227489 0.00223199 0.00222799
 0.00220319 0.00216437 0.00212646 0.00211184 0.00208605 0.00206654
 0.00204797 0.00203442 0.00201499 0.00199216 0.00197856 0.00192888
 0.00192321 0.00190998 0.00189695 0.00188058 0.00185895 0.00182396
 0.00180933 0.00179193 0.00178873 0.00176936 0.00176677 0.00176136
 0.00174323 0.00172949 0.00171345 0.00170422 0.0016754  0.00167091
 0.00166747 0.00165873 0.00164248 0.00163634 0.00162241 0.0016

## One long (slow) solution

In [None]:
# Invert the fitted vocabulary (term -> column index) into column index -> term.
dictionary = {index: term for term, index in tfidf.vocabulary_.items()}
dictionary

In [77]:
%%time

tfidf_matrix = tfidf_matrix.tocoo()
values = list(tfidf_matrix.data)
col_names = list(tfidf_matrix.col)

df = pd.DataFrame(0.000, index=np.arange(len(dt)), columns = [dictionary[x] for x in dictionary.keys()])
df

Wall time: 2.32 s


Unnamed: 0,bought,sever,vital,can,dog,food,product,found,good,qualiti,...,sunfood,tablesalt,unquestion,biofuel,ike,tempertur,wasten,cornstart,suppermarket,verstil
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
%%time
values = list(tfidf_matrix.data)
col_names = list(tfidf_matrix.col)
doc = list(tfidf_matrix.row)
list_of_list = [ [doc[x], dictionary[col_names[x]], round(values[x],3)] for x in range(len(col_names))]
list_of_list

In [80]:
%%time
for i, j, k in list_of_list:
    df.loc[i, j] = k

KeyboardInterrupt: 

In [None]:
# Display the dense term-weight frame.
df

## Pre-processed