In [55]:
import pandas as pd
import numpy as np

# Question 2 - Clustering

In [106]:
dt = pd.read_csv('Reviews.csv', sep=",", usecols = [i for i in range(1,10)])

In [107]:
dt.head()

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [108]:
dt.describe()

Unnamed: 0,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,568454.0,568454.0,568454.0,568454.0
mean,1.743817,2.22881,4.183199,1296257000.0
std,7.636513,8.28974,1.310436,48043310.0
min,0.0,0.0,1.0,939340800.0
25%,0.0,0.0,4.0,1271290000.0
50%,0.0,1.0,5.0,1311120000.0
75%,2.0,2.0,5.0,1332720000.0
max,866.0,923.0,5.0,1351210000.0


## Clean up data

In [109]:
dt.isnull().sum()

ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

There is a very few number of null values. Let's fill them:

In [110]:
dt['ProfileName'].fillna('', inplace=True)
dt['Summary'].fillna('', inplace=True)

In [111]:
dt.isnull().sum()

ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64

## Tokenize the text 

In [112]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from langdetect import detect

def clean_text(text):
    stop_words = set(stopwords.words('english')) # obtain the stop words
    good_words = [] # save the correct words to consider like tokens
    tokenizer = RegexpTokenizer("[\w']+") # function to recognize the tokens
    words = tokenizer.tokenize(text) # tokenize the text 
    for word in words:
        # check if the word is lower and it isn't a stop word or a number
        if word.lower() not in stop_words and word.isalpha(): 
            word = PorterStemmer().stem(word) # use the stemmer function
            good_words.append(word.lower()) # insert the good token to lower case
        
    return good_words

In [115]:
%%time
# use the interest column..
dt2 = dt.copy()
dt2 = dt2[["Text"]]
dt2["Text"] = [ clean_text(x) for x in list(dt2["Text"])]
dt2.head()

Wall time: 12min 28s


Unnamed: 0,Text
0,"[bought, sever, vital, can, dog, food, product..."
1,"[product, arriv, label, jumbo, salt, peanut, p..."
2,"[confect, around, centuri, light, pillowi, cit..."
3,"[look, secret, ingredi, robitussin, believ, fo..."
4,"[great, taffi, great, price, wide, assort, yum..."


In [116]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x)
tfidf_matrix = tfidf.fit_transform(dt2["Text"])

Wall time: 11.4 s


In [117]:
%%time
dn = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix)

Wall time: 18.9 s


In [118]:
dn

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,74345,74346,74347,74348,74349,74350,74351,74352,74353,74354
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
568449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
568450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
568451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
568452,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [144]:
import math
dn2 = dn.copy()



print(np.quantile(values, .25))

0.14663775149758526
0.09760838588103153
0.18748458748660163
0.12177068811868613
0.0782870038231037


## One long.. solution

In [66]:
dictionary = tfidf.vocabulary_
dictionary = dict(zip(dictionary.values(), dictionary.keys()))
dictionary

{1953: 'bought',
 15563: 'sever',
 19112: 'vital',
 2476: 'can',
 5075: 'dog',
 6642: 'food',
 13775: 'product',
 6743: 'found',
 7393: 'good',
 14057: 'qualiti',
 10185: 'look',
 10014: 'like',
 16715: 'stew',
 13758: 'process',
 10733: 'meat',
 16064: 'smell',
 1573: 'better',
 9690: 'labrador',
 6406: 'finicki',
 784: 'appreci',
 885: 'arriv',
 9682: 'label',
 9319: 'jumbo',
 15134: 'salt',
 12844: 'peanut',
 150: 'actual',
 16046: 'small',
 15912: 'size',
 18692: 'unsalt',
 17114: 'sure',
 5798: 'error',
 18945: 'vendor',
 8938: 'intend',
 14598: 'repres',
 3665: 'confect',
 876: 'around',
 2811: 'centuri',
 9999: 'light',
 13175: 'pillowi',
 3242: 'citru',
 7112: 'gelatin',
 11968: 'nut',
 2671: 'case',
 6373: 'filbert',
 4250: 'cut',
 17837: 'tini',
 16538: 'squar',
 9965: 'liber',
 3386: 'coat',
 13540: 'powder',
 16984: 'sugar',
 11368: 'mouth',
 8052: 'heaven',
 3000: 'chewi',
 6496: 'flavor',
 8175: 'highli',
 14356: 'recommend',
 19964: 'yummi',
 18084: 'treat',
 6162: 'fami

In [77]:
%%time

tfidf_matrix = tfidf_matrix.tocoo()
values = list(tfidf_matrix.data)
col_names = list(tfidf_matrix.col)

df = pd.DataFrame(0.000, index=np.arange(len(dt)), columns = [dictionary[x] for x in dictionary.keys()])
df

Wall time: 2.32 s


Unnamed: 0,bought,sever,vital,can,dog,food,product,found,good,qualiti,...,sunfood,tablesalt,unquestion,biofuel,ike,tempertur,wasten,cornstart,suppermarket,verstil
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
%%time
values = list(tfidf_matrix.data)
col_names = list(tfidf_matrix.col)
doc = list(tfidf_matrix.row)
list_of_list = [ [doc[x], dictionary[col_names[x]], round(values[x],3)] for x in range(len(col_names))]
list_of_list

Wall time: 5.41 s


[[0, 'appreci', 0.247],
 [0, 'finicki', 0.284],
 [0, 'labrador', 0.351],
 [0, 'better', 0.272],
 [0, 'smell', 0.176],
 [0, 'meat', 0.208],
 [0, 'process', 0.224],
 [0, 'stew', 0.305],
 [0, 'like', 0.089],
 [0, 'look', 0.142],
 [0, 'qualiti', 0.168],
 [0, 'good', 0.098],
 [0, 'found', 0.148],
 [0, 'product', 0.313],
 [0, 'food', 0.13],
 [0, 'dog', 0.139],
 [0, 'can', 0.195],
 [0, 'vital', 0.357],
 [0, 'sever', 0.183],
 [0, 'bought', 0.149],
 [1, 'repres', 0.279],
 [1, 'intend', 0.238],
 [1, 'vendor', 0.229],
 [1, 'error', 0.264],
 [1, 'sure', 0.14],
 [1, 'unsalt', 0.278],
 [1, 'size', 0.139],
 [1, 'small', 0.139],
 [1, 'actual', 0.143],
 [1, 'peanut', 0.34],
 [1, 'salt', 0.148],
 [1, 'jumbo', 0.601],
 [1, 'label', 0.185],
 [1, 'arriv', 0.156],
 [1, 'product', 0.175],
 [2, 'sister', 0.143],
 [2, 'brother', 0.145],
 [2, 'sell', 0.108],
 [2, 'edmund', 0.22],
 [2, 'seduc', 0.228],
 [2, 'wardrob', 0.213],
 [2, 'witch', 0.456],
 [2, 'lion', 0.201],
 [2, 'c', 0.121],
 [2, 'stori', 0.143],
 [2,

In [80]:
%%time
for i, j, k in list_of_list:
    df.loc[i, j] = k

KeyboardInterrupt: 

In [None]:
df

## Pre-processed