In [1]:
import pandas as pd

# Question 2 - Clustering

In [2]:
dt = pd.read_csv('Reviews.csv', sep=",", usecols = [i for i in range(1,10)], nrows=30000)

In [3]:
dt.head()

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## Clean up data

In [4]:
dt.isnull().sum()

ProductId                 0
UserId                    0
ProfileName               1
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64

There is a very few number of null values. Let's fill them:

In [5]:
dt['ProfileName'].fillna('', inplace=True)
dt['Summary'].fillna('', inplace=True)

In [6]:
dt.isnull().sum()

ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64

## Tokenize the text 

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from langdetect import detect

def clean_text(text):
    stop_words = set(stopwords.words('english')) # obtain the stop words
    good_words = [] # save the correct words to consider like tokens
    tokenizer = RegexpTokenizer("[\w']+") # function to recognize the tokens
    words = tokenizer.tokenize(text) # tokenize the text 
    for word in words:
        # check if the word is lower and it isn't a stop word or a number
        if word.lower() not in stop_words and word.isalpha(): 
            word = PorterStemmer().stem(word) # use the stemmer function
            good_words.append(word.lower()) # insert the good token to lower case
        
    return good_words

In [8]:
%%time
dt["Text"] = dt.Text.apply(lambda x: clean_text(x))
dt.head()

Wall time: 40.3 s


Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,"[bought, sever, vital, can, dog, food, product..."
1,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,"[product, arriv, label, jumbo, salt, peanut, p..."
2,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all","[confect, around, centuri, light, pillowi, cit..."
3,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,"[look, secret, ingredi, robitussin, believ, fo..."
4,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,"[great, taffi, great, price, wide, assort, yum..."


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(preprocessor=lambda x: x, tokenizer=lambda x: x)
tfidf_matrix = tfidf.fit_transform(dt["Text"])

In [41]:
print(tfidf_matrix)

  (0, 784)	0.24748369376899845
  (0, 6406)	0.2844541302693669
  (0, 9690)	0.3506750275376341
  (0, 1573)	0.2717980505920156
  (0, 16064)	0.1758481296149753
  (0, 10733)	0.20828557197731726
  (0, 13758)	0.22417162903201923
  (0, 16715)	0.305427950904661
  (0, 10014)	0.08894733389566188
  (0, 10185)	0.14237168547998658
  (0, 14057)	0.16818308261767748
  (0, 7393)	0.09754734353955415
  (0, 6743)	0.14751690159582934
  (0, 13775)	0.3129872790711956
  (0, 6642)	0.12999374272860018
  (0, 5075)	0.13882650836197213
  (0, 2476)	0.19465786510000369
  (0, 19112)	0.35721589900848655
  (0, 15563)	0.18336830087840744
  (0, 1953)	0.14883311545887107
  (1, 14598)	0.27864537349229535
  (1, 8938)	0.2383967348170606
  (1, 18945)	0.22866446815217706
  (1, 5798)	0.2638253062008541
  (1, 17114)	0.140392702594762
  :	:
  (29999, 2924)	0.18216086464568282
  (29999, 15694)	0.15365972971998346
  (29999, 19704)	0.13875034140329554
  (29999, 11148)	0.2726220667086528
  (29999, 3511)	0.1434307115569506
  (29999, 48

In [29]:
c = tfidf_matrix.tocoo()                                                                                        
df = pd.DataFrame(c)                                  
df.head() 

TypeError: 'coo_matrix' object is not subscriptable