In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk

In [15]:
data = pd.read_csv('NLP_Data.csv')

In [16]:
comments = data[['Website','Brand', 'Comment#1', 'Comment#2']]
comments.head(2)

Unnamed: 0,Website,Brand,Comment#1,Comment#2
0,lowes,General Electric,Functional,Pros: fingerprint resistant so you don't have ...
1,lowes,Frigidaire,Ample Door Storage User Friendly Visibility,Feels solid and “upscale”. Excellent design of...


### 1.1 - Limpando Valores Nulos

In [17]:
comments.fillna('', inplace=True)
comments.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


Website      0
Brand        0
Comment#1    0
Comment#2    0
dtype: int64

### 1.2 - Juntar os dois tipos de comentário em um só

In [18]:
corpus =  comments.copy()

col_names = ['Comment#1', 'Comment#2']
corpus['Full_Comment'] = corpus.apply(lambda x: x[col_names[0]] + ' ' + x[col_names[1]], axis=1)
corpus.head(5)

Unnamed: 0,Website,Brand,Comment#1,Comment#2,Full_Comment
0,lowes,General Electric,Functional,Pros: fingerprint resistant so you don't have ...,Functional Pros: fingerprint resistant so you ...
1,lowes,Frigidaire,Ample Door Storage User Friendly Visibility,Feels solid and “upscale”. Excellent design of...,Ample Door Storage User Friendly Visibility Fe...
2,bestbuy_us,Whirlpool,So much room,I love this fridge. So much room over having a...,So much room I love this fridge. So much room ...
3,bestbuy_us,Whirlpool,So much room,I love this fridge. So much room over having a...,So much room I love this fridge. So much room ...
4,bestbuy_us,Whirlpool,So much room,I love this fridge. So much room over having a...,So much room I love this fridge. So much room ...


### 1.3 - Limpar o Texto. Pontuações, tudo em letra minúscula, stopwords, etc.

In [19]:
import re
import string
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
def clean_text(text):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [21]:
col_name = ['Full_Comment']
corpus['Full_Comment'] = corpus.apply(lambda x: clean_text(x[col_name[0]]), axis=1 )
corpus['Full_Comment'][0]

'functional pros fingerprint resistant so you dont have to constantly wipe it down drawers are good size and are plentiful water dispenser works great cons its noisy when making ice to the point you think one thing is wrong with it'

In [22]:
count_vector = CountVectorizer(stop_words='english')
words_count_data = count_vector.fit_transform(corpus.Full_Comment)
document_term_matrix = pd.DataFrame(words_count_data.toarray(), columns=count_vector.get_feature_names())
document_term_matrix.index = corpus.index
document_term_matrix.head()

Unnamed: 0,aaa,aahs,abd,abilities,ability,able,ablity,abnormal,abnormally,abound,aboveand,abrasive,abs,absolute,absolutely,absolutley,absolutly,absorb,absorbed,absurd,abundance,abundant,abused,ac,accents,accept,acceptable,accepted,acces,access,accessed,accessibility,accessible,accessing,accessories,accessory,accidental,accidentally,accidentallyyou,accidently,...,yearsslow,yellow,yep,yes,yesi,yesterday,yesterdayworking,yetand,yetit,yetlike,yielded,yikes,yin,yknow,yo,yogurt,yogurts,youd,youer,youl,youll,young,younger,youre,youth,youtube,youve,yoy,yr,yrs,yup,zero,zipper,zombie,zone,zones,zonked,ótima,ótimo,ótimogela
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
document_term_matrix.shape

(20473, 9561)

Tenho muitos valores. Tenho que de alguma forma melhorar isso, porque senão vai ser bem dificil trabalhar com uma tabela tão grande.

Algumas ideias:
- Agrupar por Site, e diminuir o numero de Linhas
- Stemming/Lemmanization (tenho palavras bem parecida como abilities, ability, ablity)
- Vejo que tenho palavras em portugues (ótimo, ótima) -> Posso juntar as stop words das duas linguas talvez

### 1.3.1 - Agrupando por site, e pegando o site com mais linhas que é o lowes

In [24]:
lowes_comments = corpus.copy()
lowes_comments = lowes_comments.loc[lowes_comments['Website'] == 'lowes']
lowes_comments.reset_index(inplace=True)
lowes_comments.head()

Unnamed: 0,index,Website,Brand,Comment#1,Comment#2,Full_Comment
0,0,lowes,General Electric,Functional,Pros: fingerprint resistant so you don't have ...,functional pros fingerprint resistant so you d...
1,1,lowes,Frigidaire,Ample Door Storage User Friendly Visibility,Feels solid and “upscale”. Excellent design of...,ample door storage user friendly visibility fe...
2,6,lowes,Frigidaire,My refrigerator stopped working in less than a...,We were away from the weekend and the frig wen...,my refrigerator stopped working in less than a...
3,7,lowes,Frigidaire,My refrigerator stopped working in less than a...,We were away from the weekend and the frig wen...,my refrigerator stopped working in less than a...
4,11,lowes,Whirlpool,Good Product,We bought this product for our office and they...,good product we bought this product for our of...


In [25]:
count_vector = CountVectorizer(stop_words='english')
words_count_data = count_vector.fit_transform(lowes_comments.Full_Comment)
lowes_dtm = pd.DataFrame(words_count_data.toarray(), columns=count_vector.get_feature_names())
lowes_dtm.index = lowes_comments.index
lowes_dtm.head()

Unnamed: 0,aaa,aahs,abd,ability,able,ablity,abnormal,abnormally,abound,abrasive,abs,absolute,absolutely,absolutley,absorb,absorbed,abused,accents,accept,acceptable,access,accessed,accessibility,accessible,accessing,accessories,accessory,accidental,accidentally,accidently,accommodate,accommodates,accommodating,accomodate,according,account,accrue,accumulate,accumulates,accumulation,...,writing,written,wrong,wrote,xl,xtra,ya,yank,yard,yay,yea,yeah,year,yearit,yearold,years,yearsslow,yes,yesi,yesterday,yesterdayworking,yetand,yin,yo,yogurt,yogurts,youd,youer,youl,youll,young,younger,youre,youtube,yr,yrs,zero,zipper,zone,zones
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [26]:
lowes_dtm.sum().sort_values(ascending=False).head(30)

fridge          6987
love            5170
refrigerator    4796
great           4645
ice             4563
freezer         3683
door            3490
space           3247
water           2384
bought          2342
like            2228
room            1974
good            1938
maker           1845
new             1830
features        1526
nice            1516
just            1405
ago             1404
old             1303
shelves         1282
size            1254
inside          1223
happy           1101
far             1094
month           1094
drawer          1035
really          1034
doors           1019
perfect          981
dtype: int64

### 1.3.2 - SnowBallStemming

In [27]:
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

In [28]:
def text_stem(text, stemmer):
    word_list = word_tokenize(text)
    text = [stemmer.stem(word) for word in word_list]
    text = ' '.join(text)
    return text

In [31]:
snowBallStemmer = SnowballStemmer("english")
col_name = ['Full_Comment']
lowes_comments['Full_Comment'] = lowes_comments.apply(lambda x: text_stem(x[col_name[0]], snowBallStemmer), axis=1 )
lowes_comments['Full_Comment'].head()

0    function pros fingerprint resist so you dont h...
1    ampl door storag user friend visibl feel solid...
2    my refriger stop work in less than a mon we we...
3    my refriger stop work in less than a mon we we...
4    good product we bought this product for our of...
Name: Full_Comment, dtype: object

In [32]:
count_vector = CountVectorizer(stop_words='english')
words_stem_count_data = count_vector.fit_transform(lowes_comments.Full_Comment)
dtm_stem = pd.DataFrame(words_stem_count_data.toarray(), columns=count_vector.get_feature_names())
dtm_stem.index = lowes_comments.index
dtm_stem.head()

Unnamed: 0,aaa,aah,ab,abd,abil,abl,abliti,abnorm,abound,abov,abras,absolut,absolutley,absorb,abus,accent,accept,access,accessori,accid,accident,accommod,accomod,accord,account,accru,accumul,accur,accuraci,accustom,acguir,act,action,activ,actual,actuat,ad,adapt,add,addit,...,wowwhat,wrap,wrench,wrinkl,write,written,wrong,wrote,xl,xtra,ya,yank,yard,yay,yea,yeah,year,yearit,yearold,yearsslow,yes,yesi,yesterday,yesterdaywork,yetand,yin,yo,yogurt,youd,youer,youl,youll,young,younger,youtub,yr,yrs,zero,zipper,zone
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [33]:
dtm_stem.shape

(9518, 4280)

In [34]:
dtm_stem.sum().sort_values(ascending=False).head(30)

fridg       7110
love        5418
refriger    5004
great       4671
ice         4593
door        4509
freezer     3702
space       3327
veri        2977
water       2385
like        2343
bought      2342
look        2087
featur      1994
room        1977
good        1963
maker       1909
new         1830
work        1794
onli        1738
month       1725
drawer      1673
lot         1624
nice        1574
shelv       1523
need        1414
just        1405
ago         1404
size        1339
purchas     1320
dtype: int64

### 1.3.2 - PorterStemming

In [35]:
from nltk.stem import PorterStemmer

In [36]:
porterStemmer = PorterStemmer()
col_name = ['Full_Comment']
lowes_comments['Full_Comment'] = lowes_comments.apply(lambda x: text_stem(x[col_name[0]], porterStemmer), axis=1 )
lowes_comments['Full_Comment'].head()

0    function pro fingerprint resist so you dont ha...
1    ampl door storag user friend visibl feel solid...
2    my refrig stop work in less than a mon we were...
3    my refrig stop work in less than a mon we were...
4    good product we bought thi product for our off...
Name: Full_Comment, dtype: object

In [37]:
count_vector = CountVectorizer(stop_words='english')
words_stem_count_data = count_vector.fit_transform(lowes_comments.Full_Comment)
dtm_stem_porter = pd.DataFrame(words_stem_count_data.toarray(), columns=count_vector.get_feature_names())
dtm_stem_porter.index = lowes_comments.index
dtm_stem_porter.head()

Unnamed: 0,aaa,aah,ab,abd,abil,abl,abliti,abnorm,abound,abov,abra,absolut,absolutley,absorb,abu,accent,accept,access,accessori,accid,accommod,accomod,accord,account,accru,accumul,accur,accuraci,accustom,acguir,act,action,activ,actual,actuat,ad,adapt,add,addit,additt,...,wow,wowwhat,wrap,wrench,wrinkl,write,written,wrong,wrote,xl,xtra,ya,yank,yard,yay,ye,yea,yeah,year,yearit,yearold,yearsslow,yesi,yesterday,yesterdaywork,yetand,yin,yo,yogurt,youd,youer,youl,youll,young,younger,youtub,yr,zero,zipper,zone
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [38]:
dtm_stem_porter.sum().sort_values(ascending=False).head(30)

thi        7927
fridg      7110
love       5418
refrig     5133
great      4671
ice        4593
door       4509
freezer    3702
space      3327
veri       2977
wa         2715
water      2527
like       2343
bought     2342
ha         2214
look       2087
featur     1994
room       1977
good       1963
maker      1911
new        1830
work       1794
onli       1738
month      1725
drawer     1673
lot        1624
nice       1577
shelv      1523
need       1414
just       1405
dtype: int64

Não Gostei dessa Porter Stemming, e apesar de ter ficado menor, o SnowBall nao me agradou muito também.

Vou continuar com a ideia de só pegar os comentários da Lowes e fazer sem stemming mesmo.

### 1.4 - Bigramas

In [39]:
from nltk.util import bigrams, trigrams

In [40]:
lowes_comments = corpus.copy()
lowes_comments = lowes_comments.loc[lowes_comments['Website'] == 'lowes']
lowes_comments.reset_index(inplace=True)
lowes_comments.head()

Unnamed: 0,index,Website,Brand,Comment#1,Comment#2,Full_Comment
0,0,lowes,General Electric,Functional,Pros: fingerprint resistant so you don't have ...,functional pros fingerprint resistant so you d...
1,1,lowes,Frigidaire,Ample Door Storage User Friendly Visibility,Feels solid and “upscale”. Excellent design of...,ample door storage user friendly visibility fe...
2,6,lowes,Frigidaire,My refrigerator stopped working in less than a...,We were away from the weekend and the frig wen...,my refrigerator stopped working in less than a...
3,7,lowes,Frigidaire,My refrigerator stopped working in less than a...,We were away from the weekend and the frig wen...,my refrigerator stopped working in less than a...
4,11,lowes,Whirlpool,Good Product,We bought this product for our office and they...,good product we bought this product for our of...


In [41]:
count_vector = CountVectorizer(stop_words='english', ngram_range=(2,2))
words_count_data = count_vector.fit_transform(lowes_comments.Full_Comment)
bigrams_dtm = pd.DataFrame(words_count_data.toarray(), columns=count_vector.get_feature_names())
bigrams_dtm.index = lowes_comments.index
bigrams_dtm.head()

Unnamed: 0,aaa batteries,aahs family,abd gets,ability adjust,ability change,ability connect,ability control,ability customize,ability easily,ability ice,ability list,ability place,ability play,ability set,ability shelves,ability store,able adjust,able arrange,able big,able buff,able buy,able change,able clean,able come,able compartments,able contents,able delivered,able easier,able easily,able exact,able figure,able fit,able fix,able food,able french,able fridge,able grab,able handle,able heavy,able hold,...,youre filling,youre going,youre handy,youre looking,youre stuck,youre using,youtube cooking,youtube definitely,youtube videos,yr extended,yr old,yrs ago,yrs bought,yrs fridge,yrs happy,yrs hope,yrs old,yrs proposed,yrs saw,yrs service,yrs time,yrs whirlpool,zero degrees,zero help,zero leaks,zero post,zero purchased,zero slot,zero star,zero stars,zero tempature,zipper left,zone cold,zone drawer,zone extra,zone haven,zone helps,zone tendency,zone want,zones nice
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [42]:
bigrams_dtm.sum().sort_values(ascending=False).head(50)

ice maker               1751
month ago                578
great features           507
great fridge             478
new fridge               462
water dispenser          452
lots room                420
french door              417
love fridge              387
water ice                373
ice water                368
weeks ago                350
bought month             339
works great              338
plenty room              314
love new                 304
fridge freezer           301
bought fridge            300
counter depth            293
new refrigerator         287
really like              277
stainless steel          273
bought refrigerator      271
looks great              263
fridge love              259
great refrigerator       259
old fridge               252
months ago               246
far good                 245
lots space               227
fridge bought            224
ago love                 220
year old                 210
love refrigerator        204
refrigerator b

Tá gerando muita palavra, quase 50.000. Vai crashar minha memória ram.

Vou tentar trabalhar com alguma marca. Vou pegar a Samsumg que é mais conhecida e tem só 1800 produtos

In [44]:
samsung_comments = corpus.loc[corpus['Brand'] == 'Samsung']
samsung_comments.reset_index(inplace=True)
samsung_comments.head()

Unnamed: 0,index,Website,Brand,Comment#1,Comment#2,Full_Comment
0,24,bestbuy_us,Samsung,Average,Disappointed by the water dispenser and the ic...,average disappointed by the water dispenser an...
1,25,bestbuy_us,Samsung,Average,Disappointed by the water dispenser and the ic...,average disappointed by the water dispenser an...
2,26,bestbuy_us,Samsung,Average,Disappointed by the water dispenser and the ic...,average disappointed by the water dispenser an...
3,51,bestbuy_us,Samsung,Great fridge BUT,I’ve only had this fridge for about 5 months a...,great fridge but i’ve only had this fridge for...
4,61,homedepot,Samsung,It’s gorgeous and very spacious. I bought it a...,It’s gorgeous and very spacious. \r\n\r\nI bou...,it’s gorgeous and very spacious i bought it as...


In [45]:
count_vector = CountVectorizer(stop_words='english', ngram_range=(2,2))
words_count_data = count_vector.fit_transform(samsung_comments.Full_Comment)
samsung_dtm = pd.DataFrame(words_count_data.toarray(), columns=count_vector.get_feature_names())
samsung_dtm.index = samsung_comments.index
samsung_dtm.head()

Unnamed: 0,aaa batteries,ability beer,ability change,ability fit,ability list,ability listen,ability organize,ability play,ability produce,ability repaired,ability stay,ability temperature,ability turn,able access,able actually,able adjust,able arrange,able buff,able bunch,able connect,able contents,able figure,able fit,able fold,able grab,able half,able listen,able load,able look,able make,able pack,able program,able pull,able refrigerator,able remove,able store,able stream,able whats,absolute beloved,absolute excitement,...,youtube keeps,youtube needed,youtube refrigerator,youtube smart,youve fridge,yr old,yr warranty,yrs ago,yrs linesturdy,yrs old,yrs previously,yrs saw,yup service,zero complaints,zero degrees,zero help,zero ice,zero issues,zero problems,zero stars,zero tempature,zipper left,zone add,zone arent,zone awesome,zone beer,zone change,zone compartment,zone drawer,zone drawers,zone extra,zone freezer,zone haven,zone helps,zone refrigerator,zone use,zone used,zone want,zones nice,zonked year
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [46]:
samsung_dtm.sum().sort_values(ascending=False).head(30)

ice maker               478
great fridge            178
stainless steel         167
family hub              166
samsung refrigerator    136
french door             133
water pitcher           128
counter depth           122
water dispenser         119
crushed ice             118
great refrigerator      104
love fridge             101
samsung fridge           98
water ice                90
ice water                90
love new                 88
flex zone                86
new samsung              84
love love                83
flex drawer              81
food showcase            79
cu ft                    77
fridge freezer           71
really like              68
new refrigerator         66
showcase door            64
love refrigerator        63
months ago               62
fridge love              60
new fridge               60
dtype: int64