In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.model_selection import train_test_split

In [3]:
# Load data/shirt_review.csv into df; every later cell derives from this frame.
df = pd.read_csv('data/shirt_review.csv')

In [4]:
# Let pandas print up to 100 rows before truncating displayed output.
pd.options.display.max_rows = 100

In [5]:
# Review counts for the 10 ASINs that occur most often in df, as a one-column frame.
top_review = df['asin'].value_counts().head(10).to_frame()

In [6]:
# Total number of reviews across the first nine rows of top_review.
top_review.iloc[:9].sum()

asin    28816
dtype: int64

In [7]:
# Show the ASINs that make up top_review (its index).
top_review.index

Index(['B000BRETA4', 'B009OCZ84I', 'B001OQ67XI', 'B000BRFH1E', 'B00DIX5SVM',
       'B009LJW5HM', 'B004IZVE6W', 'B00KC3DANE', 'B008FV74N2', 'B0002FHJ66'],
      dtype='object')

In [8]:
# Load data/shirt_meta.csv into meta; only its 'title' column is read below.
meta = pd.read_csv('data/shirt_meta.csv')

In [9]:
# Single-column frame holding just the 'title' column of meta.
title = meta['title'].to_frame()

In [10]:
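# Entries of top_review (by position in top_review.index) that are left out of the ASIN list below:
# 4: Hanes Women's Wear Around Nightshirt
# 7: Vobaga Women's Bird Heart Geometric Print Shor..
# 8: Dream USA Men's Casual 3/4 Sleeve Baseball Tsh...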

In [11]:
# ASINs kept for the analysis: positions 0-3, 5 and 6 of top_review.index.
asin = [
    'B000BRETA4',
    'B009OCZ84I',
    'B001OQ67XI',
    'B000BRFH1E',
    'B009LJW5HM',
    'B004IZVE6W',
]

In [12]:
# Keep only the reviews whose ASIN is in the selected list.
sweatshirt = df.loc[df['asin'].isin(asin)]

In [13]:
#sweatshirt.to_csv('sweatshirt.csv')

In [14]:
# Narrow the subset to the five columns listed here.
# NOTE(review): this rebinds `sweatshirt`, so the cell only works once per run
# of the filtering cell above if any other column is needed later.
sweatshirt = sweatshirt[['overall','vote','asin','style','reviewText']]

In [15]:
# Renumber the rows 0..n-1, discarding the row labels inherited from df.
sweatshirt = sweatshirt.reset_index(drop=True)

In [16]:
# Mean of the 'overall' column per ASIN (presumably the star rating -- confirm against the data source).
sweatshirt.groupby('asin')['overall'].mean()

asin
B000BRETA4    4.389073
B000BRFH1E    4.396355
B001OQ67XI    4.413759
B004IZVE6W    4.255738
B009LJW5HM    4.407365
B009OCZ84I    4.418659
Name: overall, dtype: float64

In [17]:
# Working frame for the text analysis: just overall, asin and reviewText.
X = sweatshirt[['overall','asin','reviewText']]

In [18]:
# Drop every row that has a missing value in any of the three columns.
# The original row labels are kept, so X's index is no longer contiguous after this.
X = X.dropna()

In [19]:
# (rows, columns) remaining after the missing-value drop.
X.shape

(21507, 3)

In [20]:
# X is already a DataFrame; this re-wraps it under a second name, df_X,
# which the length filter and sampling cells below refer to.
df_X = pd.DataFrame(X)

In [21]:
# Positional indices (for .iloc) of the reviews whose text is longer than 100.
# len() is applied to each reviewText value, so for strings this counts
# characters, not words.
len_100 = [
    position
    for position, review in enumerate(df_X['reviewText'])
    if len(review) > 100
]

In [22]:
#df_X.iloc[len_100].to_csv('sweatshirt_100words.csv')

In [23]:
# Draw a reproducible 1,200-review sample (random_state=42) from the selected
# reviews whose text is longer than 100 (the positions collected in len_100);
# `outsample` receives the remaining rows.
#
# Fixes relative to the earlier version of this cell:
#   * it split the full `df` (every product, every column) even though the
#     commented-out export is named sweatshirt_sample1200.csv and len_100 was
#     computed for exactly this purpose but never used;
#   * it passed train_size as the float 1200/n, which scikit-learn multiplies
#     back by n and floors, so floating-point rounding could yield 1199 rows.
#     An integer train_size requests exactly 1200 rows.
# Raises ValueError if fewer than 1201 rows pass the length filter.
insample, outsample = train_test_split(
    df_X.iloc[len_100],
    train_size=1200,
    random_state=42,
)

In [24]:
#insample.to_csv('sweatshirt_sample1200.csv')

### Choose Categories

**TF-IDF with Clustering**

In [25]:
# Fit a TF-IDF vocabulary on the review text (scikit-learn's built-in English
# stop-word list is removed) and transform the same reviews into a
# document-term matrix, one row per review in X.
vectorizer = TfidfVectorizer(stop_words = 'english')
X_vector = vectorizer.fit_transform(X['reviewText'])

In [26]:
# Convert the TF-IDF matrix to a dense NumPy array for the DataFrame step below.
# NOTE(review): not idempotent -- re-running this cell alone fails because an
# ndarray has no .toarray(); re-run the vectorizer cell first.
# NOTE(review): the dense copy is rows x vocabulary floats and may be memory-heavy.
X_vector = X_vector.toarray()

**KMeans by all reviews**

In [27]:
# Cluster the TF-IDF rows into 10 groups with a fixed seed.
# n_init is pinned to 10, the value older scikit-learn releases used by
# default. Newer releases default to n_init='auto', which runs a single
# k-means++ initialisation and can produce different clusters from the ones
# recorded in the outputs below.
kmeans = KMeans(n_clusters=10, random_state=0, n_init=10).fit(X_vector)

In [28]:
# Cluster id (0-9) that KMeans assigned to each row of X_vector.
label = kmeans.labels_

In [29]:
# Wrap the dense TF-IDF array in a DataFrame; columns get integer labels
# 0..n_terms-1 that line up with the vectorizer's feature order.
df1 = pd.DataFrame(X_vector)

In [30]:
# Attach each row's cluster id as an extra column named 'label'
# (the only non-integer column label in df1).
df1['label'] = label

In [37]:
# Display the dense TF-IDF array (NumPy abbreviates the printout).
X_vector

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
# Distinct cluster ids and the number of reviews assigned to each.
np.unique(label,return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32),
 array([10856,  3181,  1046,   992,   960,   770,  1173,  1358,   669,
          502]))

In [32]:
# Lookup table from TF-IDF column position to vocabulary term (column 0).
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and fall back to the old
# method only on installations that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    df_feature = pd.DataFrame(vectorizer.get_feature_names_out())
else:
    df_feature = pd.DataFrame(vectorizer.get_feature_names())

In [33]:
# One sub-frame per cluster id 0..9, each holding only the rows of df1
# assigned to that cluster; list1[k] corresponds to cluster k.
list1 = [df1[df1['label'] == cluster_id] for cluster_id in range(10)]

In [34]:
# For every cluster, collect the vocabulary terms whose highest TF-IDF value
# inside that cluster ties with the cluster-wide maximum.
features = defaultdict(list)
for cluster_id in range(10):
    # TF-IDF columns only; the cluster id column is not a term.
    cluster_tfidf = list1[cluster_id].drop(columns=['label'])
    # Per-term maximum within the cluster, ordered from highest to lowest.
    term_peaks = pd.DataFrame({'tfidf': cluster_tfidf.max()}).sort_values(
        by='tfidf', ascending=False
    )
    best_score = term_peaks['tfidf'].max()
    top_terms = term_peaks[term_peaks['tfidf'] == best_score]
    # The index of top_terms holds TF-IDF column positions; translate to words.
    for term_position in top_terms.index:
        features[cluster_id].append(df_feature.iloc[term_position][0])

In [38]:
# Terms tied for the highest TF-IDF value in cluster 0 (the largest cluster).
features[0]  # feature: size, material, comfort, color

['bulky',
 'genial',
 'thanks',
 'right',
 'coo',
 'thicker',
 'hoped',
 'returned',
 'white',
 'thank',
 'return',
 'add',
 'gift',
 'hoodie',
 'thumbs',
 'content',
 'hot',
 'cool',
 'dope',
 'wonderful',
 'says',
 'satisfied',
 'amazing',
 'alright',
 'sweet',
 'bs',
 'need',
 'wish',
 'ilikeitalot',
 'bueno',
 'ripped',
 'rip',
 'hshshshshsjs',
 'advertised',
 'noice',
 'holes',
 'eh',
 'want',
 'really',
 'warming',
 'excelent',
 'excelente',
 'goodb',
 'wanted',
 'excellence',
 'torn',
 'excellent',
 'comforable',
 'vien',
 'perefect',
 'quality',
 'exellent',
 'grrreat',
 'gusto',
 'complaints',
 'recomended',
 'happy',
 'harmless',
 'ordering',
 'hdufivncvi',
 'tthank',
 'wavy',
 'order',
 'orange',
 'a1',
 'purchase',
 'confortable',
 'okay',
 'buy',
 'fuzballs',
 'aight',
 'awesome',
 'lightbweight',
 'problems',
 '10',
 'shrink',
 'liked',
 'soso',
 'likes',
 'mailbox',
 'yup',
 'write',
 'shrunk',
 'wrong',
 'soft',
 'snuggly',
 'shame',
 'fits',
 'bad',
 'dafawsfawsfgsgfg'

In [39]:
# Terms tied for the highest TF-IDF value in cluster 1.
features[1]  # feature: size

['little', 'large', 'small', 'like', 'big', 'ordered']

In [40]:
# Terms tied for the highest TF-IDF value in cluster 2.
features[2]  # feature: comfort, warm

['comfortable', 'warm']

In [41]:
# Terms tied for the highest TF-IDF value in cluster 3.
features[3]

['expected']

In [42]:
# Terms tied for the highest TF-IDF value in cluster 4.
features[4]

['nice']

In [43]:
# Terms tied for the highest TF-IDF value in cluster 5.
features[5]

['perfect']

In [44]:
# Terms tied for the highest TF-IDF value in cluster 6.
features[6]

['good']

In [45]:
# Terms tied for the highest TF-IDF value in cluster 7.
features[7]

['great']

In [46]:
# Terms tied for the highest TF-IDF value in cluster 8.
features[8]  # feature: comfort

['comfy', 'ok']

In [47]:
# Terms tied for the highest TF-IDF value in cluster 9.
features[9]

['love']

In [None]:
# features = [size, material, comfort, warm, color]