In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [13]:
#import data
reviews = pd.read_csv('reviews.csv')
 
#print column names
print(reviews.columns)

Index(['clothing_id', 'age', 'review_title', 'review_text', 'recommended',
       'division_name', 'department_name', 'review_date', 'rating'],
      dtype='object')


In [14]:
#print .info
print(reviews.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   clothing_id      5000 non-null   int64 
 1   age              5000 non-null   int64 
 2   review_title     4174 non-null   object
 3   review_text      4804 non-null   object
 4   recommended      5000 non-null   bool  
 5   division_name    4996 non-null   object
 6   department_name  4996 non-null   object
 7   review_date      5000 non-null   object
 8   rating           5000 non-null   object
dtypes: bool(1), int64(2), object(6)
memory usage: 317.5+ KB
None


In [15]:
#look at the counts of recommended
print(reviews.recommended.value_counts())

recommended
True     4166
False     834
Name: count, dtype: int64


In [16]:
#create binary dictionary
binary_dict = {True:1, False:0}

#transform column
reviews.recommended = reviews.recommended.map(binary_dict)
 
#print your transformed column
print(reviews.recommended.value_counts())


recommended
1    4166
0     834
Name: count, dtype: int64


In [17]:
#look at the counts of rating
print(reviews.rating.value_counts())

rating
Loved it     2798
Liked it     1141
Was okay      564
Not great     304
Hated it      193
Name: count, dtype: int64


In [18]:
#create dictionary
rating_dict = {'Loved it':5, 'Liked it':4, 'Was okay':3, 'Not great':2, 'Hated it': 1}

#transform rating column
reviews.rating = reviews.rating.map(rating_dict)

#print your transformed column values
print(reviews.rating.value_counts())

rating
5    2798
4    1141
3     564
2     304
1     193
Name: count, dtype: int64


In [19]:
#get the number of categories in a feature
print(reviews.department_name)

#perform get_dummies
one_hot = pd.get_dummies(reviews.department_name)

#join the new columns back onto the original
reviews = reviews.join(one_hot)

#print column names
print(reviews.columns)

0        Dresses
1        Dresses
2       Intimate
3        Dresses
4        Dresses
          ...   
4995        Tops
4996        Tops
4997     Dresses
4998     Bottoms
4999        Tops
Name: department_name, Length: 5000, dtype: object
Index(['clothing_id', 'age', 'review_title', 'review_text', 'recommended',
       'division_name', 'department_name', 'review_date', 'rating', 'Bottoms',
       'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
      dtype='object')


In [20]:
#transform review_date to date-time data
reviews['date'] = pd.to_datetime(reviews.review_date)

#print review_date data type 
print(reviews['date'].dtype)
reviews['year'] = reviews.date.dt.year

datetime64[ns]


In [21]:
#get numerical columns
reviews = reviews[['clothing_id', 'age', 'recommended', 'rating', 'Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend']].copy()

#reset index
reviews = reviews.set_index('clothing_id')
print(reviews.info())

#instantiate standard scaler
scaler = StandardScaler()
#fit transform data
result = scaler.fit_transform(reviews)
print(result)

<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 1095 to 850
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   age          5000 non-null   int64
 1   recommended  5000 non-null   int64
 2   rating       5000 non-null   int64
 3   Bottoms      5000 non-null   bool 
 4   Dresses      5000 non-null   bool 
 5   Intimate     5000 non-null   bool 
 6   Jackets      5000 non-null   bool 
 7   Tops         5000 non-null   bool 
 8   Trend        5000 non-null   bool 
dtypes: bool(6), int64(3)
memory usage: 185.5 KB
None
[[-0.34814459  0.44742824 -0.1896478  ... -0.21656679 -0.88496718
  -0.07504356]
 [-1.24475223  0.44742824  0.71602461 ... -0.21656679 -0.88496718
  -0.07504356]
 [-0.51116416  0.44742824  0.71602461 ... -0.21656679 -0.88496718
  -0.07504356]
 ...
 [-0.59267395  0.44742824  0.71602461 ... -0.21656679 -0.88496718
  -0.07504356]
 [-1.24475223  0.44742824  0.71602461 ... -0.21656679 -0.88496718
  -0.075