In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
import joblib as joblib

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
dataSet=pd.read_csv('dataSet.csv').iloc[:,1:]
dataSet

Unnamed: 0,Main Category,Category,Product name,Brand,Rating,Price,Link
0,Makeup,Face,Kylighter,KYLIE COSMETICS,3.6,$8.00,https://www.ulta.com/p/kylighter-pimprod2007188
1,Makeup,Face,Shape Tape Concealer,Tarte,4.5,$27.00,https://www.ulta.com/p/shape-tape-concealer-xl...
2,Makeup,Face,Skin Concealer,KYLIE COSMETICS,3.7,$5.00,https://www.ulta.com/p/skin-concealer-pimprod2...
3,Makeup,Face,CC+ Cream with SPF 50+,It Cosmetics,4.3,$39.50,https://www.ulta.com/p/cc-cream-with-spf-50-xl...
4,Makeup,Face,Double Wear Stay-in-Place Makeup,Estée Lauder,4.6,$43.00,https://www.ulta.com/p/double-wear-stay-in-pla...
...,...,...,...,...,...,...,...
15576,Men,Hair,Cedarwood Grooming Paste,Every Man Jack,0.0,$9.99,https://www.ulta.com/p/cedarwood-grooming-past...
15577,Men,Hair,Daily Hydrator Hair Styling Cream,Frederick Benjamin,5.0,$12.00,https://www.ulta.com/p/daily-hydrator-hair-sty...
15578,Men,Gifts & Value Sets,Wicked Beard Trio,Billy Jealousy,4.5,$24.00,https://www.ulta.com/p/wicked-beard-trio-xlsIm...
15579,Men,Gifts & Value Sets,Deluxe Grooming Kit,American Crew,0.0,$23.50,https://www.ulta.com/p/deluxe-grooming-kit-pim...


For training the model, we focused in specificity categories and drop the rest

In [3]:
df_copy=dataSet.copy().drop(columns=['Link','Product name'])

For the "Brand" column, we reduced from 611 brands to 10 leading brands and kept the other brands under the "Others" label. 

In [4]:
fregBrands=dataSet['Brand'].value_counts()[:10].sort_index(ascending=True)
tmp=[]

for i, row in dataSet.iterrows():
    if row['Brand'] in fregBrands: tmp.append(row['Brand']) 
    else: tmp.append('others')
        
df_copy['Brand']=tmp

In this part we will need to replace the string columns with int columns. 

For 'Main Category','Brand' and 'Category' columns we will use replace function for convert values

In [5]:
tmp=[]
for i,row in dataSet.iterrows():
    tmp.append(row['Price'].strip("$"))

df_copy['Price'] = tmp
df_copy['Price'] = pd.to_numeric(df_copy['Price'],errors='coerce')
df_copy=df_copy.dropna()

In [6]:
Brand_replaceMap={'Clinique':1,'Hempz':2,'Morphe':3,'NYX Professional Makeup':4,'Pacifica':5,'Paul Mitchell':6,'Redken':7,
            'Tarte':8,'ULTA':9,'e.l.f. Cosmetics':10,'others':11}
df_copy.replace(Brand_replaceMap,inplace=True)

In [7]:
mainCategory_replaceMap={'Makeup':1,'Nails':2,'Skin Care':3,'Hair':4,'Fragrance':5,'Bath & Body':6,'Men':7}
df_copy.replace(mainCategory_replaceMap,inplace=True)

In [8]:
Category_replaceMap={"Face":1,"Shampoo & Conditioner":2,"Treatment & Serums":5,"Styling Products":6,"Cleansers":7,
 "Lips":8,"Treatment":9,"Women's Fragrance":10,"Makeup Brushes & Tools":11,"Bath & Shower":12,"Suncare":13,"Body Moisturizers":14,
 "Hair Styling Tools":15,"Eye Treatments":16,"Eye Treatments":17,"Accessories":18,"Gifts & Value Sets":19,"Travel Size":20,
 "Hair Brushes & Combs":21,"Hand & Foot Care":22,"Men's Fragrance":23,"Nail Care":24,"Makeup Palettes":25,"Nail Polish":26,
 "Makeup Bags & Organizers":27, "Bath & Body Accessories":28, "Manicure & Pedicure Tools":29, "Hair Color":30, "Shaving":31,
 "Skincare Tools":32, "Press On Nails":33, "Nail Polish Stickers":34, "Supplements":35,"Candles & Home Fragrance":36,"Hair":37,
 "Top & Base Coats":38,"Global Skin Care":39,"Fragrance Gift Sets":40, "ULTA Collection":41,"Body Care":42,"Body Makeup":43,
 "Gel Nail Polish":44,"Skin Care":45, "Self Care & Wellness":46,"Mother & Baby":47,"Kid's Haircare":48, "Nail Art & Design":49,
                     "Eyes":50,"Moisturizers":51}
df_copy.replace(Category_replaceMap,inplace=True)

For the prediction column, the 'Rating' column- We reduced the rating range between 1 to 5 as integers

In [9]:
bins = [0,1,2,3,4,5]
labels = [1,2,3,4,5]
df_copy['Rating'] = pd.cut(df_copy['Rating'], bins, labels=labels)
df_copy=df_copy.dropna()
df_copy

Unnamed: 0,Main Category,Category,Brand,Rating,Price
0,1,1,11,4,8.00
1,1,1,8,5,27.00
2,1,1,11,4,5.00
3,1,1,11,5,39.50
4,1,1,11,5,43.00
...,...,...,...,...,...
15574,7,4,7,5,16.00
15575,7,4,7,5,8.00
15577,7,4,11,5,12.00
15578,7,19,11,5,24.00


The machine learning algorithm used is "KNN"

In [10]:
X = df_copy.drop(columns=['Rating'])
y= pd.Series(df_copy['Rating'])

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=40, test_size=0.2)

scaler = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler.fit_transform(X_train)   

# set up the model
k = 3
clf = KNeighborsClassifier(n_neighbors=k)
model=clf.fit(X_train_scaled, y_train)

In [11]:
y_pred=model.predict(X_test)
print('Accuracy = ', metrics.accuracy_score(y_true = y_test, y_pred = y_pred))

Accuracy =  0.7285764253235397


We saving the model into 'MLmodel.csv' file by using joblib.dump function

In [12]:
joblib.dump(model,'MLmodel.csv')

['MLmodel.csv']

Machine learning test: (you can test it by yourself)

In [13]:
str(input("Please enter some details about the product, for continue someting:"))
Main_Category = str(input("Main Category: "))
Category = str(input("Category: "))
Brand = str(input("Brand: "))
Price = str(input("Price: "))

Please enter some details about the product, for continue someting:Ok
Main Category: Makeup
Category: Face
Brand: Terte
Price: 27


In [14]:
try:
    if Brand in Brand_replaceMap: Brand=Brand_replaceMap[Brand] 
    else: Brand=Brand_replaceMap['others'] 
    Main_Category=mainCategory_replaceMap[Main_Category]
    Category=Category_replaceMap[Category]
    Price=int(Price.strip("$"))
except:
    print("ERROR")

In [15]:
X_text=[(Main_Category,Category,Brand,Price)]

y_pred= model.predict(X_test)[0]
print("The model predict rating of: ",y_pred)

The model predict rating of:  5
