In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, BaggingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVR, SVC
from sklearn.metrics import mean_squared_error, f1_score
import numpy as np
from sklearn.metrics import r2_score

In [2]:
products = pd.read_csv('./data/products_clean_eda.csv')
products.head(3)

Unnamed: 0,name,brand,category,price,ingredients,no_reviews,hearts,size1,size2,url,final_size,price_per_ounce
0,Protini™ Polypeptide Moisturizer,Drunk Elephant,moisturizing-cream-oils-mists,68.0,"Dicaprylyl Carbonate, Glycerin, Cetearyl Alcoh...",3000,216935,1.69,0.0,https://www.sephora.com/product/protini-tm-pol...,1.69,40.236686
1,The Water Cream,Tatcha,moisturizing-cream-oils-mists,68.0,"Dicaprylyl Carbonate, Glycerin, Cetearyl Alcoh...",2000,197492,0.0,1.7,https://www.sephora.com/product/the-water-crea...,1.7,40.0
2,Ultra Facial Cream,Kiehl's Since 1851,moisturizing-cream-oils-mists,32.0,"Aqua, Cyclohexasiloxane, Squalane, BisPEG18 Me...",943,87617,0.0,1.7,https://www.sephora.com/product/ultra-facial-c...,1.7,18.823529


## Regression

### CountVectorizer

In [3]:
X = products['ingredients'] #setting our X variable

y = products['price_per_ounce'] #our y variable is price_per_ounce

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42,  test_size=.33)

In [4]:
#CountVactorizer splits the ingredients by comma and not in to individual words
cvec = CountVectorizer(tokenizer=lambda x: x.split(', '))
#training CountVectorizer and transforming
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [5]:
#instantiating models
lr = LinearRegression()
knn = KNeighborsRegressor()
dt = DecisionTreeRegressor(random_state=42)
bdt = BaggingRegressor(random_state=42)
rf = RandomForestRegressor(n_estimators = 10)
abr = AdaBoostRegressor()
svr = SVR(gamma = 'scale')

models = {
    'Linear Regression': lr,
    'KNN': knn, 
    'Decision Tree': dt, 
    'Bagging': bdt, 
    'Random Forest': rf, 
    'AdaBoost': abr,
    'SVR': svr
}
#list of model names
model_names = ['Linear Regression', 'KNN', 'Decision Tree', 'Bagging', 'Random Forest', 'AdaBoost', 'SVR']
#creating a data frame that rmse scores will go in to
rmse_all_cvec = pd.DataFrame(index = model_names, columns = ['Train', 'Test', 'Train R^2', 'Test R^2'])
#fitting the models and calculating their rmse
for x in models:
    models[x].fit(X_train_cvec, y_train) #fitting the model
    y_train_preds = models[x].predict(X_train_cvec) #prediction for train
    y_test_preds = models[x].predict(X_test_cvec) #prediction for test
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_preds)) #rmse for train
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_preds)) #rmse for test
    rmse_all_cvec.loc[x, 'Train'] = rmse_train #updating the dataframe
    rmse_all_cvec.loc[x, 'Test'] = rmse_test
    rmse_all_cvec.loc[x, 'Train R^2'] = models[x].score(X_train_cvec, y_train)
    rmse_all_cvec.loc[x, 'Test R^2'] = models[x].score(X_test_cvec, y_test)
    
rmse_all_cvec

Unnamed: 0,Train,Test,Train R^2,Test R^2
Linear Regression,33.4644,94.3517,0.814715,-0.766772
KNN,66.4481,73.7517,0.26947,-0.0795064
Decision Tree,33.4227,84.807,0.815178,-0.427397
Bagging,41.4883,67.2987,0.71521,0.101135
Random Forest,40.1477,72.6716,0.733317,-0.0481196
AdaBoost,75.1839,81.5677,0.0647611,-0.320436
SVR,80.4214,72.8766,-0.0700781,-0.0540414


### TfidfVectorizer

In [6]:
X = products['ingredients'] #setting our X variable

y = products['price_per_ounce'] #our y variable is price_per_ounce

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42,  test_size=.33)

In [7]:
tfidf = TfidfVectorizer(tokenizer=lambda x: x.split(', '))

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [8]:
#instantiating models
lr = LinearRegression()
knn = KNeighborsRegressor()
dt = DecisionTreeRegressor(random_state=42)
bdt = BaggingRegressor(random_state=42)
rf = RandomForestRegressor(n_estimators = 10)
abr = AdaBoostRegressor()
svr = SVR(gamma = 'scale')

models = {
    'Linear Regression': lr,
    'KNN': knn, 
    'Decision Tree': dt, 
    'Bagging': bdt, 
    'Random Forest': rf, 
    'AdaBoost': abr,
    'SVR': svr
}
#list of model names
model_names = ['Linear Regression', 'KNN', 'Decision Tree', 'Bagging', 'Random Forest', 'AdaBoost', 'SVR']
#creating a data frame that rmse scores will go in to
rmse_all_tfidf = pd.DataFrame(index = model_names, columns = ['Train', 'Test', 'Train R^2', 'Test R^2'])
#fitting the models and calculating their rmse
for x in models:
    models[x].fit(X_train_tfidf, y_train) #fitting the model
    y_train_preds = models[x].predict(X_train_tfidf) #prediction for train
    y_test_preds = models[x].predict(X_test_tfidf) #prediction for test
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_preds)) #rmse for train
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_preds)) #rmse for test
    rmse_all_tfidf.loc[x, 'Train'] = rmse_train #updating the dataframe
    rmse_all_tfidf.loc[x, 'Test'] = rmse_test
    rmse_all_tfidf.loc[x, 'Train R^2'] = models[x].score(X_train_tfidf, y_train)
    rmse_all_tfidf.loc[x, 'Test R^2'] = models[x].score(X_test_tfidf, y_test)
    
rmse_all_tfidf

Unnamed: 0,Train,Test,Train R^2,Test R^2
Linear Regression,33.4227,1158.2,0.815178,-265.226
KNN,60.6398,70.6506,0.391601,0.00936611
Decision Tree,33.4227,86.3226,0.815178,-0.47887
Bagging,41.605,68.7862,0.713607,0.0609601
Random Forest,42.0965,68.7207,0.7068,0.0627472
AdaBoost,78.6702,85.7308,-0.023984,-0.458662
SVR,80.6641,73.2724,-0.076547,-0.0655211


## Classification

In [9]:
#creating a classification varible 'class' to use as our y variable
for i in products.index:
    if products.loc[i, 'price_per_ounce'] < 25:
        products.loc[i, 'class'] = 0
    elif ((products.loc[i, 'price_per_ounce'] >= 25) & (products.loc[i, 'price_per_ounce'] < 50)):
        products.loc[i, 'class'] = 1
    elif ((products.loc[i, 'price_per_ounce'] >= 50) & (products.loc[i, 'price_per_ounce'] < 100)):
        products.loc[i, 'class'] = 2
#     elif ((products.loc[i, 'price_per_ounce'] >= 100) & (products.loc[i, 'price_per_ounce'] < 150)):
#         products.loc[i, 'class'] = 3
    else:
        products.loc[i, 'class'] = 3

In [10]:
products['class'] = products['class'].astype(int)
products.head(3)

Unnamed: 0,name,brand,category,price,ingredients,no_reviews,hearts,size1,size2,url,final_size,price_per_ounce,class
0,Protini™ Polypeptide Moisturizer,Drunk Elephant,moisturizing-cream-oils-mists,68.0,"Dicaprylyl Carbonate, Glycerin, Cetearyl Alcoh...",3000,216935,1.69,0.0,https://www.sephora.com/product/protini-tm-pol...,1.69,40.236686,1
1,The Water Cream,Tatcha,moisturizing-cream-oils-mists,68.0,"Dicaprylyl Carbonate, Glycerin, Cetearyl Alcoh...",2000,197492,0.0,1.7,https://www.sephora.com/product/the-water-crea...,1.7,40.0,1
2,Ultra Facial Cream,Kiehl's Since 1851,moisturizing-cream-oils-mists,32.0,"Aqua, Cyclohexasiloxane, Squalane, BisPEG18 Me...",943,87617,0.0,1.7,https://www.sephora.com/product/ultra-facial-c...,1.7,18.823529,0


In [11]:
#baseline
products['class'].value_counts(normalize = True)

0    0.445258
1    0.225885
2    0.186813
3    0.142043
Name: class, dtype: float64

### CountVectorizer

In [12]:
X = products['ingredients'] #setting our X variable

y = products['class'] #our y variable is price_per_ounce

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42,  test_size=.33, stratify = y)

In [13]:
#CountVactorizer splits the ingredients by comma and not in to individual words
cvec = CountVectorizer(tokenizer=lambda x: x.split(', '))
#training CountVectorizer and transforming
X_train_cvec = cvec.fit_transform(X_train)
X_test_cvec = cvec.transform(X_test)

In [14]:
#instantiating the models
logreg = LogisticRegression(solver = 'lbfgs', penalty = 'none', multi_class = 'multinomial', max_iter = 1000)
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(random_state=42)
bdt = BaggingClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=10)
ada = AdaBoostClassifier()
svc = SVC(gamma = 'scale')


#dictionary of model names and their instantiated name
models = {
    'Logistic Regression': logreg,
    'KNN': knn, 
    'Decision Tree': dt, 
    'Bagging': bdt, 
    'Random Forest': rf, 
    'AdaBoost': ada,
    'SVC': svc
}
#list of model names
model_names = ['Logistic Regression', 'KNN', 'Decision Tree', 'Bagging', 'Random Forest', 'AdaBoost', 'SVC']
#creating a data frame that rmse scores will go in to
f1_all = pd.DataFrame(index = model_names, columns = ['Train f1_score', 'Test f1_score', 'Train Accuracy', 'Test Accuracy'])
#fitting the models and calculating their rmse
for x in models:
    models[x].fit(X_train_cvec, y_train) #fitting the model
    y_train_preds = models[x].predict(X_train_cvec) #prediction for train
    y_test_preds = models[x].predict(X_test_cvec) #prediction for test
    f1_train = f1_score(y_train, y_train_preds, average = 'weighted')
    f1_test = f1_score(y_test, y_test_preds, average = 'weighted')
    #f1_train = 1
    #f1_test = 1
    f1_all.loc[x, 'Train f1_score'] = f1_train #updating the dataframe
    f1_all.loc[x, 'Test f1_score'] = f1_test
    f1_all.loc[x, 'Train Accuracy'] = models[x].score(X_train_cvec, y_train)
    f1_all.loc[x, 'Test Accuracy'] = models[x].score(X_test_cvec, y_test)
f1_all

Unnamed: 0,Train f1_score,Test f1_score,Train Accuracy,Test Accuracy
Logistic Regression,0.87922,0.491651,0.879101,0.493218
KNN,0.522943,0.359809,0.549818,0.385943
Decision Tree,0.876861,0.442332,0.879101,0.448829
Bagging,0.858366,0.46976,0.858445,0.489519
Random Forest,0.866709,0.465346,0.867558,0.493218
AdaBoost,0.545089,0.447762,0.576549,0.490752
SVC,0.771275,0.45593,0.780073,0.508015


### TfidfVectorizer

In [15]:
X = products['ingredients'] #setting our X variable

y = products['class'] #our y variable is price_per_ounce

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42,  test_size=.33)

In [16]:
tfidf = TfidfVectorizer(tokenizer=lambda x: x.split(', '))

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [17]:
#instantiating the models
logreg = LogisticRegression(solver = 'lbfgs', penalty = 'none', multi_class = 'multinomial', max_iter = 1000)
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(random_state=42)
bdt = BaggingClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=10)
ada = AdaBoostClassifier()
svc = SVC(gamma = 'scale')


#dictionary of model names and their instantiated name
models = {
    'Logistic Regression': logreg,
    'KNN': knn, 
    'Decision Tree': dt, 
    'Bagging': bdt, 
    'Random Forest': rf, 
    'AdaBoost': ada,
    'SVC': svc
}
#list of model names
model_names = ['Logistic Regression', 'KNN', 'Decision Tree', 'Bagging', 'Random Forest', 'AdaBoost', 'SVC']
#creating a data frame that rmse scores will go in to
f1_all = pd.DataFrame(index = model_names, columns = ['Train f1_score', 'Test f1_score', 'Train Accuracy', 'Test Accuracy'])
#fitting the models and calculating their rmse
for x in models:
    models[x].fit(X_train_tfidf, y_train) #fitting the model
    y_train_preds = models[x].predict(X_train_tfidf) #prediction for train
    y_test_preds = models[x].predict(X_test_tfidf) #prediction for test
    f1_train = f1_score(y_train, y_train_preds, average = 'weighted')
    f1_test = f1_score(y_test, y_test_preds, average = 'weighted')
    #f1_train = 1
    #f1_test = 1
    f1_all.loc[x, 'Train f1_score'] = f1_train #updating the dataframe
    f1_all.loc[x, 'Test f1_score'] = f1_test
    f1_all.loc[x, 'Train Accuracy'] = models[x].score(X_train_tfidf, y_train)
    f1_all.loc[x, 'Test Accuracy'] = models[x].score(X_test_tfidf, y_test)
f1_all

Unnamed: 0,Train f1_score,Test f1_score,Train Accuracy,Test Accuracy
Logistic Regression,0.883046,0.484287,0.882746,0.487053
KNN,0.628428,0.471725,0.6452,0.480888
Decision Tree,0.880714,0.443324,0.882746,0.454994
Bagging,0.86349,0.471703,0.86452,0.493218
Random Forest,0.869214,0.472788,0.869988,0.49815
AdaBoost,0.523335,0.465053,0.554678,0.500617
SVC,0.861563,0.483665,0.863913,0.525277


## Classification to Regression

In [18]:
class_mean = products.groupby('class')['price_per_ounce'].mean()
class_mean

class
0     10.554055
1     35.503550
2     71.792020
3    191.631568
Name: price_per_ounce, dtype: float64

In [19]:
X = products['ingredients'] #setting our X variable

y = products['class'] #our y variable is price_per_ounce

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42,  test_size=.33)

#splitting up the regression y values
y_train_reg = products.loc[y_train.index, 'price_per_ounce']
y_test_reg = products.loc[y_test.index, 'price_per_ounce']

#initializing Tfidf Vectorizer
tfidf = TfidfVectorizer(tokenizer=lambda x: x.split(', '))

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [20]:
#instantiating the models
logreg = LogisticRegression(solver = 'lbfgs', penalty = 'none', multi_class = 'multinomial', max_iter = 1000)
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(random_state=42)
bdt = BaggingClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=10)
ada = AdaBoostClassifier()
svc = SVC(gamma = 'scale')


#dictionary of model names and their instantiated name
models = {
    'Logistic Regression': logreg,
    'KNN': knn, 
    'Decision Tree': dt, 
    'Bagging': bdt, 
    'Random Forest': rf, 
    'AdaBoost': ada,
    'SVC': svc
}
#list of model names
model_names = ['Logistic Regression', 'KNN', 'Decision Tree', 'Bagging', 'Random Forest', 'AdaBoost', 'SVC']
#creating a data frame that rmse scores will go in to
class_to_reg_all = pd.DataFrame(index = model_names, columns = ['Train f1_score',
                                                                'Test f1_score',
                                                                'Train Accuracy',
                                                                'Test Accuracy',
                                                               'RMSE Train score',
                                                               'RMSE Test score',
                                                               'Train R^2',
                                                               'Test R^2'])
#fitting the models and calculating their rmse
for x in models:
    models[x].fit(X_train_tfidf, y_train) #fitting the model
    y_train_preds = models[x].predict(X_train_tfidf) #prediction for train
    y_test_preds = models[x].predict(X_test_tfidf) #prediction for test
    f1_train = f1_score(y_train, y_train_preds, average = 'weighted')
    f1_test = f1_score(y_test, y_test_preds, average = 'weighted')
    class_to_reg_all.loc[x, 'Train f1_score'] = f1_train #updating the dataframe
    class_to_reg_all.loc[x, 'Test f1_score'] = f1_test
    class_to_reg_all.loc[x, 'Train Accuracy'] = models[x].score(X_train_tfidf, y_train)
    class_to_reg_all.loc[x, 'Test Accuracy'] = models[x].score(X_test_tfidf, y_test)
    
    #converting to regression
    #train variables
    y_train_preds = y_train_preds.astype(float)
    for index, i in enumerate(y_train_preds):
        if i == 0:
            y_train_preds[index] = class_mean[0]
        elif i == 1:
            y_train_preds[index] = class_mean[1]
        elif i == 2:
            y_train_preds[index] = class_mean[2]
        else:
            y_train_preds[index] = class_mean[3]
            
    #test variables
    y_test_preds = y_test_preds.astype(float)
    for index, i in enumerate(y_test_preds):
        if i == 0:
            y_test_preds[index] = class_mean[0]
        elif i == 1:
            y_test_preds[index] = class_mean[1]
        elif i == 2:
            y_test_preds[index] = class_mean[2]
        else:
            y_test_preds[index] = class_mean[3]
             
    class_to_reg_all.loc[x, 'RMSE Train score'] = np.sqrt(mean_squared_error(y_train_reg, y_train_preds))
    class_to_reg_all.loc[x, 'RMSE Test score'] = np.sqrt(mean_squared_error(y_test_reg, y_test_preds))
    class_to_reg_all.loc[x, "Train R^2"] = r2_score(y_train_reg, y_train_preds)
    class_to_reg_all.loc[x, 'Test R^2'] = r2_score(y_test_reg, y_test_preds)
    
class_to_reg_all

Unnamed: 0,Train f1_score,Test f1_score,Train Accuracy,Test Accuracy,RMSE Train score,RMSE Test score,Train R^2,Test R^2
Logistic Regression,0.883046,0.484287,0.882746,0.487053,57.484,75.3706,0.453278,-0.127419
KNN,0.628428,0.471725,0.6452,0.480888,71.1058,74.5147,0.163468,-0.101958
Decision Tree,0.880714,0.443324,0.882746,0.454994,57.5388,77.0081,0.452236,-0.176937
Bagging,0.86349,0.471703,0.86452,0.493218,58.8081,76.4536,0.427801,-0.16005
Random Forest,0.869367,0.467427,0.869988,0.496917,57.816,72.4241,0.446944,-0.0409926
AdaBoost,0.523335,0.465053,0.554678,0.500617,73.1709,73.8924,0.114172,-0.0836301
SVC,0.861563,0.483665,0.863913,0.525277,57.7854,73.7815,0.44753,-0.0803783


In [21]:
rmse_all_cvec

Unnamed: 0,Train,Test,Train R^2,Test R^2
Linear Regression,33.4644,94.3517,0.814715,-0.766772
KNN,66.4481,73.7517,0.26947,-0.0795064
Decision Tree,33.4227,84.807,0.815178,-0.427397
Bagging,41.4883,67.2987,0.71521,0.101135
Random Forest,40.1477,72.6716,0.733317,-0.0481196
AdaBoost,75.1839,81.5677,0.0647611,-0.320436
SVR,80.4214,72.8766,-0.0700781,-0.0540414
