# Model Training

In [24]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# from exception import CustomException
import sys

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  ( StandardScaler , OrdinalEncoder , OneHotEncoder , LabelEncoder)
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import warnings

In [25]:
df = pd.read_csv('data/diamonds.csv')
df.drop(columns='Unnamed: 0',inplace=True)

In [26]:
df.duplicated().sum()
df.drop_duplicates(inplace=True)
#Dropping dimentionless diamonds
df = df.drop(df[df["x"]==0].index)
df = df.drop(df[df["y"]==0].index)
df = df.drop(df[df["z"]==0].index)
df.shape

(53775, 10)

In [27]:
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [28]:
sample_df = df.sample(5605)

In [29]:
# segregating dependent and independent features
X = sample_df.drop(columns='price')
y = sample_df['price']
X,y

(       carat        cut color clarity  depth  table     x     y     z
 14984   1.01       Good     F     VS2   64.1   59.0  6.16  6.23  3.97
 21978   1.00  Very Good     F      IF   63.2   63.0  6.26  6.24  3.95
 47591   0.51  Very Good     D     VS2   60.1   59.0  5.16  5.19  3.11
 19353   0.40      Ideal     G     SI2   61.9   54.0  4.75  4.80  2.94
 32032   0.44    Premium     E     SI1   62.3   58.0  4.85  4.88  3.03
 ...      ...        ...   ...     ...    ...    ...   ...   ...   ...
 20389   0.31    Premium     F     VS2   62.4   58.0  4.31  4.34  2.70
 24990   2.33  Very Good     H     SI2   63.2   56.0  8.44  8.35  5.31
 44970   0.32      Ideal     H     VS2   60.8   56.0  4.46  4.49  2.72
 6905    1.04      Ideal     I     SI1   63.0   56.0  6.49  6.39  4.06
 25648   2.51    Premium     H     SI2   59.3   61.0  8.81  8.77  5.21
 
 [5605 rows x 9 columns],
 14984     6037
 21978    10046
 47591     1877
 19353      622
 32032      778
          ...  
 20389      625
 24990  

In [30]:
X


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
14984,1.01,Good,F,VS2,64.1,59.0,6.16,6.23,3.97
21978,1.00,Very Good,F,IF,63.2,63.0,6.26,6.24,3.95
47591,0.51,Very Good,D,VS2,60.1,59.0,5.16,5.19,3.11
19353,0.40,Ideal,G,SI2,61.9,54.0,4.75,4.80,2.94
32032,0.44,Premium,E,SI1,62.3,58.0,4.85,4.88,3.03
...,...,...,...,...,...,...,...,...,...
20389,0.31,Premium,F,VS2,62.4,58.0,4.31,4.34,2.70
24990,2.33,Very Good,H,SI2,63.2,56.0,8.44,8.35,5.31
44970,0.32,Ideal,H,VS2,60.8,56.0,4.46,4.49,2.72
6905,1.04,Ideal,I,SI1,63.0,56.0,6.49,6.39,4.06


In [7]:
num_cols = X.select_dtypes(exclude='object').columns
ordinal_cols = X[['cut','clarity']]
nominal_col = X.color
num_cols

Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')

In [8]:
num_pipeline = Pipeline(
    steps=[
    ('standardScaler',StandardScaler()),
    ('imputer',SimpleImputer(strategy='median'))
]
)

In [9]:
preprocessor_col = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_cols),
        ('cut_col_pipeline', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ordinal_encoder', OrdinalEncoder(categories=[['Good', 'Very Good', 'Fair', 'Ideal', 'Premium']]))
        ]), ['cut']),
        ('clarity_col_pipeline', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ordinal_encoder', OrdinalEncoder(categories=[['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']]))
        ]), ['clarity']),
        ('color_col_pipeline', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('ohe', OneHotEncoder())
        ]), ['color'])
    ]
)

X = preprocessor_col.fit_transform(X)



In [10]:
preprocessor_col

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.25, random_state=42)

In [12]:
X_train.shape,y_train.shape,X_test.shape

((4203, 15), (4203,), (1402, 15))

In [13]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "AdaBoost Regressor": AdaBoostRegressor()
}

In [14]:
# Evaluate model function
def evaluate_models(true,predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [15]:
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_models(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_models(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*50)
    print('\n'*2)

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 1185.7272
- Mean Absolute Error: 788.1324
- R2 Score: 0.9128
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1163.4021
- Mean Absolute Error: 751.6337
- R2 Score: 0.9079



Lasso
Model performance for Training set
- Root Mean Squared Error: 1190.9178
- Mean Absolute Error: 786.5715
- R2 Score: 0.9120
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1167.1169
- Mean Absolute Error: 750.2876
- R2 Score: 0.9074



Ridge
Model performance for Training set
- Root Mean Squared Error: 1187.1601
- Mean Absolute Error: 787.6522
- R2 Score: 0.9126
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1164.1660
- Mean Absolute Error: 751.1940
- R2 Score: 0.9078





  model = cd_fast.enet_coordinate_descent(


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 788.3100
- Mean Absolute Error: 440.5150
- R2 Score: 0.9615
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 928.7661
- Mean Absolute Error: 521.7532
- R2 Score: 0.9413



Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 950.3827
- Mean Absolute Error: 476.2475
- R2 Score: 0.9386



Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 256.7318
- Mean Absolute Error: 131.6958
- R2 Score: 0.9959
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 662.5472
- Mean Absolute Error: 337.0676
- R2 Score: 0.9701



XGBRegressor
Model performance for Training set
- Root Mean Squared Error: 155.3351
- Mean Absolute Error: 105

In [16]:
list(zip(model_list,r2_list))

[('Linear Regression', 0.9079454902336906),
 ('Lasso', 0.9073566865650853),
 ('Ridge', 0.9078245671928814),
 ('K-Neighbors Regressor', 0.9413324370186757),
 ('Decision Tree', 0.938569726523452),
 ('Random Forest Regressor', 0.9701448650897903),
 ('XGBRegressor', 0.9694755970970167),
 ('AdaBoost Regressor', 0.9069342010914716)]

In [17]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=['model_name','r2_score']).sort_values(by=['r2_score'],ascending=False)

Unnamed: 0,model_name,r2_score
5,Random Forest Regressor,0.970145
6,XGBRegressor,0.969476
3,K-Neighbors Regressor,0.941332
4,Decision Tree,0.93857
0,Linear Regression,0.907945
2,Ridge,0.907825
1,Lasso,0.907357
7,AdaBoost Regressor,0.906934


In [18]:
from src.utils import evaluate_models as ems

def evaluate_models(X_train,y_train,X_test,y_test,models,param=None) -> dict:

    try:
        report = {}

        for i in range(len(list(models))):
            model = list(models.values())[i]

            model.fit(X_train,y_train)

            y_train_pred = model.predict(X_train)

            y_test_pred = model.predict(X_test)

            # train_model_score = r2_score(y_pred=y_train_pred,y_true=y_train)

            test_model_score = r2_score(y_true=y_test,y_pred=y_test_pred)

            report[list(models.keys())[i]] = test_model_score

        return report
    except Exception as e:
        print(e)
        



In [19]:
best_model = ems(X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test,models=models)

  model = cd_fast.enet_coordinate_descent(


In [20]:
max(sorted(best_model.values()))

0.9696110688670126

In [21]:
max(best_model)

'XGBRegressor'

In [22]:
best_model

{'Linear Regression': 0.9079454902336906,
 'Lasso': 0.9073566865650853,
 'Ridge': 0.9078245671928814,
 'K-Neighbors Regressor': 0.9413324370186757,
 'Decision Tree': 0.9372616122109685,
 'Random Forest Regressor': 0.9696110688670126,
 'XGBRegressor': 0.9694755970970167,
 'AdaBoost Regressor': 0.8976323500692369}