In [1]:
# ! pip install --upgrade scikit-learn --user
# ! pip install missingno --user

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Data handling
import numpy as np
import pandas as pd
import math

# Modelling Algorithms
## Classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

## Regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, ElasticNet
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, AdaBoostRegressor 
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

# Modelling Helpers
from sklearn.impute import SimpleImputer as Imputer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, Normalizer, scale

# Evaluation metrics
## Regression
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score, mean_absolute_error 

## Classification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score  

# Visualization
# import matplotlib as mpl
# import matplotlib.pyplot as plt
# import matplotlib.pylab as pylab
# import seaborn as sns
# import missingno as msno
# from IPython.core.display import HTML

# Import the data
df = pd.read_csv('CSV/diamonds.csv')
diamonds = df.copy()

# Format data / prepare for use
## Drop Unnamed:0 column
diamonds = diamonds.drop(columns="Unnamed: 0")

# Remove rows with dimensions of '0'
diamonds = diamonds[(diamonds[['x','y','z']] != 0).all(axis=1)]

# Create new column called Volume
diamonds['volume'] = diamonds['x']*diamonds['y']*diamonds['z']

# Scale and Train
diamonds.drop(['x','y','z'], axis=1, inplace=True)
label_cut = LabelEncoder()
label_color = LabelEncoder()
label_clarity = LabelEncoder()

diamonds['cut'] = label_cut.fit_transform(diamonds['cut'])
diamonds['color'] = label_color.fit_transform(diamonds['color'])
diamonds['clarity'] = label_clarity.fit_transform(diamonds['clarity'])

# Split the data into train and test.
X = diamonds.drop(['price'], axis=1)
y = diamonds['price']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=66)

# Applying Feature Scaling ( StandardScaler )
# You can also Apply MinMaxScaler.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Collect all R2 Scores.
R2_Scores = []
models = ['Linear Regression', 'Lasso Regression', 'AdaBoost Regression', 
          'Ridge Regression', 'GradientBoosting Regression',
          'RandomForest Regression', 'KNeighbours Regression']


<h2>Regression Models</h2>

In [2]:
def return_r2(y_test, y_pred):
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred)**0.5
    r2 = r2_score(y_test, y_pred)
    return r2, mse, mae, rmse

""" Linear Regression """
clf_lr = LinearRegression()
clf_lr.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_lr, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_lr.predict(X_test)

# Use function to return r2
r2, mse, mae, rmse = return_r2(y_test, y_pred)

# Append to R2_Scores
R2_Scores.append(r2)

""" Lasso Regression """
clf_la = Lasso()
clf_la.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_la, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_la.predict(X_test)

# Use function to return r2
r2, mse, mae, rmse = return_r2(y_test, y_pred)

# Append to R2_Scores
R2_Scores.append(r2)

""" AdaBoostRegressor """
clf_ar = AdaBoostRegressor(n_estimators=1000)
clf_ar.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_ar, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_ar.predict(X_test)

# Use function to return r2
r2, mse, mae, rmse = return_r2(y_test, y_pred)

# Append to R2_Scores
R2_Scores.append(r2)

""" Ridge Regression """
clf_rr = Ridge()
clf_rr.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_rr, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_rr.predict(X_test)

# Use function to return r2
r2, mse, mae, rmse = return_r2(y_test, y_pred)

# Append to R2_Scores
R2_Scores.append(r2)

""" Gradient Boosting Regression """
clf_gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='squared_error',verbose = 1)
clf_gbr.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_gbr, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_gbr.predict(X_test)

# Use function to return r2
r2, mse, mae, rmse = return_r2(y_test, y_pred)

# Append to R2_Scores
R2_Scores.append(r2)

""" Random Forest """
clf_rf = RandomForestRegressor()
clf_rf.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_rf, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_rf.predict(X_test)

# Fine Tune Random Forest
no_of_test=[100]
params_dict={'n_estimators':no_of_test,'n_jobs':[-1],'max_features':["auto",'sqrt','log2']}
clf_rf=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params_dict,scoring='r2')
clf_rf.fit(X_train,y_train)

pred=clf_rf.predict(X_test)

# Use function to return r2
r2, mse, mae, rmse = return_r2(y_test, pred)

# Append to R2_Scores
R2_Scores.append(r2)

""" KNeighbors Regression """
clf_knn = KNeighborsRegressor()
clf_knn.fit(X_train , y_train)
accuracies = cross_val_score(estimator = clf_knn, X = X_train, y = y_train, cv = 5,verbose = 1)
y_pred = clf_knn.predict(X_test)

# Fine Tune KNeighbors
n_neighbors=[]
for i in range (0,50,5):
    if(i!=0):
        n_neighbors.append(i)
params_dict={'n_neighbors':n_neighbors,'n_jobs':[-1]}
clf_knn=GridSearchCV(estimator=KNeighborsRegressor(),param_grid=params_dict,scoring='r2')
clf_knn.fit(X_train,y_train)

pred=clf_knn.predict(X_test)

# Use function to return r2
r2, mse, mae, rmse = return_r2(y_test, pred)

# Append to R2_Scores
R2_Scores.append(r2)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


      Iter       Train Loss   Remaining Time 
         1    14009477.5296            1.19s
         2    12437807.7359            1.08s
         3    11113339.5845            1.03s
         4     9945244.2308            1.01s
         5     8973416.9156            1.01s
         6     8109014.7842            0.99s
         7     7387120.0500            0.97s
         8     6753937.9878            0.97s
         9     6197182.6819            0.95s
        10     5724689.0901            0.94s
        20     3200362.4597            0.84s
        30     2393542.3170            0.73s
        40     2102586.3335            0.62s
        50     1923964.9187            0.52s
        60     1790574.6006            0.41s
        70     1688380.2826            0.31s
        80     1609829.0076            0.21s
        90     1548089.0039            0.10s
       100     1499127.4566            0.00s
      Iter       Train Loss   Remaining Time 
         1    13994442.1962            0.89s
        

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


        20     3206145.1577            0.67s
        30     2394369.2846            0.60s
        40     2101114.6326            0.51s
        50     1921108.4005            0.42s
        60     1785959.4111            0.34s
        70     1683385.7302            0.26s
        80     1604163.5538            0.17s
        90     1542370.2912            0.09s
       100     1493476.7608            0.00s
      Iter       Train Loss   Remaining Time 
         1    14044115.9884            0.89s
         2    12472837.6750            0.88s
         3    11137657.6396            0.84s
         4     9974212.6419            0.82s
         5     8994369.5031            0.82s
         6     8133396.8459            0.81s
         7     7407925.9669            0.80s
         8     6764110.5537            0.79s
         9     6215416.1793            0.79s
        10     5736700.1166            0.77s
        20     3210108.0310            0.68s
        30     2402276.2056            0.60s
        4

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   48.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.1s finished


In [3]:
# Evaluate
# Visualizing R2-Score of Algorithms
compare = pd.DataFrame({'Algorithms' : models , 'R2-Scores' : R2_Scores})
compare.sort_values(by='R2-Scores' ,ascending=False)

Unnamed: 0,Algorithms,R2-Scores
5,RandomForest Regression,0.978909
6,KNeighbours Regression,0.959028
2,AdaBoost Regression,0.906078
4,GradientBoosting Regression,0.905833
3,Ridge Regression,0.881433
0,Linear Regression,0.881432
1,Lasso Regression,0.881431
