In [None]:
from google.colab import drive, files
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.model_selection import cross_val_score

from sklearn import metrics 


# Shareable Google Drive link to the spec sheet. The file id is the
# second-to-last path segment; it is spliced into a direct-download URL.
url = 'https://drive.google.com/file/d/1CoTEepAbFPR1c1V4xkaYKZB4l85teqQk/view?usp=sharing'
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]

try:
  # The file has no header row and its first column holds the field names,
  # so it is read with index_col=0 and transposed to get one column per field.
  initial_df = pd.read_csv(path, low_memory=False, index_col=0, header=None).T
  # A failed Drive download does not always raise (it may return some other
  # document), so make sure the field the cleaning step needs is present.
  if 'MSRP' not in initial_df.columns:
    raise ValueError("downloaded file has no 'MSRP' field")
except Exception as download_error:
  # `Exception` (not a bare except) so Ctrl-C / kernel interrupts still work.
  print(f"Could not load the data from Google Drive ({download_error!r}); "
        "please upload fullspecs.csv manually.")
  uploaded = files.upload()
  # Read whichever file was actually uploaded (Colab may rename duplicates);
  # fall back to the expected name if the upload dialog returned nothing.
  local_csv = next(iter(uploaded), "fullspecs.csv")
  initial_df = pd.read_csv(local_csv, low_memory=False, index_col=0, header=None).T


# **Cleaning**

In [None]:
# Build the modelling table `df` from the raw specification sheet `initial_df`.
df = pd.DataFrame()

# Price is the Manufacturer's Suggested Retail Price with every non-digit
# character ('$', ',') removed. to_numeric(errors="coerce") turns values that
# end up empty into NaN so they are dropped below instead of crashing the
# float conversion.
df['Price'] = pd.to_numeric(
    initial_df['MSRP'].replace(r'\D+', '', regex=True), errors="coerce")

# Extraction of the Horsepower and the Torque.
# The fields are presumably formatted "<value> @ <rpm>" (as the column names
# indicate), so the figure of interest is the part BEFORE the '@'.
# Unparseable entries become NaN.
df['Horsepower'] = pd.to_numeric(
    initial_df['SAE Net Horsepower @ RPM'].str.split('@').str[0].str.strip(),
    errors="coerce")
df['Torque'] = pd.to_numeric(
    initial_df['SAE Net Torque @ RPM'].str.split('@').str[0].str.strip(),
    errors="coerce")

# Overall height as a number (NaN when it cannot be parsed)
df['Height'] = pd.to_numeric(initial_df['Height, Overall (in)'], errors="coerce")

# Style name (kept for reference only; not used as a model feature)
df['Style'] = initial_df['Style Name']

# Name of every car: the values of the first column of the raw sheet
df['Model'] = initial_df.iloc[:, 0]

# One hot encoding for Body style (df_enc is reused later for its column names)
df_enc = pd.get_dummies(initial_df["Body Style"], prefix="Style")
df = pd.concat([df, df_enc], axis=1)

# Back-up camera as 1/0; any value other than 'Yes'/'No' becomes NaN
df['Camera'] = initial_df['Back-Up Camera'].map({'Yes': 1, 'No': 0})

# Seat and door counts, parsed before filtering so that missing values are
# dropped together with the other incomplete rows instead of breaking the
# integer conversion.
df['Passenger Capacity'] = pd.to_numeric(initial_df['Passenger Capacity'], errors="coerce")
df['Passenger Doors']    = pd.to_numeric(initial_df['Passenger Doors'], errors="coerce")

# Keep only the rows where every required numeric field could be parsed
required_columns = ['Price', 'Horsepower', 'Torque', 'Height',
                    'Passenger Capacity', 'Passenger Doors']
df = df.dropna(subset=required_columns)

# Conversion of each column to the right type
df = df.astype({
    'Price': float,
    'Height': float,
    'Passenger Capacity': int,
    'Passenger Doors': int,
    'Horsepower': int,
    'Torque': int,
})

# Sort by price and renumber the rows from 0
df = df.sort_values('Price').reset_index(drop=True)

# **XGBoost**

## First config

In [None]:
# Predictors used by this configuration; the target is the price
feature = [
    'Passenger Capacity',
    'Passenger Doors',
    'Horsepower',
    'Torque',
    'Camera',
    'Height',
]
X = df[feature]
y = df['Price']

# Hold out 33% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
import xgboost as xgb

# Parameters used for Grid Search.
# Two sub-grids: max_depth and gamma are tree-booster parameters, so crossing
# them with 'gblinear' only repeats identical linear fits.
param_grid = [
    {   # tree-based boosters
        'booster'      : ['gbtree', 'dart'],
        'objective'    : ["reg:squarederror"],
        'n_estimators' : [10, 20],
        'max_depth'    : [25, 30],
        'learning_rate': [0.68, 0.69],
        'gamma'        : [0.95, 1, 1.05]},
    {   # linear booster
        'booster'      : ['gblinear'],
        'objective'    : ["reg:squarederror"],
        'n_estimators' : [10, 20],
        'learning_rate': [0.68, 0.69]},
]

# Creation of our XGB/Grid Model; refit=True retrains the best configuration
# on the whole training set once the search is finished.
rgr = GridSearchCV(xgb.XGBRegressor(), param_grid, refit=True, verbose=3)

# Train It
rgr.fit(X_train, y_train)
# Predict the y (Price) of our cars
y_pred = rgr.predict(X_test)

# Calculate our metrics and print them (RMSE is derived from the MSE)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(rgr.best_params_)
print(f"mse = {mse}")
print(f"rmse = {rmse}")
r2 = metrics.r2_score(y_test, y_pred)
print(f"r2 = {r2}")

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] booster=gbtree, gamma=0.95, learning_rate=0.68, max_depth=25, n_estimators=10, objective=reg:squarederror 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  booster=gbtree, gamma=0.95, learning_rate=0.68, max_depth=25, n_estimators=10, objective=reg:squarederror, score=0.893, total=   0.7s
[CV] booster=gbtree, gamma=0.95, learning_rate=0.68, max_depth=25, n_estimators=10, objective=reg:squarederror 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV]  booster=gbtree, gamma=0.95, learning_rate=0.68, max_depth=25, n_estimators=10, objective=reg:squarederror, score=0.906, total=   0.6s
[CV] booster=gbtree, gamma=0.95, learning_rate=0.68, max_depth=25, n_estimators=10, objective=reg:squarederror 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.3s remaining:    0.0s


[CV]  booster=gbtree, gamma=0.95, learning_rate=0.68, max_depth=25, n_estimators=10, objective=reg:squarederror, score=0.904, total=   0.6s
[CV] booster=gbtree, gamma=0.95, learning_rate=0.68, max_depth=25, n_estimators=10, objective=reg:squarederror 
[CV]  booster=gbtree, gamma=0.95, learning_rate=0.68, max_depth=25, n_estimators=10, objective=reg:squarederror, score=0.933, total=   0.6s
[CV] booster=gbtree, gamma=0.95, learning_rate=0.68, max_depth=25, n_estimators=10, objective=reg:squarederror 
[CV]  booster=gbtree, gamma=0.95, learning_rate=0.68, max_depth=25, n_estimators=10, objective=reg:squarederror, score=0.892, total=   0.6s
[CV] booster=gbtree, gamma=0.95, learning_rate=0.68, max_depth=25, n_estimators=20, objective=reg:squarederror 
[CV]  booster=gbtree, gamma=0.95, learning_rate=0.68, max_depth=25, n_estimators=20, objective=reg:squarederror, score=0.892, total=   1.1s
[CV] booster=gbtree, gamma=0.95, learning_rate=0.68, max_depth=25, n_estimators=20, objective=reg:square

KeyboardInterrupt: ignored

In [None]:
# Evaluate the winning configuration of the grid search on the held-out set.
# The search was created with refit=True, so best_estimator_ is already
# trained on the full training split and does not need to be fitted again.
# NOTE: best_estimator_ only exists once rgr.fit(...) has run to completion.
y_pred = rgr.best_estimator_.predict(X_test)

mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(rgr.best_params_)

print(f"mse = {mse}")
print(f"rmse = {rmse}")
r2 = metrics.r2_score(y_test, y_pred)
print(f"r2 = {r2}")

AttributeError: ignored

## Second config

In [None]:
# Predictors for this configuration: the numeric specs plus every one-hot
# body-style column; the target is the price
feature = [
    'Passenger Capacity',
    'Passenger Doors',
    'Horsepower',
    'Torque',
    'Camera',
    'Height',
    *df_enc.columns,
]
X = df[feature]
y = df['Price']

# Hold out 33% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
import xgboost as xgb

# Two sub-grids: max_depth and gamma are tree-booster parameters, so crossing
# them with 'gblinear' only repeats identical linear fits.
param_grid = [
    {   # tree-based boosters
        'booster': ['gbtree', 'dart'],
        'objective': ["reg:squarederror"],
        'n_estimators': [10, 20],
        'max_depth': [25, 30],
        'learning_rate': [0.68, 0.69],
        'gamma': [0.95, 1, 1.05]},
    {   # linear booster
        'booster': ['gblinear'],
        'objective': ["reg:squarederror"],
        'n_estimators': [10, 20],
        'learning_rate': [0.68, 0.69]},
]

# shuffle=True is required for random_state to have any effect (recent
# scikit-learn versions raise a ValueError otherwise). It also matters here
# because the rows of df were sorted by price, so contiguous folds would each
# cover a different price range.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

clf = GridSearchCV(xgb.XGBRegressor(), param_grid, refit=True, verbose=3, cv=kfold)
# Nested cross-validation: the whole grid search is re-run inside each outer
# fold of the full data set, so this line is expensive.
score = cross_val_score(clf, X, y, cv=kfold)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# RMSE is derived from the MSE instead of recomputing it
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)


print(clf.best_params_)
print(f"mse = {mse}")
print(f"rmse = {rmse}")
r2 = metrics.r2_score(y_test, y_pred)
print(f"r2 = {r2}")
print(score)

# **Decision Tree (not the best choice for regression)**



In [None]:
# Predictors used by the decision tree; the target is the price
feature = [
    'Passenger Capacity',
    'Passenger Doors',
    'Horsepower',
    'Torque',
]
X = df[feature]
y = df['Price']

# Hold out 33% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

## First config

In [None]:
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): this is a classifier, so every distinct price is treated as
# its own class label; the section title already flags that this is not the
# best tool for a regression target.
param_grid =[{
              'criterion': ['entropy', 'gini'],
              'max_depth': [15, 18, 25, 30],
              'max_leaf_nodes':[1100, 1200, 1300]}]

# A fixed random_state makes the search repeatable from run to run.
rgr = GridSearchCV(DecisionTreeClassifier(random_state=42),param_grid,refit=True,verbose=3)

rgr.fit(X_train, y_train)

# Predict the response for the test dataset (done once; the refitted best
# estimator is used) and score the predicted prices against the true ones
y_pred = rgr.predict(X_test)
r2 = metrics.r2_score(y_test, y_pred)
print(f"r2 = {r2}")

In [None]:
# Classification-style scores of the predicted prices against the true ones
# (precision and recall are micro-averaged over all classes)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy est :", accuracy)
precision = metrics.precision_score(y_test, y_pred, average='micro')
print("precision est :", precision)
recall = metrics.recall_score(y_test, y_pred, average='micro')
print("recall est :", recall)

## Second config

In [None]:
# Predictors for this configuration: the numeric specs plus every one-hot
# body-style column; the target is the price
feature = [
    'Passenger Capacity',
    'Passenger Doors',
    'Horsepower',
    'Torque',
    *df_enc.columns,
]
X = df[feature]
y = df['Price']

# Hold out 33% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
# Search space for the decision tree: split criterion, depth limit and the
# maximum number of leaves
param_grid = [
    dict(
        criterion=['entropy', 'gini'],
        max_depth=[10, 18, 30, 50],
        max_leaf_nodes=[800, 1200, 2000, 2300],
    )
]

# Exhaustive search over the grid; refit=True retrains the best setting on
# the whole training split once the search is done
rgr = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    refit=True,
    verbose=3,
)
rgr = rgr.fit(X_train, y_train)

# Predictions for the held-out rows, then the winning parameter combination
y_pred = rgr.predict(X_test)
print(rgr.best_params_)

In [None]:
# Classification-style scores of the predicted prices against the true ones
# (precision and recall are micro-averaged over all classes)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy est :", accuracy)
precision = metrics.precision_score(y_test, y_pred, average='micro')
print("precision est :", precision)
recall = metrics.recall_score(y_test, y_pred, average='micro')
print("recall est :", recall)

# Perceptron

In [None]:
from sklearn.datasets import load_digits
from sklearn.linear_model import Perceptron

# Linear perceptron with a fixed seed, trained on the current training split
# (X_train / y_train as left by the preceding cells)
rgr = Perceptron(random_state=0, tol=1e-3)
rgr = rgr.fit(X_train, y_train)

# Score the predictions for the held-out rows with R^2
y_pred = rgr.predict(X_test)
r2 = metrics.r2_score(y_test, y_pred)
print(f"r2 = {r2}")