# Reduce the time a Mercedes-Benz spends on the test bench.

### Problem Statement Scenario:
     Since the first automobile, the Benz Patent Motor Car in 1886, Mercedes-Benz has stood for important automotive innovations. These include the passenger safety cell with a crumple zone, the airbag, and intelligent assistance systems. Mercedes-Benz applies for nearly 2000 patents per year, making the brand the European leader among premium carmakers. Mercedes-Benz is the leader in the premium car industry. With a huge selection of features and options, customers can choose the customized Mercedes-Benz of their dreams.
     
     To ensure the safety and reliability of every unique car configuration before they hit the road, the company’s engineers have developed a robust testing system. As one of the world’s biggest manufacturers of premium cars, safety and efficiency are paramount on Mercedes-Benz’s production lines. However, optimizing the speed of their testing system for many possible feature combinations is complex and time-consuming without a powerful algorithmic approach.
     
      You are required to reduce the time that cars spend on the test bench. Others will work with a dataset representing different permutations of features in a Mercedes-Benz car to predict the time it takes to pass testing. Optimal algorithms will contribute to faster testing, resulting in lower carbon dioxide emissions without reducing Mercedes-Benz’s standards.
      
   ##### Following actions should be performed:

     1.If for any column(s), the variance is equal to zero, then you need to remove those variable(s).
     2.Check for null and unique values for test and train sets.
     3.apply label encoder.
     4.Perform dimensionality reduction.
     5.Predict your test_df values using XGBoost.

In [49]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from math import sqrt


In [50]:
train_data= pd.read_csv('train.csv')
test_data=pd.read_csv('test.csv')

In [51]:
train_data.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [52]:
test_data.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


### DATA PREPARATION :  Drop column ID as this is not relevant to prediction

In [53]:
train_data=train_data.drop(['ID'],axis=1)
test_data=test_data.drop(['ID'],axis=1)

In [54]:
train_data.head()

Unnamed: 0,y,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,130.81,k,v,at,a,d,u,j,o,0,...,0,0,1,0,0,0,0,0,0,0
1,88.53,k,t,av,e,d,y,l,o,0,...,1,0,0,0,0,0,0,0,0,0
2,76.26,az,w,n,c,d,x,j,x,0,...,0,0,0,0,0,0,1,0,0,0
3,80.62,az,t,n,f,d,x,l,e,0,...,0,0,0,0,0,0,0,0,0,0
4,78.02,az,v,n,f,d,h,d,n,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
test_data.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,az,v,n,f,d,t,a,w,0,0,...,0,0,0,1,0,0,0,0,0,0
1,t,b,ai,a,d,b,g,y,0,0,...,0,0,1,0,0,0,0,0,0,0
2,az,v,as,f,d,a,j,j,0,0,...,0,0,0,1,0,0,0,0,0,0
3,az,l,n,f,d,z,l,n,0,0,...,0,0,0,1,0,0,0,0,0,0
4,w,s,as,c,d,y,i,m,0,0,...,1,0,0,0,0,0,0,0,0,0


## TASK-1.If for any column(s), the variance is equal to zero, then you need to remove those variable(s). 

In [56]:
columns_with_zero_var=train_data.var()[train_data.var()==0].index.values
columns_with_zero_var

  columns_with_zero_var=train_data.var()[train_data.var()==0].index.values


array(['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290',
       'X293', 'X297', 'X330', 'X347'], dtype=object)

In [58]:
train_data=train_data.drop(['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290',
       'X293', 'X297', 'X330', 'X347'],axis=1)
test_data=test_data.drop(['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290',
       'X293', 'X297', 'X330', 'X347'],axis=1)

In [59]:
train_data.shape

(4209, 365)

In [60]:
test_data.shape

(4209, 364)

## TASK 2. Check for null and unique values for test and train sets.

In [61]:
## Check the null values in train and test dataset.
print('sum of null values in train data',np.sum(train_data.isnull().sum()))
print('sum of null values in test data',np.sum(test_data.isnull().sum()))

sum of null values in train data 0
sum of null values in test data 0


In [62]:
## Check the unique values in train and test dataset.
print(' unique values in train data',np.sum(train_data.nunique().sum()))
print(' unique values in test data',np.sum(test_data.nunique().sum()))

 unique values in train data 3452
 unique values in test data 908


###### Both train and test dataset does not contain any null values. Whereas, they got 7661 and 5117 unique values respectively.

## TASK 3. Apply Label Encoder

In [63]:
# find the columns having datatype as object
object_columns=train_data.describe(include=[object]).columns.values
object_columns

array(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'], dtype=object)

In [64]:
le=LabelEncoder()
for col in object_columns:
    le.fit(train_data[col].append(test_data[col]).values)
    train_data[col]=le.transform(train_data[col])
    test_data[col]=le.transform(test_data[col])

  le.fit(train_data[col].append(test_data[col]).values)


In [65]:
train_data.head()

Unnamed: 0,y,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,130.81,37,23,20,0,3,27,9,14,0,...,0,0,1,0,0,0,0,0,0,0
1,88.53,37,21,22,4,3,31,11,14,0,...,1,0,0,0,0,0,0,0,0,0
2,76.26,24,24,38,2,3,30,9,23,0,...,0,0,0,0,0,0,1,0,0,0
3,80.62,24,21,38,5,3,30,11,4,0,...,0,0,0,0,0,0,0,0,0,0
4,78.02,24,23,38,5,3,14,3,13,0,...,0,0,0,0,0,0,0,0,0,0



# TASK-4.Perform dimensionality reduction.

In [66]:
## Create X and Y
X=train_data.drop(['y'],axis=1)
y=train_data.y

## Create train and test split
from sklearn import model_selection
X_train,X_test,y_train,y_test=model_selection.train_test_split(X,y,test_size=0.3,random_state=1)

In [67]:
from sklearn.decomposition import PCA as sklearnPCA
pca=sklearnPCA(0.98,svd_solver='full')
sklearn_PCA=pca.fit(X)

In [68]:
sklearn_PCA.n_components_

12

 Using PCA ,decomposition of data i.e.,364 components with 98% variance to project
it to a lower dimensional space is 1 components

In [69]:
sklearn_PCA.explained_variance_ratio_

array([0.40868988, 0.21758508, 0.13120081, 0.10783522, 0.08165248,
       0.0140934 , 0.00660951, 0.00384659, 0.00260289, 0.00214378,
       0.00209857, 0.00180388])

## TASK 5.Predict your test_df values using XGBoost.

In [70]:
pca_X_train=pd.DataFrame(sklearn_PCA.transform(X_train))
pca_X_test=pd.DataFrame(sklearn_PCA.transform(X_test))
pca_df_test_data=pd.DataFrame(sklearn_PCA.transform(test_data))


In [71]:
!pip3 install xgboost

Collecting xgboost
  Downloading xgboost-1.6.1-py3-none-win_amd64.whl (125.4 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.1


In [72]:
import xgboost as xgb
xgb_regressor_model=xgb.XGBRegressor(objective = 'reg:squarederror',learning_rate=0.1)
xgb_regressor_model.fit(pca_X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [76]:
xgb_rebregressor_model_predicted_y_test=xgb_regressor_model.predict(pca_X_test)
print(sqrt(mean_squared_error(y_test,xgb_rebregressor_model_predicted_y_test)))

8.555776313214766


In [77]:
xgb_RFregressor_model=xgb.XGBRFRegressor(objective='reg:squarederror',learning_rate=1)
xgb_RFregressor_model.fit(pca_X_train,y_train)


XGBRFRegressor(base_score=0.5, booster='gbtree', callbacks=None,
               colsample_bylevel=1, colsample_bytree=1,
               early_stopping_rounds=None, enable_categorical=False,
               eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
               importance_type=None, interaction_constraints='',
               learning_rate=1, max_bin=256, max_cat_to_onehot=4,
               max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
               missing=nan, monotone_constraints='()', n_estimators=100,
               n_jobs=0, num_parallel_tree=100, objective='reg:squarederror',
               predictor='auto', random_state=0, reg_alpha=0,
               sampling_method='uniform', ...)

In [81]:
xgb_RFregressor_model_predicted_y_test=xgb_RFregressor_model.predict(pca_X_test)
print(sqrt(mean_squared_error(y_test,xgb_RFrebregressor_model_predicted_y_test)))

9.064875902434391
