In [31]:
## Importing the Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings

# Suppress every warning for the rest of the session. This also hides
# deprecation notices, so disable it when debugging version issues.
warnings.filterwarnings('ignore')

In [32]:
## Load the raw cars dataset and preview its first ten rows

# NOTE(review): machine-specific absolute path; the notebook only runs where
# this exact file exists.
csv_path = r"C:\Users\KRISHNA\Downloads\cars.csv"
cars = pd.read_csv(csv_path)
cars.head(10)

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,36945,33337,3.5,6.0,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,23820,21761,2.0,4.0,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,26990,24647,2.4,4.0,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,33195,30299,3.2,6.0,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,43755,39014,3.5,6.0,225,18,24,3880,115,197
5,Acura,3.5 RL w/Navigation 4dr,Sedan,Asia,Front,46100,41100,3.5,6.0,225,18,24,3893,115,197
6,Acura,NSX coupe 2dr manual S,Sports,Asia,Rear,89765,79978,3.2,6.0,290,17,24,3153,100,174
7,Audi,A4 1.8T 4dr,Sedan,Europe,Front,25940,23508,1.8,4.0,170,22,31,3252,104,179
8,Audi,A41.8T convertible 2dr,Sedan,Europe,Front,35940,32506,1.8,4.0,170,23,30,3638,105,180
9,Audi,A4 3.0 4dr,Sedan,Europe,Front,31840,28846,3.0,6.0,220,20,28,3462,104,179


## 1) Preprocessing

In [33]:
## Count how often each model name occurs and show the five most frequent

model_counts = cars[['Model']].value_counts()
model_counts.head()

Model                
C320 4dr                 2
C240 4dr                 2
G35 4dr                  2
3.5 RL 4dr               1
Passat W8 4MOTION 4dr    1
dtype: int64

In [None]:
## Inspect the dtype pandas inferred for every column

cars.dtypes

Make            object
Model           object
Type            object
Origin          object
DriveTrain      object
MSRP             int64
Invoice          int64
EngineSize     float64
Cylinders      float64
Horsepower       int64
MPG_City         int64
MPG_Highway      int64
Weight           int64
Wheelbase        int64
Length           int64
dtype: object

In [None]:
## Derive 'Convertible' from 'Model': 1 when the model name contains the
## (case-sensitive) substring 'convertible', otherwise 0

cars['Convertible'] = cars['Model'].map(lambda model_name: int('convertible' in model_name))

In [None]:
## Derive 'Doors_2' from 'Model': 1 when the model name contains the
## substring '2dr', otherwise 0

cars['Doors_2'] = cars['Model'].map(lambda model_name: int('2dr' in model_name))

In [None]:
## Keep only the first whitespace-separated word of each 'Model' entry

def _first_word(text):
    """Return the first whitespace-separated word of `text`, or '' if there is none."""
    words = text.split()
    return words[0] if words else ""

cars['Model Name'] = cars['Model'].map(_first_word)

In [None]:
## Count the missing values in each column

cars.isna().sum()

In [None]:
## Check the dimensions as a (rows, columns) tuple

cars.shape

In [None]:
## Check the skewness of the numeric columns

# numeric_only=True is required on pandas >= 2.0, where the default no longer
# skips the string columns (Make, Model, ...) and the call raises instead.
cars.skew(numeric_only=True)

In [None]:
## Fill the missing 'Cylinders' values with the column's most frequent value

most_common_cylinders = cars['Cylinders'].mode()[0]
cars['Cylinders'] = cars['Cylinders'].fillna(most_common_cylinders)

In [None]:
## Re-count missing values after the fill (the original note records that
## no null values remain)

cars.isna().sum()

## 2) EDA

In [None]:
## The Report shows that there are outliers present in all of the numerical columns

In [None]:
## Draw one boxplot per numeric column to display the outliers

plt.rcParams.update({'figure.figsize': [10, 11.0], 'figure.autolayout': True})

box_columns = ['MSRP','Invoice','EngineSize','Cylinders', 'Horsepower',
               'MPG_City','MPG_Highway','Weight','Wheelbase','Length']
ax = cars[box_columns].plot.box(title='boxplots')

plt.show()

In [None]:
## Check the datatypes in order to change them to their native formats

cars.dtypes

In [None]:
## Cast 'Cylinders' to int64
## (astype('int64') raises if the column still holds NaN, so the missing
## values must already have been filled)

cars['Cylinders'] = cars['Cylinders'].astype('int64')

In [None]:
## Cap the outliers of every numeric column with the IQR rule: values above
## Q3 + 1.5*IQR are set to that upper fence and values below Q1 - 1.5*IQR to
## the lower fence. IQR is used because the numeric columns are skewed.

num_col = ['MSRP', 'Invoice','EngineSize','Cylinders','Horsepower','MPG_City','MPG_Highway','Weight','Wheelbase','Length']

for col in num_col:
    q1, q3 = cars[col].quantile(0.25), cars[col].quantile(0.75)
    iqr = q3 - q1

    lower_limit = q1 - 1.5 * iqr
    upper_limit = q3 + 1.5 * iqr

    values = cars[col]
    # The first matching condition wins; values inside the fences are kept.
    cars[col] = np.select(
        [values > upper_limit, values < lower_limit],
        [upper_limit, lower_limit],
        default=values,
    )

In [None]:
## Redraw the boxplots to show the effect of capping (outliers are clipped to
## the IQR fences; no rows are removed)

plt.rcParams.update({'figure.figsize': [10, 11.0], 'figure.autolayout': True})

capped_columns = ['MSRP','Invoice','EngineSize','Cylinders', 'Horsepower',
                  'MPG_City','MPG_Highway','Weight','Wheelbase','Length']
ax = cars[capped_columns].plot.box(title='boxplots')

plt.show()

In [None]:
## Drop the columns that are not used downstream: 'Model' and 'Invoice'

unused_columns = ['Model','Invoice']
cars_df = cars.drop(unused_columns, axis=1)
cars_df.head(10)

In [None]:
## Cast the listed numeric columns to int64
## (any fractional part of a value is discarded by the cast)

num_col1 = ['Cylinders','Horsepower','MPG_City','MPG_Highway','Weight','Wheelbase','Length','MSRP']

cars_df = cars_df.astype({name: 'int64' for name in num_col1})

In [None]:
## Confirm the column dtypes of cars_df
cars_df.dtypes

In [None]:
## Save the preprocessed data to a CSV file so the cleaned dataset is not lost
## (to_csv also writes the row index as an unnamed first column)

cars_df.to_csv('cars cleaned dataset.csv')

In [None]:
## Reload the cleaned dataset from disk and preview it
car_final = pd.read_csv('cars cleaned dataset.csv')
car_final.head()

In [None]:
## Drop the 'Unnamed: 0' column (the row index saved in the CSV)

# errors='ignore' keeps the cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
car_final = car_final.drop(columns = ['Unnamed: 0'], errors='ignore')
car_final.head()

In [None]:
## Summarise the final dataframe (column dtypes and non-null counts)
## All the values are accounted for


car_final.info()

## 3) Model Building

In [None]:
## Split the data into the target y (MSRP, kept as a one-column dataframe)
## and the feature matrix x (every other column)

x = car_final.drop('MSRP', axis=1)

y = car_final.loc[:, ['MSRP']]

In [None]:
## Create the train and test datasets from x and y: 80% of the rows go to
## training, and random_state=0 makes the split reproducible

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,train_size=0.8,random_state=0)

In [None]:
## Importing the Libraries for model building

from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.metrics import accuracy_score

from sklearn import metrics

In [None]:
## Build the encoding + random-forest pipeline, score it on the held-out test
## split, then refit it on the full dataset for export

# Positional indices of the categorical columns of x.
# NOTE(review): expected to be Make, Type, Origin, DriveTrain and Model Name;
# confirm if the column order of x ever changes.
categorical_cols = [0,1,2,3,14]

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name; try the new spelling and fall back for older
# releases so the cell runs on both.
try:
    encoder = OneHotEncoder(handle_unknown='ignore',sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore',sparse=False)

# remainder='passthrough' keeps the columns that are not one-hot encoded.
# The default ('drop') silently discarded every numeric feature.
step1 = ColumnTransformer(transformers=[('col_t',encoder,categorical_cols)],remainder='passthrough')

step2 = RandomForestRegressor(n_estimators=500,random_state=42,max_depth=50)

pipe4 = Pipeline([('step1',step1),('step2',step2)])

# Fit on the training split only; fitting on all of x, y (as before) lets the
# model see the test rows and inflates the reported scores.
# .values.ravel() hands the regressor the 1-D target it expects.
pipe4.fit(x_train,y_train.values.ravel())

y_pred = pipe4.predict(x_test)

print('R2 score',r2_score(y_test,y_pred) *100 )
print('MAE',mean_absolute_error(y_test,y_pred))

# Refit on every row so the pipeline left in `pipe4` (and pickled later) is
# trained on the full dataset, as it was before the evaluation fix.
pipe4.fit(x,y.values.ravel())



In [None]:
## Persist the cleaned dataframe and the fitted pipeline with pickle

import pickle

# The with-blocks close (and flush) each file even if dump() raises; the bare
# open() calls used before never closed their handles.
with open('Df.pkl','wb') as df_file:
    pickle.dump(car_final,df_file)

with open('Model1.pkl','wb') as model_file:
    pickle.dump(pipe4,model_file)