In [1]:
import datetime

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [2]:
dataset = pd.read_csv("data/dataset.csv")


In [3]:
X_train, X_test, y_train, y_test = train_test_split(dataset.iloc[:, :-1], 
                                                    dataset.iloc[:, -1], 
                                                    test_size = 0.3, 
                                                    random_state = 42)

In [4]:
make_train = X_train["Name"].str.split(" ", expand = True)
make_test = X_test["Name"].str.split(" ", expand = True)

In [5]:
X_train["Manufacturer"] = make_train[0]
X_test["Manufacturer"] = make_test[0]

In [6]:
ManufaturerDict = {
                    'Ford':1,'Maruti':2,'Honda':3,'Audi':4,'Nissan':5,'Hyundai':6,'Mahindra':7,'Tata':8,'BMW':9,'Skoda':10,
                    'Porsche':11,'Toyota':12,'Chevrolet':13,'Mercedes-Benz':14,'Land':15,'Force':16,'Volkswagen':17,                            'Renault':18,'Jaguar':19,'Volvo':20,'Mini':21,'Mitsubishi':22,'Fiat':23,'Ambassador':24,'Datsun':25,
                    'ISUZU':26,'Jeep':27,'Bentley':28,'Smart':29,'Lamborghini':30
                    }
X_train['Manufacture'] = X_train.Manufacturer.map(ManufaturerDict)
X_train.drop("Manufacturer", axis = 1, inplace = True)

In [7]:
X_test['Manufacture'] = X_test.Manufacturer.map(ManufaturerDict)
X_test.drop("Manufacturer", axis = 1, inplace = True)

In [8]:
X_train.drop("Name", axis = 1, inplace = True)
X_test.drop("Name", axis = 1, inplace = True)

In [9]:
X_train.drop("Location", axis = 1, inplace = True)
X_test.drop("Location", axis = 1, inplace = True)

In [10]:
curr_time = datetime.datetime.now()
X_train['Year'] = X_train['Year'].apply(lambda x : curr_time.year - x)
X_test['Year'] = X_test['Year'].apply(lambda x : curr_time.year - x)

In [11]:
mileage_train = X_train["Mileage"].str.split(" ", expand = True)
mileage_test = X_test["Mileage"].str.split(" ", expand = True)

X_train["Mileage"] = pd.to_numeric(mileage_train[0], errors = 'coerce')
X_test["Mileage"] = pd.to_numeric(mileage_test[0], errors = 'coerce')

In [12]:
print(sum(X_train["Mileage"].isnull()))
print(sum(X_test["Mileage"].isnull()))

1
1


In [13]:
X_train["Mileage"].fillna(X_train["Mileage"].astype("float64").mean() ,inplace = True)
X_test["Mileage"].fillna(X_train["Mileage"].astype("float64").mean(), inplace = True)

In [14]:
print(sum(X_test["Manufacture"].isnull()))


1


In [15]:
X_test["Manufacture"].fillna(X_train["Manufacture"].astype("int").mean(), inplace = True)

In [16]:
print(sum(X_test["Manufacture"].isnull()))


0


In [17]:
cc_train = X_train["Engine"].str.split(" ", expand = True)
cc_test = X_test["Engine"].str.split(" ", expand = True)
X_train["Engine"] = pd.to_numeric(cc_train[0], errors = 'coerce')
X_test["Engine"] = pd.to_numeric(cc_test[0], errors = 'coerce')

bhp_train = X_train["Power"].str.split(" ", expand = True)
bhp_test = X_test["Power"].str.split(" ", expand = True)
X_train["Power"] = pd.to_numeric(bhp_train[0], errors = 'coerce')
X_test["Power"] = pd.to_numeric(bhp_test[0], errors = 'coerce')

In [18]:
X_train["Engine"].fillna(X_train["Engine"].astype("float64").mean(), inplace = True)
X_test["Engine"].fillna(X_train["Engine"].astype("float64").mean(), inplace = True)

X_train["Power"].fillna(X_train["Power"].astype("float64").mean(), inplace = True)
X_test["Power"].fillna(X_train["Power"].astype("float64").mean(), inplace = True)

X_train["Seats"].fillna(X_train["Seats"].astype("float64").mean(), inplace = True)
X_test["Seats"].fillna(X_train["Seats"].astype("float64").mean(), inplace = True)

In [19]:
X_test["Manufacture"].fillna(X_train["Manufacture"].astype("float64").mean(), inplace = True)

In [20]:
print(sum(X_test["Manufacture"].isnull()))


0


In [21]:
X_train.drop(["New_Price"], axis = 1, inplace = True)
X_test.drop(["New_Price"], axis = 1, inplace = True)

In [22]:
FuelTypeDict = {
                    'Diesel':1,'Petrol':2,'CNG':3,'LPG':4,'Electric':5
}
X_train['Fuel_type'] = X_train.Fuel_Type.map(FuelTypeDict)
X_train.drop("Fuel_Type", axis = 1, inplace = True)

In [23]:
X_test['Fuel_type'] = X_test.Fuel_Type.map(FuelTypeDict)
X_test.drop("Fuel_Type", axis = 1, inplace = True)

In [24]:
OwnerTypeDict = {
                    'First':1,'Second':2,'Third':3,'Fourth and above':4
}
X_train['Owner_type'] = X_train.Owner_Type.map(OwnerTypeDict)
X_train.drop("Owner_Type", axis = 1, inplace = True)

In [25]:
X_test['Owner_type'] = X_test.Owner_Type.map(OwnerTypeDict)
X_test.drop("Owner_Type", axis = 1, inplace = True)

In [26]:
TransmissionDict = {
                    'Mannual':1,'Automatic':0
}
X_train['Transmission_type'] = X_train.Transmission.map(TransmissionDict)
X_train.drop("Transmission", axis = 1, inplace = True)

In [27]:
X_test['Transmission_type'] = X_test.Transmission.map(TransmissionDict)
X_test.drop("Transmission", axis = 1, inplace = True)

In [28]:
print(sum(X_train["Fuel_type"].isnull()))


0


In [29]:
print(sum(X_train["Owner_type"].isnull()))


8


In [30]:
X_train["Owner_type"].fillna(X_train["Owner_type"].astype("float64").mean(), inplace = True)


In [31]:
print(sum(X_train["Owner_type"].isnull()))


0


In [32]:
print(sum(X_test["Owner_type"].isnull()))


1


In [33]:
X_test["Owner_type"].fillna(X_train["Owner_type"].astype("float64").mean(), inplace = True)


In [34]:
print(sum(X_test["Owner_type"].isnull()))


0


In [35]:
print(sum(X_test["Fuel_type"].isnull()))


0


In [36]:
print(sum(X_train["Transmission_type"].isnull()))


3028


In [37]:
X_test["Transmission_type"].fillna(X_train["Transmission_type"].astype("float64").mean(), inplace = True)


In [38]:
X_train["Transmission_type"].fillna(X_train["Transmission_type"].astype("float64").mean(), inplace = True)


In [39]:
print(sum(X_train["Transmission_type"].isnull()))


0


In [40]:
print(sum(X_test["Transmission_type"].isnull()))


0


In [41]:
standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train = standardScaler.transform(X_train)
X_test = standardScaler.transform(X_test)

In [42]:
rf = RandomForestRegressor(n_estimators = 100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
r2_score(y_test, y_pred)

0.8825838580427326

In [43]:
linearRegression = LinearRegression()
linearRegression.fit(X_train, y_train)
y_pred = linearRegression.predict(X_test)
r2_score(y_test, y_pred)

0.690193483019133

In [44]:
from sklearn.ensemble import GradientBoostingRegressor


In [45]:
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
r2_score(y_test, y_pred)

0.8825719968080576

In [46]:
from sklearn.metrics import sm
print("Mean absolute error =", round(sm.mean_absolute_error(y_test, y_pred), 2)) 



ImportError: cannot import name 'sm' from 'sklearn.metrics' (C:\Users\rasoo\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\metrics\__init__.py)

In [56]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
print("r2_score : ", r2_score(y_test, y_pred))
print("mean_abs_error : " ,mean_absolute_error(y_test, y_pred))
print("mean_square_error : " ,mean_squared_error(y_test, y_pred))
  


r2_score :  0.8838314088935328
mean_abs_error :  1.8828881439086698
mean_square_error :  15.120984777988594


In [57]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
linearRegression = LinearRegression()
linearRegression.fit(X_train, y_train)
y_pred = linearRegression.predict(X_test)
print("r2_score : ", r2_score(y_test, y_pred))
print("mean_abs_error : " ,mean_absolute_error(y_test, y_pred))
print("mean_square_error : " ,mean_squared_error(y_test, y_pred))

r2_score :  0.690193483019133
mean_abs_error :  3.9344747916398193
mean_square_error :  40.32569890682406


In [58]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
rf = RandomForestRegressor(n_estimators = 100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("r2_score : ", r2_score(y_test, y_pred))
print("mean_abs_error : " ,mean_absolute_error(y_test, y_pred))
print("mean_square_error : " ,mean_squared_error(y_test, y_pred))

r2_score :  0.8743569284816141
mean_abs_error :  1.7690666300822655
mean_square_error :  16.354222374514787
