In [1]:
import datetime

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [2]:
dataset = pd.read_csv("data/dataset.csv")


In [3]:
X_train, X_test, y_train, y_test = train_test_split(dataset.iloc[:, :-1], 
                                                    dataset.iloc[:, -1], 
                                                    test_size = 0.3, 
                                                    random_state = 42)

In [4]:
make_train = X_train["Name"].str.split(" ", expand = True)
make_test = X_test["Name"].str.split(" ", expand = True)

In [5]:
X_train["Manufacturer"] = make_train[0]
X_test["Manufacturer"] = make_test[0]

In [6]:
ManufaturerDict = {
                    'Ford':1,'Maruti':2,'Honda':3,'Audi':4,'Nissan':5,'Hyundai':6,'Mahindra':7,'Tata':8,'BMW':9,'Skoda':10,
                    'Porsche':11,'Toyota':12,'Chevrolet':13,'Mercedes-Benz':14,'Land':15,'Force':16,'Volkswagen':17,                            'Renault':18,'Jaguar':19,'Volvo':20,'Mini':21,'Mitsubishi':22,'Fiat':23,'Ambassador':24,'Datsun':25,
                    'ISUZU':26,'Jeep':27,'Bentley':28,'Smart':29,'Lamborghini':30
                    }
X_train['Manufacture'] = X_train.Manufacturer.map(ManufaturerDict)
X_train.drop("Manufacturer", axis = 1, inplace = True)

In [7]:
X_test['Manufacture'] = X_test.Manufacturer.map(ManufaturerDict)
X_test.drop("Manufacturer", axis = 1, inplace = True)


In [8]:
X_train.drop("Name", axis = 1, inplace = True)
X_test.drop("Name", axis = 1, inplace = True)

In [9]:
X_train.drop("Location", axis = 1, inplace = True)
X_test.drop("Location", axis = 1, inplace = True)

In [10]:
curr_time = datetime.datetime.now()
X_train['Year'] = X_train['Year'].apply(lambda x : curr_time.year - x)
X_test['Year'] = X_test['Year'].apply(lambda x : curr_time.year - x)

In [11]:
mileage_train = X_train["Mileage"].str.split(" ", expand = True)
mileage_test = X_test["Mileage"].str.split(" ", expand = True)

X_train["Mileage"] = pd.to_numeric(mileage_train[0], errors = 'coerce')
X_test["Mileage"] = pd.to_numeric(mileage_test[0], errors = 'coerce')

In [12]:
print(sum(X_train["Mileage"].isnull()))
print(sum(X_test["Mileage"].isnull()))

1
1


In [13]:
X_train["Mileage"].fillna(X_train["Mileage"].astype("float64").mean() ,inplace = True)
X_test["Mileage"].fillna(X_train["Mileage"].astype("float64").mean(), inplace = True)

In [14]:
print(sum(X_test["Manufacture"].isnull()))


1


In [15]:
X_test["Manufacture"].fillna(X_train["Manufacture"].astype("int").mean(), inplace = True)

In [16]:
print(sum(X_test["Manufacture"].isnull()))


0


In [17]:
cc_train = X_train["Engine"].str.split(" ", expand = True)
cc_test = X_test["Engine"].str.split(" ", expand = True)
X_train["Engine"] = pd.to_numeric(cc_train[0], errors = 'coerce')
X_test["Engine"] = pd.to_numeric(cc_test[0], errors = 'coerce')

bhp_train = X_train["Power"].str.split(" ", expand = True)
bhp_test = X_test["Power"].str.split(" ", expand = True)
X_train["Power"] = pd.to_numeric(bhp_train[0], errors = 'coerce')
X_test["Power"] = pd.to_numeric(bhp_test[0], errors = 'coerce')

In [18]:
X_train["Engine"].fillna(X_train["Engine"].astype("float64").mean(), inplace = True)
X_test["Engine"].fillna(X_train["Engine"].astype("float64").mean(), inplace = True)

X_train["Power"].fillna(X_train["Power"].astype("float64").mean(), inplace = True)
X_test["Power"].fillna(X_train["Power"].astype("float64").mean(), inplace = True)

X_train["Seats"].fillna(X_train["Seats"].astype("float64").mean(), inplace = True)
X_test["Seats"].fillna(X_train["Seats"].astype("float64").mean(), inplace = True)

In [19]:
X_test["Manufacture"].fillna(X_train["Manufacture"].astype("float64").mean(), inplace = True)

In [20]:
print(sum(X_test["Manufacture"].isnull()))


0


In [21]:
X_train.drop(["New_Price"], axis = 1, inplace = True)
X_test.drop(["New_Price"], axis = 1, inplace = True)

In [22]:
X_train.head(5)

Unnamed: 0,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Manufacture
841,6,61000,Diesel,Manual,First,18.6,1388.0,67.0,5.0,1
3378,3,6006,Petrol,Manual,First,20.73,1373.0,91.1,5.0,2
1189,15,81000,Petrol,Manual,Second,12.8,1493.0,100.0,5.0,3
175,9,53000,Diesel,Automatic,First,12.8,1968.0,167.6,5.0,4
4747,5,46173,Diesel,Manual,Second,19.01,1461.0,108.5,5.0,5


In [23]:
X_test.head(5)

Unnamed: 0,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Manufacture
1315,3,39430,Petrol,Automatic,First,18.0,1497.0,117.3,5.0,3.0
5824,7,62000,Diesel,Manual,First,22.07,1199.0,73.9,5.0,17.0
1744,6,18000,Petrol,Manual,First,21.1,814.0,55.2,5.0,6.0
1860,4,15000,Petrol,Automatic,First,18.9,1197.0,82.0,5.0,6.0
1559,6,23000,Diesel,Manual,First,24.5,1498.0,98.6,7.0,3.0


In [24]:
FuelTypeDict = {
                    'Diesel':1,'Petrol':2,'CNG':3,'LPG':4,'Electric':5
}
X_train['Fuel_type'] = X_train.Fuel_Type.map(FuelTypeDict)
X_train.drop("Fuel_Type", axis = 1, inplace = True)

In [25]:
X_test['Fuel_type'] = X_test.Fuel_Type.map(FuelTypeDict)
X_test.drop("Fuel_Type", axis = 1, inplace = True)

In [26]:
X_train.head(5)

Unnamed: 0,Year,Kilometers_Driven,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Manufacture,Fuel_type
841,6,61000,Manual,First,18.6,1388.0,67.0,5.0,1,1
3378,3,6006,Manual,First,20.73,1373.0,91.1,5.0,2,2
1189,15,81000,Manual,Second,12.8,1493.0,100.0,5.0,3,2
175,9,53000,Automatic,First,12.8,1968.0,167.6,5.0,4,1
4747,5,46173,Manual,Second,19.01,1461.0,108.5,5.0,5,1


In [27]:
X_test.head(5)

Unnamed: 0,Year,Kilometers_Driven,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Manufacture,Fuel_type
1315,3,39430,Automatic,First,18.0,1497.0,117.3,5.0,3.0,2
5824,7,62000,Manual,First,22.07,1199.0,73.9,5.0,17.0,1
1744,6,18000,Manual,First,21.1,814.0,55.2,5.0,6.0,2
1860,4,15000,Automatic,First,18.9,1197.0,82.0,5.0,6.0,2
1559,6,23000,Manual,First,24.5,1498.0,98.6,7.0,3.0,1


In [28]:
X_test.head(5)

Unnamed: 0,Year,Kilometers_Driven,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Manufacture,Fuel_type
1315,3,39430,Automatic,First,18.0,1497.0,117.3,5.0,3.0,2
5824,7,62000,Manual,First,22.07,1199.0,73.9,5.0,17.0,1
1744,6,18000,Manual,First,21.1,814.0,55.2,5.0,6.0,2
1860,4,15000,Automatic,First,18.9,1197.0,82.0,5.0,6.0,2
1559,6,23000,Manual,First,24.5,1498.0,98.6,7.0,3.0,1


In [29]:
OwnerTypeDict = {
                    'First':1,'Second':2,'Third':3,'Fourth and above':4
}
X_train['Owner_type'] = X_train.Owner_Type.map(OwnerTypeDict)
X_train.drop("Owner_Type", axis = 1, inplace = True)

In [31]:
X_test['Owner_type'] = X_test.Owner_Type.map(OwnerTypeDict)
X_test.drop("Owner_Type", axis = 1, inplace = True)

In [32]:
X_test.head(5)

Unnamed: 0,Year,Kilometers_Driven,Transmission,Mileage,Engine,Power,Seats,Manufacture,Fuel_type,Owner_type
1315,3,39430,Automatic,18.0,1497.0,117.3,5.0,3.0,2,1.0
5824,7,62000,Manual,22.07,1199.0,73.9,5.0,17.0,1,1.0
1744,6,18000,Manual,21.1,814.0,55.2,5.0,6.0,2,1.0
1860,4,15000,Automatic,18.9,1197.0,82.0,5.0,6.0,2,1.0
1559,6,23000,Manual,24.5,1498.0,98.6,7.0,3.0,1,1.0


In [33]:
TransmissionDict = {
                    'Mannual':1,'Automatic':0
}
X_train['Transmission_type'] = X_train.Transmission.map(TransmissionDict)
X_train.drop("Transmission", axis = 1, inplace = True)

In [34]:
X_test['Transmission_type'] = X_test.Transmission.map(TransmissionDict)
X_test.drop("Transmission", axis = 1, inplace = True)

In [35]:
X_train.head(5)

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Manufacture,Fuel_type,Owner_type,Transmission_type
841,6,61000,18.6,1388.0,67.0,5.0,1,1,1.0,
3378,3,6006,20.73,1373.0,91.1,5.0,2,2,1.0,
1189,15,81000,12.8,1493.0,100.0,5.0,3,2,2.0,
175,9,53000,12.8,1968.0,167.6,5.0,4,1,1.0,0.0
4747,5,46173,19.01,1461.0,108.5,5.0,5,1,2.0,


In [36]:
print(sum(X_train["Fuel_type"].isnull()))


0


In [37]:
print(sum(X_train["Owner_type"].isnull()))


8


In [38]:
X_train["Owner_type"].fillna(X_train["Owner_type"].astype("float64").mean(), inplace = True)


In [39]:
print(sum(X_train["Owner_type"].isnull()))


0


In [40]:
print(sum(X_test["Owner_type"].isnull()))


1


In [41]:
X_test["Owner_type"].fillna(X_train["Owner_type"].astype("float64").mean(), inplace = True)


In [42]:
print(sum(X_test["Owner_type"].isnull()))


0


In [43]:
print(sum(X_test["Fuel_type"].isnull()))


0


In [44]:
print(sum(X_train["Transmission_type"].isnull()))


3028


In [45]:
X_test["Transmission_type"].fillna(X_train["Transmission_type"].astype("float64").mean(), inplace = True)


In [46]:
X_train["Transmission_type"].fillna(X_train["Transmission_type"].astype("float64").mean(), inplace = True)


In [47]:
print(sum(X_train["Transmission_type"].isnull()))


0


In [48]:
print(sum(X_test["Transmission_type"].isnull()))


0


In [49]:
X_train.head(5)

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Manufacture,Fuel_type,Owner_type,Transmission_type
841,6,61000,18.6,1388.0,67.0,5.0,1,1,1.0,0.0
3378,3,6006,20.73,1373.0,91.1,5.0,2,2,1.0,0.0
1189,15,81000,12.8,1493.0,100.0,5.0,3,2,2.0,0.0
175,9,53000,12.8,1968.0,167.6,5.0,4,1,1.0,0.0
4747,5,46173,19.01,1461.0,108.5,5.0,5,1,2.0,0.0


In [50]:
X_test.head(5)

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Manufacture,Fuel_type,Owner_type,Transmission_type
1315,3,39430,18.0,1497.0,117.3,5.0,3.0,2,1.0,0.0
5824,7,62000,22.07,1199.0,73.9,5.0,17.0,1,1.0,0.0
1744,6,18000,21.1,814.0,55.2,5.0,6.0,2,1.0,0.0
1860,4,15000,18.9,1197.0,82.0,5.0,6.0,2,1.0,0.0
1559,6,23000,24.5,1498.0,98.6,7.0,3.0,1,1.0,0.0


In [51]:
standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train = standardScaler.transform(X_train)
X_test = standardScaler.transform(X_test)

In [52]:
rf = RandomForestRegressor(n_estimators = 100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
r2_score(y_test, y_pred)

0.8721760072565564

In [1]:
linearRegression = LinearRegression()
linearRegression.fit(X_train, y_train)
y_pred = linearRegression.predict(X_test)
r2_score(y_test, y_pred)

NameError: name 'LinearRegression' is not defined

In [55]:
import pickle
file = open('RandomForestModel.pkl', 'wb')
pickle.dump(rf, file)

In [56]:
pickle.dump(standardScaler,open('Scaler.sav', 'wb'))