In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
# import pickle
from sklearn.metrics import mean_absolute_error
import os

In [None]:
# Load the raw listings CSV that sits in the current working directory.
working_dir = os.getcwd()
csv_path = working_dir + '/cardekho_updated.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the freshly loaded frame.
data.head(5)

In [None]:
# Count the missing values in every column.
data.isna().sum()

In [None]:
# Multipliers for the unit words that can follow the number in a raw price.
_PRICE_UNITS = {'Lakh': 100000, 'Cr': 10000000}


def _parse_price(raw):
    """Convert one raw ``selling_price`` entry to a whole-rupee float.

    Accepted string forms:
      * ``'<number> Lakh*'``  -> number * 100,000
      * ``'<number> Cr*'``    -> number * 10,000,000
      * ``'<digits,with,commas>*'`` -> the plain amount

    The trailing ``*`` is optional and thousands separators are ignored.
    Non-string values (already-converted numbers, NaN) are returned
    unchanged so the cell can be re-run safely.

    Raises:
        ValueError: if the string is empty, is not numeric, or carries a
            unit other than ``Lakh`` / ``Cr``.
    """
    if not isinstance(raw, str):
        return raw

    tokens = raw.replace(',', '').split()
    if not tokens:
        raise ValueError('Unrecognised selling_price value: {!r}'.format(raw))

    amount = float(tokens[0].rstrip('*'))
    if len(tokens) > 1:
        unit = tokens[1].rstrip('*')
        if unit not in _PRICE_UNITS:
            raise ValueError('Unrecognised selling_price unit in {!r}'.format(raw))
        amount *= _PRICE_UNITS[unit]

    # Products such as 4.35 * 100000 land a hair below the intended integer
    # in binary floating point; rounding keeps a later integer cast from
    # truncating them to one rupee less.
    return float(round(amount))


# Assign the whole column at once: element-wise chained assignment
# (data['selling_price'][i] = ...) is not guaranteed to write through to `data`.
data['selling_price'] = data['selling_price'].map(_parse_price)

In [None]:
def _token(series, sep, position, **split_options):
    """Split each string in ``series`` on ``sep`` and return the piece at ``position``."""
    return series.str.split(sep, expand=True, **split_options)[position]


# km_driven: text before the first space, with commas removed.
data['km_driven'] = _token(data['km_driven'], ' ', 0, n=1).str.replace(',','')

# The remaining columns are reduced to one fragment of one word; the positions
# presumably isolate the numeric part after a fused label -- verify against the CSV.
data['mileage'] = _token(_token(data['mileage'], ' ', 0), 'e', 2)
data['engine'] = _token(_token(data['engine'], ' ', 0), 'e', 1)
data['max_power'] = _token(_token(data['max_power'], ' ', 1), 'r', 1)
data['seats'] = _token(data['seats'], 's', 1)

In [None]:
# Columns that hold numbers but may still be stored as strings/objects.
cols = ['selling_price', 'km_driven', 'mileage', 'engine', 'max_power', 'seats']

for col in cols:
    try:
        # Prefer an integer dtype when every value converts cleanly.
        data[col] = data[col].astype(int)
    except (ValueError, TypeError, OverflowError):
        # Decimal strings or missing values cannot be cast to int; fall back
        # to float. Only conversion errors are caught, so unrelated failures
        # (missing column, interrupt) still surface.
        data[col] = data[col].astype(float)

In [None]:
# The company is the text before the first space of the full name.
name_parts = data['full_name'].str.split(' ', n=1, expand=True)
data['company'] = name_parts[0]

In [None]:
# Remove the columns that are not used from here on, then preview the result.
unused_columns = ['new_price','full_name','owner_type']
data = data.drop(columns=unused_columns)
data.head()

In [None]:
# Impute missing numeric specs. For each column the fill value comes from the
# company with the most missing entries in that column: that company's mean
# when the column has more than 10 distinct values, otherwise its median.
for col in ['mileage', 'engine', 'max_power', 'seats']:
    missing = data[col].isnull()
    if not missing.any():
        # Nothing to fill; also avoids indexing into an empty value_counts().
        continue

    company_name = data.loc[missing, 'company'].value_counts().index[0]
    company_values = data.loc[data['company'] == company_name, col]
    if data[col].nunique() > 10:
        fill_value = company_values.mean()
    else:
        fill_value = company_values.median()

    # Assign the filled column back. Calling fillna(inplace=True) on the
    # selection data[col] is a chained in-place operation that is not
    # guaranteed to update `data` itself.
    data[col] = data[col].fillna(fill_value)

In [None]:
# Exclusive upper bounds; a row is kept only if it is below every one of them.
upper_bounds = {
    'selling_price': 20000000,
    'km_driven': 1000000,
    'mileage': 100,
    'engine': 6100,
    'max_power': 530,
}

within_bounds = np.ones(len(data), dtype=bool)
for column, bound in upper_bounds.items():
    within_bounds &= (data[column] < bound).to_numpy()

data = data[within_bounds].reset_index(drop=True)

In [None]:
# Keep the 15 most frequent companies and merge every other one into 'others'.
company_name = data.company.value_counts().index[:15]

# One vectorised .loc assignment replaces the row-by-row
# data['company'][i] = ... loop, whose chained assignment is not guaranteed
# to write through to `data`.
data.loc[~data['company'].isin(company_name), 'company'] = 'others'

In [None]:
# data.to_csv("before.csv")

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
categorical_columns = ['seller_type','fuel_type','transmission_type','company']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)
data.shape

In [None]:
# Select the target by name and use every other column as a feature.
# Slicing by position (iloc[:, 1:]) only works while 'selling_price' happens
# to be the first column; otherwise the target would leak into the features.
y = data['selling_price']
x = data.drop(columns=['selling_price'])

# Hold out 10% of the rows for testing; the seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.10,random_state=25)
print(xtrain.shape,xtest.shape)

In [None]:
# pd.concat([xtrain, xtest]).to_csv("after.csv")

In [None]:
def do_prediction(classifier):
    """Fit ``classifier`` on the training split and score it.

    Reads the notebook-level ``xtrain``, ``ytrain``, ``xtest`` and ``ytest``.

    Returns:
        tuple: ``(error, cross_validation_score)`` where ``error`` is the mean
        absolute error of the predictions on ``xtest`` and
        ``cross_validation_score`` is the value returned by ``cross_val`` for
        the training split.
    """
    classifier.fit(xtrain, ytrain)
    test_predictions = classifier.predict(xtest)
    mean_cv_score = cross_val(xtrain, ytrain, classifier)
    test_mae = mean_absolute_error(ytest, test_predictions)
    return test_mae, mean_cv_score

def cross_val(xtrain, ytrain, classifier):
    """Return the mean of the 5-fold ``cross_val_score`` results.

    Args:
        xtrain: feature matrix passed as ``X``.
        ytrain: target values passed as ``y``.
        classifier: estimator to evaluate.
    """
    fold_scores = cross_val_score(estimator=classifier, X=xtrain, y=ytrain, cv=5)
    return fold_scores.mean()

In [None]:
# Seed the forest so the reported metrics are the same on every run; the
# value matches the seed used for the train/test split.
model = RandomForestRegressor(random_state=25)
error, score = do_prediction(model)

print('Random Forest Regressor MAE: {}'.format(round(error,2)))
print('Cross validation score: {}'.format(round(score,2)))

In [None]:
# pickle.dump(model, open('reg_model.pkl', 'wb'))