In [7]:
import pandas as pd
import tensorflow as tf
import numpy as np
import keras
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Preprocessing
def preprocess(df):
    # nan_cols = df.isna().any()
    # print("Empty Cols = ")
    # print(nan_cols[nan_cols == True].index.tolist())
    df.dropna(inplace=True)
    df['Fuel_Type_Encoded']=df['Fuel_Type'].astype('category').cat.codes
    df['Transmission_Encoded']=df['Transmission'].astype('category').cat.codes
    df['Owner_Type_Encoded']=df['Owner_Type'].astype('category').cat.codes
    df['New_Price'] = df['New_Price'].str.replace('Lakh', '')
    df['New_Price'] = df['New_Price'].str.replace('Cr', '')
    df['Power'] = df['Power'].str.replace('bhp', '')
    df['Engine'] = df['Engine'].str.replace('CC', '')
    df['Mileage'] = df['Mileage'].str.replace('kmpl', '')
    df['Mileage'] = df['Mileage'].str.replace('km/kg', '')
    df['New_Price'] = df['New_Price'].str.strip()
    df['Power'] = df['Power'].str.strip()
    df['Engine'] = df['Engine'].str.strip()
    df['Mileage'] = df['Mileage'].str.strip()
    df['Mileage']=df['Mileage'].astype(float)
    df['Engine']=df['Engine'].astype(float)
    df['Power']=df['Power'].astype(float)
    df['New_Price']=df['New_Price'].astype(float)
    df=df.reset_index(drop=True)
    return df

# Feature Engineering
def calcAge(year):
    current_year = datetime.now().year
    return current_year-year

def norm(srs):
    scaler = MinMaxScaler()
    scaled_srs = scaler.fit_transform(srs.values.reshape(-1,1))
    return scaled_srs
    
def feature_engineering(df):
    df['Age']=df['Year'].apply(calcAge)
    df['Kilometers_Driven']=norm(df['Kilometers_Driven'])
    df['Engine']=norm(df["Engine"])
    df['Power']=norm(df["Power"])
    df['Mileage']=norm(df["Mileage"])
    df['Price_Drop']=df['New_Price']-df['Price']
    df['Price']=norm(df["Price"])
    df['Price_Drop']=norm(df["Price_Drop"])
    df['New_Price']=norm(df["New_Price"])
    df['Fuel_Type']=df['Fuel_Type_Encoded']
    df['Owner_Type']=df['Owner_Type_Encoded']
    df['Transmission']=df['Transmission_Encoded']
    df['Name']=df['Name'].str.extract('(\w+)')
    df['Name']=df['Name'].astype('category').cat.codes
    df.drop(['Transmission_Encoded','Owner_Type_Encoded','Fuel_Type_Encoded','Location','New_Price','Price','Year'],axis=1,inplace=True)
    return df

def test_features(df):
    df['Age']=df['Year'].apply(calcAge)
    df['Kilometers_Driven']=norm(df['Kilometers_Driven'])
    df['Engine']=norm(df["Engine"])
    df['Power']=norm(df["Power"])
    df['Mileage']=norm(df["Mileage"])
    # df['Price_Drop']=df['New_Price']-df['Price']
    # df['Price']=norm(df["Price"])
    # df['Price_Drop']=norm(df["Price_Drop"])
    df['New_Price']=norm(df["New_Price"])
    df['Fuel_Type']=df['Fuel_Type_Encoded']
    df['Owner_Type']=df['Owner_Type_Encoded']
    df['Transmission']=df['Transmission_Encoded']
    df['Name']=df['Name'].str.extract('(\w+)')
    df['Name']=df['Name'].astype('category').cat.codes
    df.drop(['Transmission_Encoded','Owner_Type_Encoded','Fuel_Type_Encoded','Location','Year'],axis=1,inplace=True)
    return df

traindf=preprocess(pd.read_csv('./Datasets/train_data.csv'))
traindf=feature_engineering(traindf)
testdf=preprocess(pd.read_csv('./Datasets/test_data.csv'))
testdf=test_features(testdf)


# X = df.drop('target_variable', axis=1)
# y = df['target_variable']

In [8]:
traindf

Unnamed: 0,Name,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Age,Price_Drop
0,5,0.210280,2,1,0,0.542636,0.181818,0.090607,5.0,12,0.671489
1,21,0.163551,1,0,0,0.338700,0.889091,0.302534,8.0,7,0.667384
2,12,0.115383,2,1,0,0.642815,0.301364,0.127847,5.0,5,0.648543
3,15,0.509346,1,1,0,0.402504,0.762727,0.312926,7.0,9,0.758294
4,1,0.149449,1,0,0,0.676506,0.543636,0.349885,5.0,9,0.841128
...,...,...,...,...,...,...,...,...,...,...,...
476,5,0.146019,2,1,0,0.521765,0.181818,0.090607,5.0,6,0.657762
477,6,0.281893,2,1,0,0.563506,0.180909,0.073100,5.0,9,0.668730
478,5,0.317757,2,1,0,0.581395,0.181818,0.090760,5.0,9,0.664154
479,22,0.207907,1,0,0,0.641026,0.317727,0.141285,5.0,7,0.680035


In [9]:
testdf

Unnamed: 0,Name,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Age
0,1,0.380165,1,0,0,0.670244,0.288054,0.350384,5.0,0.675081,10
1,17,0.093689,2,1,1,0.425760,0.240607,0.217468,5.0,0.194647,7
2,4,0.023022,2,0,0,0.536673,0.168112,0.164450,5.0,0.140815,5
3,14,0.248205,1,0,0,0.584377,0.159441,0.141816,5.0,0.139599,6
4,9,0.370189,1,1,0,0.846750,0.108141,0.053708,5.0,0.069444,8
...,...,...,...,...,...,...,...,...,...,...,...
337,16,0.055490,2,1,0,0.523852,0.094894,0.091304,5.0,0.065795,7
338,18,0.220779,2,1,0,0.479726,0.192437,0.129156,5.0,0.107766,12
339,9,0.216104,1,1,0,0.724508,0.108141,0.090793,5.0,0.087693,7
340,4,0.195667,2,1,0,0.551580,0.096098,0.086445,5.0,0.054238,10
