In [49]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch


# Seed PyTorch's random number generator
RANDOM_SEED = 1234
torch.manual_seed(RANDOM_SEED)

# Read the dataset from the CSV file in the data folder and preview its first rows
CSV_PATH = "data/cardetailsv3.csv"
data = pd.read_csv(CSV_PATH)
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [50]:
# Analyze Data: structure, summary statistics, column labels and missing values.
# A notebook cell only renders the value of its final expression, so the
# intermediate results are printed explicitly instead of being discarded.
data.info()  # writes its report straight to the cell output
print(data.describe())
print(data.columns)
data.isnull().sum()  # last expression -> rendered as the cell result


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
torque           222
seats            221
dtype: int64

In [51]:
# List every distinct value found in the name column
name_column = data['name']
name_column.unique()

array(['Maruti Swift Dzire VDI', 'Skoda Rapid 1.5 TDI Ambition',
       'Honda City 2017-2020 EXi', ..., 'Tata Nexon 1.5 Revotorq XT',
       'Ford Freestyle Titanium Plus Diesel BSIV',
       'Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV'], dtype=object)

In [52]:
def convert_col_to_numerical(df, col):
    """Label-encode one column of ``df`` as integer codes, in place.

    Every distinct value of ``df[col]`` receives an integer code in order of
    first appearance (first distinct value -> 0, the next -> 1, ...), and the
    column is overwritten with those codes.

    Args:
        df: DataFrame that holds the column; it is modified in place.
        col: Label of the column to encode.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Series.unique() keeps order of appearance, so enumerate() reproduces
    # the "first seen -> smallest code" numbering.
    codes = {value: code for code, value in enumerate(df[col].unique())}
    # Use an element-wise lookup plus an explicit cast rather than
    # Series.replace(dict): replace() only yields an integer column through
    # pandas' silent object downcasting, which is deprecated.
    df[col] = df[col].map(codes).astype("int64")
    return df

In [53]:
# Remove the torque column
data = data.drop(columns='torque')

# Shorten every name to its first two space-separated words, joined by '_'
first_two_words = data['name'].str.split(' ').str[:2]
data['name'] = first_two_words.str.join(' ').str.replace(' ', '_')

# Replace the values of each categorical column with integer codes
for categorical_col in ('name', 'fuel', 'seller_type', 'transmission', 'owner'):
    data = convert_col_to_numerical(data, categorical_col)

# For each measurement column keep only the text before the first space
# and parse that leading token as a number
for measurement_col in ('mileage', 'engine', 'max_power'):
    leading_token = data[measurement_col].str.split(' ').str[0]
    data[measurement_col] = pd.to_numeric(leading_token)


In [54]:
# Preview the first five rows of data
data.head(n=5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,0,2014,450000,145500,0,0,0,0,23.4,1248.0,74.0,5.0
1,1,2014,370000,120000,0,0,0,1,21.14,1498.0,103.52,5.0
2,2,2006,158000,140000,1,0,0,2,17.7,1497.0,78.0,5.0
3,3,2010,225000,127000,0,0,0,0,23.0,1396.0,90.0,5.0
4,0,2007,130000,120000,1,0,0,0,16.1,1298.0,88.2,5.0


In [55]:
# What are the datatypes? Show the dtype of every column in data.
data.dtypes


name               int64
year               int64
selling_price      int64
km_driven          int64
fuel               int64
seller_type        int64
transmission       int64
owner              int64
mileage          float64
engine           float64
max_power        float64
seats            float64
dtype: object

In [56]:
# Cast every column of the dataset to 64-bit floating point
data = data.astype('float64')

In [57]:
# Analyse the NaN values: count the missing entries per column.
# This is not the last expression of the cell, so it must be printed
# explicitly; otherwise the counts are computed and thrown away.
print(data.isnull().sum())

# Drop every row that contains at least one NaN value
data = data.dropna()