## Set up the Notebook

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [6]:
%matplotlib inline

## Load Data

In [7]:
car_resale_dataset = pd.read_csv("./data/train.csv")
car_resale_test = pd.read_csv("./data/test.csv")
car_resale_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16784 entries, 0 to 16783
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   listing_id         16784 non-null  int64  
 1   title              16784 non-null  object 
 2   make               14624 non-null  object 
 3   model              16784 non-null  object 
 4   description        16439 non-null  object 
 5   manufactured       16590 non-null  float64
 6   original_reg_date  318 non-null    object 
 7   reg_date           16583 non-null  object 
 8   type_of_vehicle    16784 non-null  object 
 9   category           16784 non-null  object 
 10  transmission       16784 non-null  object 
 11  curb_weight        16205 non-null  float64
 12  power              14447 non-null  float64
 13  fuel_type          3490 non-null   object 
 14  engine_cap         16731 non-null  float64
 15  no_of_owners       16608 non-null  float64
 16  depreciation       163

## Select Features

In [8]:
train = car_resale_dataset[['make', 'manufactured', 'mileage', 'reg_date', 'no_of_owners', 'omv', 'price']]

In [9]:
train.head()

Unnamed: 0,make,manufactured,mileage,reg_date,no_of_owners,omv,price
0,bmw,2013.0,73000.0,09-dec-2013,1.0,45330.0,71300.0
1,,2014.0,110112.0,26-jan-2015,3.0,27502.0,43800.0
2,mercedes-benz,2016.0,80000.0,25-jul-2016,1.0,27886.0,95500.0
3,mercedes-benz,2019.0,9800.0,17-nov-2020,1.0,46412.0,197900.0
4,,2019.0,40000.0,20-sep-2019,1.0,20072.0,103200.0


In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16784 entries, 0 to 16783
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   make          14624 non-null  object 
 1   manufactured  16590 non-null  float64
 2   mileage       13060 non-null  float64
 3   reg_date      16583 non-null  object 
 4   no_of_owners  16608 non-null  float64
 5   omv           16743 non-null  float64
 6   price         16784 non-null  float64
dtypes: float64(5), object(2)
memory usage: 918.0+ KB


## Drop N.A. Data

In [11]:
trainc = train.dropna()

In [12]:
trainc.head()
#trainc.info()

Unnamed: 0,make,manufactured,mileage,reg_date,no_of_owners,omv,price
0,bmw,2013.0,73000.0,09-dec-2013,1.0,45330.0,71300.0
2,mercedes-benz,2016.0,80000.0,25-jul-2016,1.0,27886.0,95500.0
3,mercedes-benz,2019.0,9800.0,17-nov-2020,1.0,46412.0,197900.0
6,volvo,2015.0,77777.0,11-dec-2015,3.0,22809.0,62500.0
7,mercedes-benz,2018.0,31787.0,23-jun-2021,1.0,53386.0,367300.0


## Preprocess the Data

In [13]:
trainc.reg_date = pd.to_datetime(trainc.reg_date)
trainc.reg_date = trainc.reg_date.apply(lambda x: float(x.year))
#print(trainc.reg_date.mean(), trainc.reg_date.std())
#trainc.head(70)

date_features = ['reg_date']
trainc[date_features] = trainc[date_features].apply(lambda x: (x - x.mean()) / (x.std()))
trainc.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,make,manufactured,mileage,reg_date,no_of_owners,omv,price
0,bmw,2013.0,73000.0,-0.561878,1.0,45330.0,71300.0
2,mercedes-benz,2016.0,80000.0,0.174285,1.0,27886.0,95500.0
3,mercedes-benz,2019.0,9800.0,1.155835,1.0,46412.0,197900.0
6,volvo,2015.0,77777.0,-0.071103,3.0,22809.0,62500.0
7,mercedes-benz,2018.0,31787.0,1.401222,1.0,53386.0,367300.0


In [14]:
t = trainc.copy()

In [15]:
t[['manufactured', 'mileage', 'no_of_owners']] = t[['manufactured', 'mileage', 'no_of_owners']].apply(lambda x: (x - x.mean()) / (x.std()))

In [16]:
t.head()

Unnamed: 0,make,manufactured,mileage,reg_date,no_of_owners,omv,price
0,bmw,-0.200826,0.008343,-0.561878,-0.717104,45330.0,71300.0
2,mercedes-benz,0.114359,0.147834,0.174285,-0.717104,27886.0,95500.0
3,mercedes-benz,0.429545,-1.251065,1.155835,-0.717104,46412.0,197900.0
6,volvo,0.009298,0.103536,-0.071103,0.86471,22809.0,62500.0
7,mercedes-benz,0.324483,-0.812923,1.401222,-0.717104,53386.0,367300.0


In [17]:
t[['result']] = (t.omv-t.price)/t.omv

In [18]:
t=t.drop(['price', 'omv'], axis=1)

In [19]:
t.head()

Unnamed: 0,make,manufactured,mileage,reg_date,no_of_owners,result
0,bmw,-0.200826,0.008343,-0.561878,-0.717104,-0.57291
2,mercedes-benz,0.114359,0.147834,0.174285,-0.717104,-2.424658
3,mercedes-benz,0.429545,-1.251065,1.155835,-0.717104,-3.263983
6,volvo,0.009298,0.103536,-0.071103,0.86471,-1.740146
7,mercedes-benz,0.324483,-0.812923,1.401222,-0.717104,-5.880081


In [21]:
t.make = t.make.str.lower()

In [22]:
f=pd.get_dummies(t)

In [23]:
f.head()

Unnamed: 0,manufactured,mileage,reg_date,no_of_owners,result,make_alfa romeo,make_alpine,make_aston martin,make_audi,make_austin,...,make_seat,make_skoda,make_ssangyong,make_subaru,make_suzuki,make_tesla,make_toyota,make_ud,make_volkswagen,make_volvo
0,-0.200826,0.008343,-0.561878,-0.717104,-0.57291,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.114359,0.147834,0.174285,-0.717104,-2.424658,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.429545,-1.251065,1.155835,-0.717104,-3.263983,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0.009298,0.103536,-0.071103,0.86471,-1.740146,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,0.324483,-0.812923,1.401222,-0.717104,-5.880081,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
f[['pred']] = f[['result']] 

In [25]:
f=f.drop(['result'], axis=1)

In [26]:
f.head()

Unnamed: 0,manufactured,mileage,reg_date,no_of_owners,make_alfa romeo,make_alpine,make_aston martin,make_audi,make_austin,make_bentley,...,make_skoda,make_ssangyong,make_subaru,make_suzuki,make_tesla,make_toyota,make_ud,make_volkswagen,make_volvo,pred
0,-0.200826,0.008343,-0.561878,-0.717104,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-0.57291
2,0.114359,0.147834,0.174285,-0.717104,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-2.424658
3,0.429545,-1.251065,1.155835,-0.717104,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-3.263983
6,0.009298,0.103536,-0.071103,0.86471,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-1.740146
7,0.324483,-0.812923,1.401222,-0.717104,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-5.880081


## Export to CSV File

In [27]:
f.to_csv("./data/preprocessed_data_task3.csv",index=None)