In [None]:
import sys

# Put the parent directory on the import path so the `src` package imported
# below can be resolved (presumably the notebook lives one level below the
# project root -- confirm). The membership check keeps the cell idempotent:
# re-running it no longer piles duplicate entries onto sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [None]:
import numpy as np
import pandas as pd

from src.transformers import *
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    BaggingRegressor,
)
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


In [None]:
# Read the raw train and test splits into DataFrames (train first, test second).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/raw/train.csv", "../data/raw/test.csv")
)


## Outlier Detection

### Describing the numerical columns

In [None]:
# Summary statistics, narrowed to the columns examined for outliers below.
summary_cols = [
    'manufactured', 'curb_weight', 'power', 'engine_cap', 'depreciation',
    'coe', 'road_tax', 'dereg_value', 'mileage', 'omv', 'arf',
]
train.describe()[summary_cols]

### Using Box-Plot to identify Outliers

In [None]:
# One box plot per numeric feature of interest, laid out on a 3-column grid.

# Selecting by the "number" dtype alias avoids depending on `np` being in
# scope. The name is not used in this cell; it is kept in case later cells
# read it.
numeric_cols = train.select_dtypes(include="number").columns

# Columns to plot (the earlier, immediately-overwritten list was dead code).
cols = [
    'manufactured', 'curb_weight', 'power', 'engine_cap', 'depreciation',
    'coe', 'road_tax', 'dereg_value', 'mileage', 'omv', 'arf',
]

# Size the grid from the column list instead of hard-coding 5 rows, so adding
# or removing a column never overflows the grid or leaves an empty band.
n_grid_cols = 3
n_grid_rows = -(-len(cols) // n_grid_cols)  # ceiling division

# 4 inches of height per row matches the original 20-inch / 5-row panels.
plt.figure(figsize=(15, 4 * n_grid_rows))
for i, col in enumerate(cols):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.boxplot(data=train, y=col)

In [None]:
# Closer look at the spread of the `coe` column on its own.
sns.boxplot(y="coe", data=train)

In [None]:
# Listings whose `opc_scheme` equals the string '1100', with identifying columns only.
is_scheme_1100 = train["opc_scheme"] == '1100'
train.loc[is_scheme_1100, ["listing_id", "title", "opc_scheme", "price"]]

In [None]:
# Frequency of each `opc_scheme` value.
opc_scheme_values = train["opc_scheme"]
sns.histplot(data=opc_scheme_values)

### Observations

* Manufactured: The column contains values greater than 2021, which is not possible and looks like a data-entry error. An intuitive way to handle this is to replace such values with OrigRegYear, which should give a close approximation of the original value.

* Curb_weight: As shown in the table above and evident from the box plot, the column contains values below 500 kg. A car cannot weigh this little. To handle such cases, we will replace these values with the mode for cars of the same make and model.

* Mileage: Contains outliers, as shown in the box plot. There are cars with mileage above 500000, whereas the median is 35000 and 98% of the cars are below 200000. To handle such cases, we will replace these values with the mode for the same manufactured year.

* Engine_cap: There are cars with an engine capacity of 0. This isn't possible, so we'll replace such values with the value from cars of a similar make and model.

* Depreciation: For some cars depreciation is higher than the price itself. 

* COE: There are outliers in the data, and the distribution looks off as well (chart below). On the sgCarMart website, COE is expressed both as a dollar value and in years, so the column needs to be made consistent.


#### COE Dist plot

In [None]:
# `sns.distplot` is deprecated in seaborn. `histplot` with a KDE overlay on a
# density scale draws the same histogram-plus-curve view of `coe`.
sns.histplot(train['coe'], kde=True, stat="density", color="y")

### EDA

In [None]:
# Two side-by-side count plots: listings per make (frequent makes only) and
# listings per vehicle type, both ordered from least to most frequent.
plt.figure(figsize=(20, 4))

# Makes with more than 100 non-null `model` entries.
model_counts = train.groupby("make")["model"].count()
temp = model_counts[model_counts > 100].index.values
frequent_listings = train[train.make.isin(temp)]
make_order = frequent_listings.make.value_counts(ascending=True).index

plt.subplot(1, 2, 1)
sns.countplot(x="make", data=frequent_listings, order=make_order)
plt.xticks(rotation=90)
plt.title("Frequency of cars by brand")

type_order = train.type_of_vehicle.value_counts(ascending=True).index

plt.subplot(1, 2, 2)
sns.countplot(x="type_of_vehicle", data=train, order=type_order)
plt.xticks(rotation=90)
plt.title("Frequency of cars by type_of_vehicle")

In [None]:
# Mean price per vehicle type (left) and per make (right), sorted ascending.
plt.figure(figsize=(20,4))

# numeric_only=True is required on pandas >= 2.0, where averaging a frame
# that still holds text columns raises TypeError instead of silently
# dropping them. It reproduces the older default behaviour.
df = train.groupby(['type_of_vehicle']).mean(numeric_only=True).reset_index()
plt.subplot(1,2,1)
sns.barplot(data=df.sort_values(by=['price']), x='type_of_vehicle', y='price')
plt.xticks(rotation=90)
plt.title("Mean price values by type_of_vehicle")

df = train.groupby(['make']).mean(numeric_only=True).reset_index()
# Keep the chart readable: only makes whose mean price exceeds 100000.
df = df[df['price'] > 100000]
plt.subplot(1,2,2)
sns.barplot(data=df.sort_values(by=['price']), x='make', y='price')
plt.title("Mean price values by brand")
plt.xticks(rotation=90)

### Log Transformation of Mileage and COE Feature

In [None]:
# Mileage distribution before and after a log transform.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` draws the
# equivalent histogram with a density curve.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(15, 4))

sns.histplot(train['mileage'], kde=True, stat="density", color="y", ax=ax_raw)
ax_raw.set_title("Distribution without transformation")

# log is undefined for zero/negative values (np.log would yield -inf/NaN and
# break the binning), so only strictly positive mileages are transformed.
# Missing values are excluded by the same comparison.
positive_mileage = train.loc[train.mileage > 0, "mileage"]
sns.histplot(np.log(positive_mileage), kde=True, stat="density", color="y", ax=ax_log)
ax_log.set_xlabel("log(mileage)")
ax_log.set_title("Distribution with Log transformation")

In [None]:
# COE distribution before and after a log transform.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` draws the
# equivalent histogram with a density curve.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(15, 4))

sns.histplot(train['coe'], kde=True, stat="density", color="y", ax=ax_raw)
ax_raw.set_title("Distribution without transformation")

# log is undefined for zero/negative values (np.log would yield -inf/NaN and
# break the binning), so only strictly positive COE values are transformed.
# Missing values are excluded by the same comparison.
positive_coe = train.loc[train.coe > 0, "coe"]
sns.histplot(np.log(positive_coe), kde=True, stat="density", color="y", ax=ax_log)
ax_log.set_xlabel("log(coe)")
ax_log.set_title("Distribution with Log transformation")

### Observation:
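The COE values fall into two separate clusters, one at the low end of the range and the other at the high end. This points to the values being recorded in different formats, and therefore to outliers.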

### Bivariate Analysis

In [None]:
# Pairwise correlation heatmap of the numeric features.
plt.figure(figsize=(12,10))

# Restrict to numeric columns explicitly: on pandas >= 2.0 `.corr()` no longer
# drops text columns silently and raises on them instead.
cor = (
    train.drop(columns=['listing_id', 'indicative_price'])
    .select_dtypes(include="number")
    .corr()
)
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Horizontal box plots: price spread for each vehicle type.
sns.boxplot(x='price', y='type_of_vehicle', data=train)

In [None]:
# Price against engine power (left) and against mileage (right).
fig, (ax_power, ax_mileage) = plt.subplots(1, 2, figsize=(20, 8))

# x/y must be passed by keyword: seaborn >= 0.12 rejects positional data
# vectors. A single colour is given as a plain string rather than a
# one-element list.
sns.scatterplot(data=train, x="power", y="price", color="c", ax=ax_power)
sns.scatterplot(data=train, x="mileage", y="price", color="c", ax=ax_mileage)

* Higher power generally goes with a higher price.
* Higher mileage generally goes with a lower price.

### Price feature 

In [None]:
# Distribution of the target (`price`): histogram with density curve on the
# left, box plot on the right.
fig, (ax_dist, ax_box) = plt.subplots(1, 2, figsize=(20, 8))

# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` draws the
# equivalent histogram with a density curve.
ax_dist.set_title(' Price Distribution Plot')
sns.histplot(train.price, kde=True, stat="density", ax=ax_dist)

ax_box.set_title(' Price')
sns.boxplot(y=train.price, ax=ax_box)

plt.show()

* There is a significant difference between the mean and the median of the price distribution.
* 90% of the prices are below 200K, whereas the remaining 10% are between 200K and 3000K.
* The distribution is positively skewed with the tail on the right side.