# Analysis of Vehicles dataset (Beginner's Analysis)

1.2 Million Used Car Listings
1.2 Million listings scraped from TrueCar.com - Price, Mileage, Make, Model

link: https://www.kaggle.com/jpayne/852k-used-car-listings

In [None]:
import sys
# Show the path of the Python interpreter this notebook kernel is running.
print(sys.executable)

## 1. First we import the necessary libraries

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

## 2. Reading and Exploring the Data

### Load Vehicles Data

In [None]:
# Load the listings CSV; the path is relative to the current working directory.
vehicles = pd.read_csv("datasets/true_car_listings.csv")
# Print column names, dtypes, and non-null counts.
vehicles.info()

In [None]:
# Peek at five randomly chosen rows of the raw data.
vehicles.sample(5)

In [None]:
# Summary statistics of the raw data.
vehicles.describe()

In [None]:
# (rows, columns) of the raw data.
vehicles.shape

In [None]:
# Names of every column of the raw frame except the target, 'Price'.
columns = vehicles.columns.tolist()
columns.remove('Price')
columns

## 3. Dataset restricted to cars from 1970 onward, priced between 1k and 50k, with mileage up to 300k

### 3.1 Data Analysis

In [None]:
# Keep only the listings that satisfy all of:
#   - Year of 1970 or later
#   - Price between 1,000 and 50,000 (both inclusive)
#   - Mileage of at most 300,000
year_ok = vehicles.Year >= 1970
price_ok = (vehicles.Price >= 1000) & (vehicles.Price <= 50000)
mileage_ok = vehicles.Mileage <= 300000

vehicle_top_price = vehicles.loc[year_ok & price_ok & mileage_ok]
vehicle_top_price.shape

In [None]:
# Peek at five randomly chosen rows of the filtered data.
vehicle_top_price.sample(5)

In [None]:
# Summary statistics of the filtered data.
vehicle_top_price.describe()

In [None]:
# dtype of each column; object-dtype columns are the ones cleaned as text below.
vehicle_top_price.dtypes

### 3.2 Cleaning of data

In [None]:
# https://stackoverflow.com/a/45355563/2049763
# Trim leading/trailing whitespace in every object-dtype column; columns of
# any other dtype are passed through unchanged.
vehicle_top_price = vehicle_top_price.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [None]:
# Lower-case the free-text columns so differently capitalised spellings of the
# same value compare equal. The vectorised `.str.lower()` keeps missing values
# as NaN instead of raising AttributeError on them (rows with nulls are only
# dropped in a later cell).
for col in ['City', 'Make', 'Model']:
    vehicle_top_price[col] = vehicle_top_price[col].str.lower()

In [None]:
from collections import Counter

In [None]:
# Boolean mask of the rows whose State is "CO", tallied as True/False counts.
selector = vehicle_top_price['State'].eq("CO")
Counter(selector)

In [None]:
# Distinct State values in ascending order, shown together with their count.
state_list = sorted(vehicle_top_price['State'].unique().tolist())
len(state_list), ", ".join(state_list)

In [None]:
# Normalise state codes to upper case. This covers the mixed-case variants
# present in the data ('Oh', 'Va', 'Md', 'Ca', 'Ga', 'ga', 'Fl', 'Az') as well
# as any other casing variant, without one hand-written line per value.
# Missing values stay NaN.
vehicle_top_price['State'] = vehicle_top_price['State'].str.upper()

#### 3.2.2 Drop Null

In [None]:
# Per column: does it contain at least one missing value?
vehicle_top_price.isnull().any()

In [None]:
# Per column: number of missing values.
vehicle_top_price.isnull().sum()

In [None]:
# Drop every row that has a missing value in any column.
vehicle_top_price = vehicle_top_price.dropna()

In [None]:
# (rows, columns) remaining after the null rows were dropped.
vehicle_top_price.shape

In [None]:
# Peek at five randomly chosen rows of the cleaned data.
vehicle_top_price.sample(5)

### 3.4 EDA (cleaned data, before normalization)

In [None]:
import pandas_profiling as pp

In [None]:
# Build a profiling report of the cleaned data and render it as notebook widgets.
profile = pp.ProfileReport(vehicle_top_price, title='Pandas Profiling Report', explorative=True)
profile.to_widgets()
# Alternative rendering, embedded as an iframe:
# profile.to_notebook_iframe()

### 3.3 Normalize the Data
Used Cars Price Prediction by 15 models
https://www.kaggle.com/vbmokin/used-cars-price-prediction-by-15-models

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Express each price as a multiple of the mean price. The mean is kept in
# `car_price_mean` so that normalised errors can be converted back later.
car_price_mean = vehicle_top_price['Price'].mean()
vehicle_top_price['Price'] = vehicle_top_price['Price'].div(car_price_mean).astype('float64')

In [None]:
# Shift the years so that the smallest Year in the data becomes 0.
car_year_min = vehicle_top_price['Year'].min()
vehicle_top_price['Year'] = vehicle_top_price['Year'].sub(car_year_min).astype(int)

In [None]:
# Express each mileage as a multiple of the mean mileage.
car_mileage_mean = vehicle_top_price['Mileage'].mean()
vehicle_top_price['Mileage'] = vehicle_top_price['Mileage'].div(car_mileage_mean).astype('float64')

In [None]:
# # perform one hot encoding on multiple categorical columns: https://datascience.stackexchange.com/a/71805
# # https://stackoverflow.com/a/44601764
# vehicle_top_price = pd.get_dummies(vehicle_top_price, columns=['City', 'State', 'Make', 'Model'], drop_first=True)
# vehicle_top_price.columns

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

# Replace each categorical column with integer codes. The fitted encoder of
# every column is kept in `label_encoder` so the codes can be mapped back to
# the original labels (e.g. with `inverse_transform`).
label_encoder = {}
for col in ['City', 'State', 'Make', 'Model']:
    encoder = LabelEncoder()
    # fit_transform learns the classes and encodes the column in one call.
    vehicle_top_price[col] = encoder.fit_transform(vehicle_top_price[col].astype(str))
    label_encoder[col] = encoder

In [None]:
# Peek at five randomly chosen rows after normalisation and label encoding.
vehicle_top_price.sample(5)

In [None]:
# Column names, dtypes, and non-null counts after encoding.
vehicle_top_price.info()

In [None]:
# Pairwise correlation between columns. Note that City, State, Make and Model
# are label-encoded integer codes at this point.
vehicle_top_price.corr()

In [None]:
# Summary statistics after normalisation and label encoding.
vehicle_top_price.describe()

### 3.4 EDA (after normalization and label encoding)

In [None]:
import seaborn as sns
import statsmodels.graphics.api as smg

In [None]:
# Apply seaborn's default theme to subsequent plots.
# https://seaborn.pydata.org/tutorial/aesthetics.html
sns.set_theme()

# Grid of pairwise plots over the columns of the frame.
# https://seaborn.pydata.org/generated/seaborn.pairplot.html
# sns.pairplot(vehicle_top_price, hue="Price", diag_kind="hist")
g = sns.pairplot(vehicle_top_price)
# Resize the whole figure to 35 x 35 inches.
g.fig.set_size_inches(35,35)

In [None]:
# Profiling report of the normalised, label-encoded data, rendered as widgets.
profile_n = pp.ProfileReport(vehicle_top_price, title='Pandas Profiling Report')
profile_n.to_widgets()

### 3.5 Regression 

In [None]:
from sklearn.model_selection import KFold, train_test_split

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# models
from sklearn.linear_model import LinearRegression, SGDRegressor, RidgeCV
from sklearn.svm import SVR, LinearSVR

from sklearn.neural_network import MLPRegressor

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor 
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, VotingRegressor 

from sklearn.tree import DecisionTreeRegressor

In [None]:
def acc_d(y_meas, pred_y):
    """Return the relative error between measured and predicted values.

    Computed as ``mean_absolute_error * n / sum(|y_meas|)``, which equals
    ``sum(|y_meas - pred_y|) / sum(|y_meas|)``.

    Args:
        y_meas: Measured (true) target values.
        pred_y: Predicted target values, same length as ``y_meas``.

    Returns:
        The relative error as a fraction (multiply by 100 for a percentage).
    """
    # Vectorised sum: the built-in sum()/abs() pair iterates element by
    # element in Python and does not work on plain lists.
    total_magnitude = np.abs(np.asarray(y_meas)).sum()
    return mean_absolute_error(y_meas, pred_y) * len(y_meas) / total_magnitude

def acc_rmse(y_meas, pred_y):
    """Return the root-mean-square error between measured and predicted values."""
    mse = mean_squared_error(y_meas, pred_y)
    return mse ** 0.5

In [None]:
def accuracy_model(kf5, Model, feature, target):
    """Cross-validate ``Model`` and print per-fold and average scores.

    For every split produced by ``kf5`` a fresh ``Model()`` is fitted on the
    training rows. R2, relative error (``acc_d``) and RMSE (``acc_rmse``) are
    then printed for both the training rows and the held-out rows. Every score
    is multiplied by 100 and rounded to two decimals before it is printed.

    Args:
        kf5: Splitter whose ``split(feature)`` yields
            ``(train_index, test_index)`` pairs.
        Model: Estimator class; instantiated with no arguments for each fold.
        feature: Feature table, indexed positionally with ``.iloc``.
        target: Target values aligned with ``feature``, indexed with ``.iloc``.

    Returns:
        None. Results are printed. The dollar figure on the last line is
        derived from the module-level ``car_price_mean``.
    """
    fold_r2, fold_rmse = [], []
    for fit_rows, holdout_rows in kf5.split(feature):
        x_fit = np.array(feature.iloc[fit_rows])
        x_holdout = np.array(feature.iloc[holdout_rows])
        y_fit = target.iloc[fit_rows]
        y_holdout = target.iloc[holdout_rows]

        estimator = Model().fit(x_fit, y_fit)
        fit_pred = estimator.predict(x_fit)
        holdout_pred = estimator.predict(x_holdout)

        # Scores on the rows the estimator was fitted on.
        print("\n# training performance")
        print('acc(r2_score) for training =',
              round(r2_score(y_fit, fit_pred) * 100, 2))
        print('acc(relative error) for training =',
              round(acc_d(y_fit, fit_pred) * 100, 2))
        print('acc(rmse) for training =',
              round(acc_rmse(y_fit, fit_pred) * 100, 2))

        # Scores on the held-out rows; only R2 and RMSE are averaged later.
        print("# Test performance")
        r2_pct = round(r2_score(y_holdout, holdout_pred) * 100, 2)
        print('acc(r2_score) for testing =', r2_pct)
        fold_r2.append(r2_pct)

        print('acc(relative error) for testing =',
              round(acc_d(y_holdout, holdout_pred) * 100, 2))

        rmse_pct = round(acc_rmse(y_holdout, holdout_pred) * 100, 2)
        print('acc(rmse) for testing =', rmse_pct)
        fold_rmse.append(rmse_pct)

    print("\nAvg R2 Score:", round(np.mean(fold_r2), 3))

    # rmse values are stored x100, hence the division by 100 when converting
    # the normalised error back through the mean price.
    mean_rmse = np.mean(fold_rmse)
    print("Avg RMSE (normalized): {} & in $ value: {}".format(
        round(mean_rmse, 3), round(mean_rmse * car_price_mean / 100, 3)))

In [None]:
# 5-fold splitter; rows are shuffled before splitting, with a fixed seed so
# the folds are the same on every run.
# https://stackoverflow.com/a/45116022
k_fold_5 = KFold(n_splits = 5, shuffle = True, random_state = 2)

In [None]:
# Separate the target column from the features. After this cell
# `vehicle_top_price` holds the feature columns only.
target_name = 'Price'
train_target = vehicle_top_price[target_name]

vehicle_top_price = vehicle_top_price.drop(columns=[target_name])
vehicle_top_price.sample(5)

In [None]:
# Hold out 20% of the rows as a final test set; the seed fixes the split.
train0, test0, train_target0, test_target0 = train_test_split(
    vehicle_top_price,
    train_target,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Cross-validate each candidate regressor class on the training split.
# Every class is instantiated without arguments inside accuracy_model.
candidate_models = [
    LinearRegression,
    LinearSVR,
    SGDRegressor,
    DecisionTreeRegressor,
    RandomForestRegressor,
    BaggingRegressor,
    ExtraTreesRegressor,
    AdaBoostRegressor,
]
for Model in candidate_models:
    print("\n# Training for {} starting ****".format(Model))
    accuracy_model(k_fold_5, Model, train0, train_target0)

In [None]:
# mlp = MLPRegressor()
# param_grid = {'hidden_layer_sizes': [i for i in range(2,20)],
#               'activation': ['relu'],
#               'solver': ['adam'],
#               'learning_rate': ['constant'],
#               'learning_rate_init': [0.01],
#               'power_t': [0.5],
#               'alpha': [0.0001],
#               'max_iter': [1000],
#               'early_stopping': [True],
#               'warm_start': [False]}
# mlp_GS = GridSearchCV(mlp, param_grid=param_grid, 
#                    cv=10, verbose=True, pre_dispatch='2*n_jobs')
# accuracy_model(k_fold_5, mlp_GS, vehicle_top_price, train_target)

In [None]:
# Voting_Reg = VotingRegressor(estimators=[('lin', linreg), ('ridge', ridge), ('sgd', sgd)])

In [None]:
# Fit a random forest with default arguments on the training split and
# predict the held-out test split.
random_forest_regressor = RandomForestRegressor().fit(train0, train_target0)
pred_y = random_forest_regressor.predict(test0)

# NOTE: despite the `acc_train_*` names, the values below are test-set metrics.
# Each is multiplied by 100 and rounded to two decimals.
print("# Test performance")
acc_train_r2_num = round(r2_score(test_target0, pred_y) * 100, 2)
print('acc(r2_score) for testing =', acc_train_r2_num)  

acc_train_d_num = round(acc_d(test_target0, pred_y) * 100, 2)
print('acc(relative error) for testing =', acc_train_d_num)  

# RMSE is printed both normalised (x100) and converted back through the mean
# price (`car_price_mean`), undoing the x100 with the division by 100.
acc_train_rmse_num = round(acc_rmse(test_target0, pred_y) * 100, 2)
print('acc(rmse) for testing =', acc_train_rmse_num, "$", round(acc_train_rmse_num * car_price_mean / 100, 3)) 

### 3.6 Save and reload the model with pickle

In [None]:
import pickle

In [None]:
# Persist the fitted model. The context manager closes the file handle even
# if pickling fails, so the file is always flushed and released.
with open('r_model_random_forest_regressor.pkl', 'wb') as model_file:
    pickle.dump(random_forest_regressor, model_file)

In [None]:
# Reload the saved model. The path must be the same file name that the model
# was dumped to ('r_model_random_forest_regressor.pkl').
# NOTE: only unpickle files you created yourself; pickle can execute
# arbitrary code while loading.
with open('r_model_random_forest_regressor.pkl', 'rb') as model_file:
    model = pickle.load(model_file)