# Analysis of Vehicles dataset (Beginner's Analysis)

1.2 Million Used Car Listings
1.2 Million listings scraped from TrueCar.com - Price, Mileage, Make, Model

link: https://www.kaggle.com/jpayne/852k-used-car-listings

In [None]:
# Print the path of the Python interpreter that is executing this code.
import sys
print(sys.executable)

## 1. First we import the necessary libraries

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

## 2. Reading and Exploring the Data

### Load Vehicles Data

In [None]:
# Load the listings CSV (path is relative to the current working directory)
# and print each column's dtype and non-null count.
vehicles = pd.read_csv("datasets/true_car_listings.csv")
vehicles.info()

In [None]:
# Display five randomly chosen rows of the raw data.
vehicles.sample(5)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the raw data.
vehicles.describe()

In [None]:
# (rows, columns) of the raw dataset.
vehicles.shape

## 3. Dataset for cars from 1970 onward with a price between 1k and 50k

### 3.1 Data Analysis

In [None]:
# Keep listings from model year 1970 onward, priced between 1,000 and 50,000
# and with a mileage of at most 300,000, restricted to the columns used below.
vehicle_top_price = vehicles.loc[
    (vehicles['Year'] >= 1970)
    & (vehicles['Price'] >= 1000)
    & (vehicles['Price'] <= 50000)
    & (vehicles['Mileage'] <= 300000),
    ['Price', 'Year', 'Mileage', 'City', 'State', 'Make', 'Model'],
]
vehicle_top_price.shape

In [None]:
# Display five randomly chosen rows of the filtered data.
vehicle_top_price.sample(5)

In [None]:
# Summary statistics of the filtered data.
vehicle_top_price.describe()

In [None]:
# Data type of each retained column.
vehicle_top_price.dtypes

### 3.2 Cleaning of data

In [None]:
# Per column: does it contain at least one missing value?
vehicle_top_price.isnull().any()

In [None]:
# Per column: number of missing values.
vehicle_top_price.isnull().sum()

In [None]:
# Drop every row that contains at least one missing value.
vehicle_top_price = vehicle_top_price.dropna()

In [None]:
# (rows, columns) after removing rows with missing values.
vehicle_top_price.shape

### 3.3 Understand the relation between Price and the number of classes

In [None]:
# Histogram of listing prices split into `num_class` equal-width bins.
# `num_class` is reused further down when prices are bucketed into class
# labels. The commented-out lines are alternative bin counts (7 and 12).
# vehicle_top_price['Price'].plot.hist(figsize=(15,5), bins=7, alpha=0.7, rwidth=0.5, grid=True)
num_class = 10
vehicle_top_price['Price'].plot.hist(figsize=(15,5), bins=num_class, alpha=0.7, rwidth=0.5, grid=True)
# vehicle_top_price['Price'].plot.hist(figsize=(15,5), bins=12, alpha=0.7, rwidth=0.5, grid=True)

### 3.4 EDA

In [None]:
import pandas_profiling as pp

In [None]:
# profile = pp.ProfileReport(vehicle_top_price, title='Pandas Profiling Report', explorative=True)
# profile.to_widgets()
# # profile.to_notebook_iframe()

### 3.3 Normalize the Data
Used Cars Price Prediction by 15 models
https://www.kaggle.com/vbmokin/used-cars-price-prediction-by-15-models

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Bucket prices into `num_class` equal-width, left-closed intervals labelled
# 0 .. num_class - 1 (approach from https://stackoverflow.com/a/53028100),
# then discard the raw price so only the class label remains as the target.
vehicle_top_price['class'] = pd.cut(
    vehicle_top_price['Price'],
    bins=num_class,
    labels=np.arange(num_class),
    right=False,
)
vehicle_top_price = vehicle_top_price.drop(columns=['Price'])

In [None]:
# Re-base the model year so the oldest car in the filtered data becomes 0.
car_year_min = vehicle_top_price['Year'].min()
vehicle_top_price['Year'] = vehicle_top_price['Year'].sub(car_year_min).astype(int)

In [None]:
# Express mileage as a multiple of the mean mileage of the filtered data.
car_mileage_mean = vehicle_top_price['Mileage'].mean()
vehicle_top_price['Mileage'] = vehicle_top_price['Mileage'].div(car_mileage_mean).astype('float64')

In [None]:
# # perform one hot encoding on multiple categorical columns: https://datascience.stackexchange.com/a/71805
# # https://stackoverflow.com/a/44601764
# vehicle_top_price = pd.get_dummies(vehicle_top_price, columns=['City', 'State', 'Make', 'Model'], drop_first=True)
# vehicle_top_price.columns

In [None]:
# Replace each categorical text column with integer codes. The codes are
# only identifiers; their numeric order carries no meaning.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
for col in ['City', 'State', 'Make', 'Model']:
    le = LabelEncoder()
    # Cast to str first so every value in the column is encoded as text;
    # fit_transform learns the labels and encodes them in a single pass.
    vehicle_top_price[col] = le.fit_transform(vehicle_top_price[col].astype(str))

In [None]:
# Display five randomly chosen rows after encoding.
vehicle_top_price.sample(5)

In [None]:
# Column dtypes and non-null counts after encoding.
vehicle_top_price.info()

In [None]:
# Pairwise correlation between the columns (Pearson by default).
vehicle_top_price.corr()

In [None]:
# Summary statistics after normalisation and encoding.
vehicle_top_price.describe()

In [None]:
# Number of rows in each price class (shows how balanced the target is).
vehicle_top_price.groupby('class').size()

### 3.4 EDA

In [None]:
# profile_n = pp.ProfileReport(vehicle_top_price, title='Pandas Profiling Report')
# profile_n.to_widgets()

### 3.5 Classification  

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split

from sklearn.metrics import accuracy_score, balanced_accuracy_score

# models
from sklearn.linear_model import LinearRegression, SGDRegressor, RidgeCV
from sklearn.svm import SVC

from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
def accuracy_model(clf, train_x, test_x, train_y, test_y):
    """Fit ``clf`` and print its accuracy on the training and test splits.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        train_x: Features used to fit the estimator.
        test_x: Features used for evaluation.
        train_y: Labels belonging to ``train_x``.
        test_y: Labels belonging to ``test_x``.

    Returns:
        Balanced accuracy on the test split, as a percentage rounded to two
        decimals.
    """
    clf.fit(train_x, train_y)

    print("\n### training performance")
    train_pred = clf.predict(train_x)
    train_balanced_acc = round(balanced_accuracy_score(train_y, train_pred) * 100, 2)
    # The value is a balanced accuracy, so label it the same way as the
    # matching test metric below instead of as plain accuracy.
    print('Balanced accuracy score for training =', train_balanced_acc)

    print("### Test performance")
    test_pred = clf.predict(test_x)
    test_acc = round(accuracy_score(test_y, test_pred) * 100, 2)
    print('Accuracy score for testing =', test_acc)
    test_balanced_acc = round(balanced_accuracy_score(test_y, test_pred) * 100, 2)
    print('Balanced accuracy score for testing =', test_balanced_acc)

    return test_balanced_acc

In [None]:
# 5-fold stratified cross-validation splitter. Shuffling with a fixed
# random_state keeps the fold assignment reproducible between runs.
# https://stackoverflow.com/a/45116022
k_fold_5 = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 2)

In [None]:
# Separate the label column from the feature columns.
target_name = 'class'
train_target = vehicle_top_price.loc[:, target_name]

vehicle_top_price = vehicle_top_price.drop(columns=[target_name])
vehicle_top_price.sample(5)

In [None]:
# Split first, then fit the scaler on the training portion only, so the
# held-out rows do not influence the mean/variance used for standardisation
# (fitting on the full dataset leaks test-set statistics into training).
# `vehicle_top_price` itself is left as the unscaled feature DataFrame.
train0, test0, train_target0, test_target0 = train_test_split(
    vehicle_top_price, train_target,
    stratify=train_target,
    test_size=0.2, random_state=0,
)
scaler = StandardScaler().fit(train0)
# transform() returns NumPy arrays, which the positional indexing in the
# cross-validation loop relies on.
train0 = scaler.transform(train0)
test0 = scaler.transform(test0)

In [None]:
# Count how many training rows fall into each class label.
uniqueValues, occurCount = np.unique(train_target0, return_counts=True)
uniqueValues, occurCount

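Class: [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11] (these counts list 12 classes, so they appear to come from a run with `num_class = 12` rather than the current value of 10) <br>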

    Dataset Dist:  [21160, 70933, 135175, 163852, 128996, 90037, 74626, 55841, 38631, 23406, 15236, 12166]
    Training Dist: [16928, 56746, 108140, 131081, 103197, 72029, 59701, 44673, 30905, 18725, 12189,  9733]

In [None]:
# Candidate classifiers, adapted from the scikit-learn classifier comparison:
# https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
# `names` and `classifiers` are parallel lists and must stay in the same order.
# The SVM and Gaussian-process entries are disabled in both lists
# (presumably too slow for a dataset of this size -- TODO confirm).

names = ["Nearest Neighbors", 
#          "Linear SVM", "RBF SVM", 
#          "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
#     SVC(kernel="linear", C=0.025),
#     SVC(gamma=2, C=1),
#     GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

# Evaluate every classifier with stratified 5-fold cross-validation on the
# training portion and report the mean balanced test accuracy per model.
print("# Number of Class:", num_class)
for name, clf in zip(names, classifiers):
    print("\n## Training for {} starting ****".format(name))
    accuracy_scores = []
    # StratifiedKFold needs the labels to build class-balanced folds, so the
    # target must be passed to split() together with the features.
    for train_index, test_index in k_fold_5.split(train0, train_target0):
        # split() yields positional indices: index the feature array directly
        # and the label Series through .iloc.
        train_x = train0[train_index]
        test_x = train0[test_index]

        train_y = train_target0.iloc[train_index]
        test_y = train_target0.iloc[test_index]

        accuracy_scores.append(accuracy_model(clf, train_x, test_x, train_y, test_y))
    print("\nAvg Accuracy Score:", round(np.mean(accuracy_scores), 3))

### 3.5.1 yellowbrick

In [None]:
from yellowbrick.classifier import ClassPredictionError

In [None]:
# Class prediction error chart for a 3-nearest-neighbours classifier.
# Instantiate the classification model and visualizer; the class labels are
# the integers 0 .. num_class - 1.
visualizer = ClassPredictionError(KNeighborsClassifier(3), classes=np.arange(num_class))

# Fit the training data to the visualizer
visualizer.fit(train0, train_target0)

# Evaluate the model on the held-out test data
visualizer.score(test0, test_target0)

# Draw visualization
visualizer.show()