# Analysis of Vehicles dataset (Beginner's Analysis)

1.2 Million Used Car Listings
1.2 Million listings scraped from TrueCar.com - Price, Mileage, Make, Model

link: https://www.kaggle.com/jpayne/852k-used-car-listings

In [None]:
import sys
# Print the path of the Python interpreter that is executing this notebook.
print(sys.executable)

## 1. First we import the necessary libraries

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 

## 2. Reading and Exploring the Data

### Load Vehicles Data

In [None]:
# Load the listings CSV; the path is relative to the current working directory.
vehicles = pd.read_csv("datasets/true_car_listings.csv")
# Print a per-column summary: dtypes, non-null counts and memory usage.
vehicles.info()

In [None]:
vehicles.sample(5)

In [None]:
vehicles.describe()

In [None]:
vehicles.shape

## 3. Cleaning of data

In [None]:
vehicles.isnull().any()

In [None]:
vehicles.isnull().sum()

In [None]:
# # lets drop null rows
# vehicles=vehicles.dropna()

## 4. Data Analysis

In [None]:
# Keep listings from model year 1970 onwards whose price lies in the
# inclusive range [1000, 50000], ordered from most to least expensive.
vehicle_top_price = (
    vehicles
    .loc[vehicles["Year"].ge(1970) & vehicles["Price"].between(1000, 50000)]
    .sort_values("Price", ascending=False)
)
# Show the five most expensive listings that survived the filter.
vehicle_top_price.head()

In [None]:
vehicle_top_price.shape

In [None]:
vehicle_top_price.describe()

## 5. Data Visualization
https://clrife.github.io/

In [None]:
import seaborn as sns

In [None]:
sns.pairplot(vehicle_top_price)

In [None]:
# https://stackoverflow.com/a/31047434
vehicles['Price'].plot.hist(figsize=(15,5), bins=100, alpha=0.7, rwidth=0.5, grid=True)

In [None]:
vehicle_top_price['Price'].plot.hist(figsize=(15,5), bins=51, alpha=0.7, rwidth=0.5, grid=True)

In [None]:
vehicle_top_price['Year'].plot.hist(figsize=(15,5), bins=22, alpha=0.9, rwidth=0.5, grid=True)

In [None]:
# Bar chart of the number of non-null Price values per Year.
# groupby/plot pattern adapted from https://stackoverflow.com/a/48238312
vehicle_top_price.groupby('Year').Price.count().plot.bar(figsize=(15,5), grid=True)

In [None]:
# Group the filtered listings by model year. The column is named 'Year'
# (capitalised, as in the other cells); lowercase 'year' raised a KeyError.
df = vehicle_top_price.groupby('Year')

In [None]:
# Horizontal box plots (vert=False) of Price, one box per Year.
# Background on reading box plots:
# https://towardsdatascience.com/understanding-boxplots-5e2df7bcbd51
#
# Earlier groupby-based attempt, kept for reference only. It uses lowercase
# column names ('year', 'price') that do not match the columns used below.
# df = pd.DataFrame(vehicle_top_price, columns=['year', 'price'], index=vehicle_top_price.index)
# grouped = df.groupby('year')
# # grouped.first()
# grouped.boxplot(vert=False, column='price', figsize=(15,15), grid=True, subplots=False)

vehicle_top_price.boxplot(column = 'Price', by = 'Year', vert=False, figsize=(15,15), grid=True)

In [None]:
# vehicle_top_price.boxplot(column = 'price', by = ['year', 'manufacturer'], vert=False, figsize=(15,50), grid=True)

## 6.1 Cleaning of data

In [None]:
# Rebuild the working frame with only the columns used for modelling.
# Rows and columns are selected in a single .loc call (instead of two chained
# ones) and the result is copied explicitly, so later column assignments act
# on an independent frame rather than on a slice of `vehicles`.
vehicle_top_price = vehicles.loc[
    (vehicles.Year >= 1970) & (vehicles.Price >= 1000) & (vehicles.Price <= 50000),
    ['Price', 'Mileage', 'Year'],
].copy()

In [None]:
vehicle_top_price.sample(5)

In [None]:
vehicle_top_price.isnull().any()

In [None]:
vehicle_top_price.isnull().sum()

In [None]:
vehicle_top_price=vehicle_top_price.dropna()

In [None]:
vehicle_top_price.shape

In [None]:
# Rescale Price by its own mean, so values are expressed relative to the
# average price (1.0 == average).
vehicle_top_price['Price'] = vehicle_top_price['Price'].div(vehicle_top_price['Price'].mean())

In [None]:
vehicle_top_price.sample(5)

## 6.2 Regression 

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

In [None]:
# 5-fold splitter. Rows are shuffled before splitting, and the fixed
# random_state makes the fold assignment repeatable between runs.
# Parameter choice follows https://stackoverflow.com/a/45116022
kf5 = KFold(n_splits=5, shuffle=True, random_state=2)

In [None]:
# Cross-validate a linear model Price ~ Mileage + Year over the kf5 folds.
# Each fold prints the training score with the fitted coefficients and
# intercept, then the held-out score and RMSE; the fold averages follow.
scores, RMSE = [], []

# Extract the predictors and the target once; every fold only indexes into them.
features = vehicle_top_price[['Mileage', 'Year']].to_numpy()
target = vehicle_top_price['Price']

for train_index, test_index in kf5.split(vehicle_top_price):
    train_x, test_x = features[train_index], features[test_index]
    train_y, test_y = target.iloc[train_index], target.iloc[test_index]

    reg = LinearRegression()
    reg.fit(train_x, train_y)
    print(reg.score(train_x, train_y), reg.coef_, reg.intercept_)

    pred_y = reg.predict(test_x)

    # Root-mean-squared error on the held-out fold.
    squared_error = (pred_y - test_y) ** 2
    rmse = np.sqrt(np.mean(squared_error))
    RMSE.append(rmse)

    score = reg.score(test_x, test_y)
    scores.append(score)
    print(score, rmse)

# The mean score is reported as a percentage; the mean RMSE is in the units
# of the Price column.
print("Avg score:", round(np.mean(scores)*100, 3))
print("Avg RMSE:", round(np.mean(RMSE), 3))