In [None]:
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# Turn off pandas' chained-assignment check for the whole notebook
# (suppresses the "setting value on copy of slice" warning)
pd.options.mode.chained_assignment = None

# Display up to 60 columns of a dataframe
pd.set_option('display.max_columns', 60)

# Matplotlib visualization; the magic renders figures inline in the notebook
import matplotlib.pyplot as plt
%matplotlib inline

# Set default font size for all matplotlib text
plt.rcParams['font.size'] = 24

# Internal ipython tool for setting figure size
# NOTE(review): `figsize` is not called in the cells visible here -- confirm it is needed
from IPython.core.pylabtools import figsize

# Seaborn for visualization; font_scale = 2 doubles seaborn's default font sizes
import seaborn as sns
sns.set(font_scale = 2)

In [None]:
# Read the automobile dataset into a dataframe (path is relative to the working directory)
automobile = pd.read_csv('data/Automobile_dataset.csv')

# Display the first rows of the dataframe as a quick check of the load
automobile.head()

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage
automobile.info()

In [None]:
# Show the dtype pandas inferred for every column
automobile.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
automobile.describe()

In [None]:
# Count NaN entries per column. Values stored as the string '?' are not NaN,
# so they are not counted here; later cells handle them column by column.
automobile.isna().sum()

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# The frame still holds string columns at this point, and DataFrame.corr() raises on
# those in pandas >= 2.0, so the numeric columns are selected explicitly first.
automobile.select_dtypes(include=np.number).corr()

In [None]:
# Number of records whose normalized-losses value is the placeholder '?'
(automobile['normalized-losses'] == '?').sum()

In [None]:
# Replace the '?' placeholders in normalized-losses with the mean of the known values,
# then convert the column to integer (the integer cast truncates the float mean).
nl = automobile.loc[automobile['normalized-losses'] != '?', 'normalized-losses']
nl_mean = nl.astype(str).astype(int).mean()
automobile['normalized-losses'] = (
    automobile['normalized-losses']
    .replace('?', nl_mean)
    .astype(int)
)
automobile['normalized-losses'].head()

In [None]:
# Count how many price strings are purely numeric (True) versus not (False)
automobile['price'].str.isnumeric().value_counts()

In [None]:
# Show the price entries that are not purely numeric strings
automobile.loc[automobile['price'].str.isnumeric().eq(False), 'price']

In [None]:
# Replace the '?' placeholders in price with the mean of the known prices,
# then convert the column to integer (the integer cast truncates the float mean).
price = automobile.loc[automobile['price'] != '?', 'price']
pmean = price.astype(str).astype(int).mean()
automobile['price'] = (
    automobile['price']
    .replace('?', pmean)
    .astype(int)
)
automobile['price'].head()

In [None]:
# Check how many horsepower strings are numeric, then replace '?' with the mean of the
# known values and convert the column to integer.
# The value_counts() result is not the cell's last expression, so it is not rendered.
automobile['horsepower'].str.isnumeric().value_counts()
horsepower = automobile.loc[automobile['horsepower'] != '?', 'horsepower']
hpmean = horsepower.astype(str).astype(int).mean()
automobile['horsepower'] = (
    automobile['horsepower']
    .replace('?', hpmean)
    .astype(int)
)
automobile['horsepower'].head()

In [None]:
# Show the rows whose horsepower exceeds 10000 (treated here as outliers)
automobile[automobile['horsepower'].gt(10000)]

In [None]:
# Display the rows whose horsepower lies within three standard deviations of the mean.
# The filtered frame is only displayed, not assigned back: `automobile` keeps the outliers.
hp_abs_deviation = np.abs(automobile['horsepower'] - automobile['horsepower'].mean())
automobile[hp_abs_deviation <= (3 * automobile['horsepower'].std())]

In [None]:
# List the bore entries that hold the placeholder '?'
automobile.loc[automobile['bore'] == '?', 'bore']

In [None]:
# Convert bore to a numeric dtype; values that cannot be parsed become NaN (errors='coerce')
automobile['bore'] = pd.to_numeric(automobile['bore'],errors='coerce')
# Re-check the column dtypes after the conversion
automobile.dtypes

In [None]:
# Convert stroke to a numeric dtype; values that cannot be parsed become NaN (errors='coerce')
automobile['stroke'] = pd.to_numeric(automobile['stroke'],errors='coerce')
# Re-check the column dtypes after the conversion
automobile.dtypes

In [None]:
# Convert peak-rpm to a numeric dtype; values that cannot be parsed become NaN (errors='coerce')
automobile['peak-rpm'] = pd.to_numeric(automobile['peak-rpm'],errors='coerce')
# Re-check the column dtypes after the conversion
automobile.dtypes

In [None]:
# Drop the records whose num-of-doors is the placeholder '?'.
# The first lookup is evaluated but not rendered (not the last expression); the final
# lookup is rendered and should be empty once the rows are removed.
automobile.loc[automobile['num-of-doors'] == '?', 'num-of-doors']
automobile = automobile.loc[automobile['num-of-doors'] != '?']
automobile.loc[automobile['num-of-doors'] == '?', 'num-of-doors']

In [None]:
# Bar chart of the ten makes with the most vehicles
make_ax = automobile['make'].value_counts().nlargest(10).plot(kind='bar', figsize=(15,5))
make_ax.set_title("Number of vehicles by make")
make_ax.set_ylabel('Number of vehicles')
make_ax.set_xlabel('Make')
plt.show()

In [None]:
# Histogram of the insurance risk rating (symboling) in six bins
risk_ax = automobile['symboling'].hist(bins=6, color='green')
risk_ax.set(
    title="Insurance risk ratings of vehicles",
    ylabel='Number of vehicles',
    xlabel='Risk rating',
)
plt.show()

In [None]:
# Histogram of normalized losses in five bins
losses_ax = automobile['normalized-losses'].hist(bins=5, color='orange')
losses_ax.set_title("Normalized losses of vehicles")
losses_ax.set_ylabel('Number of vehicles')
losses_ax.set_xlabel('Normalized losses')
plt.show()

In [None]:
# Bar chart of how many vehicles use each fuel type
fuel_counts = automobile['fuel-type'].value_counts()
fuel_ax = fuel_counts.plot(kind='bar', color='purple')
fuel_ax.set(
    title="Fuel type frequence diagram",
    ylabel='Number of vehicles',
    xlabel='Fuel type',
)
plt.show()

In [None]:
# Pie chart of the share of vehicles per aspiration type.
# The slices are built from the `aspiration` column, so the title and labels name that
# column (they previously said "Fuel type", which did not match the plotted data).
# autopct prints each slice's percentage, so the axis label refers to percentages.
automobile['aspiration'].value_counts().plot.pie(figsize=(6, 6), autopct='%.2f')
plt.title("Aspiration type pie diagram")
plt.ylabel('')  # clear the default y-label pandas puts on the pie
plt.xlabel('Aspiration type (% of vehicles)');
plt.show()

In [None]:
# Histogram of horsepower, restricted to values within three standard deviations of the mean
hp_values = automobile['horsepower']
hp_within_3_std = np.abs(hp_values - hp_values.mean()) <= (3 * hp_values.std())
hp_ax = hp_values[hp_within_3_std].hist(bins=5, color='red')
hp_ax.set_title("Horse power histogram")
hp_ax.set_ylabel('Number of vehicles')
hp_ax.set_xlabel('Horse power')
plt.show()

In [None]:
# Histogram of curb weight in five bins
weight_ax = automobile['curb-weight'].hist(bins=5, color='brown')
weight_ax.set(
    title="Curb weight histogram",
    ylabel='Number of vehicles',
    xlabel='Curb weight',
)
plt.show()

In [None]:
# Bar chart of how many vehicles have each drive-wheel configuration
drive_counts = automobile['drive-wheels'].value_counts()
drive_ax = drive_counts.plot(kind='bar', color='grey')
drive_ax.set_title("Drive wheels diagram")
drive_ax.set_ylabel('Number of vehicles')
drive_ax.set_xlabel('Drive wheels')
plt.show()

In [None]:
# Bar chart of how many vehicles have each number of doors
doors_ax = automobile['num-of-doors'].value_counts().plot(kind='bar', color='purple')
doors_ax.set(
    title="Number of doors frequency diagram",
    ylabel='Number of vehicles',
    xlabel='Number of doors',
)
plt.show()

In [None]:
# Correlation analysis: annotated heatmap of the pairwise correlations.
# Only numeric columns are correlated; DataFrame.corr() raises on string columns
# (make, fuel-type, ...) in pandas >= 2.0.
corr = automobile.select_dtypes(include=np.number).corr()
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.figure(figsize=(13,7))
# Draw on the 13x7 figure created above; plt.show() must come last, otherwise the
# sized figure is rendered empty and the heatmap lands on a new default-sized figure.
a = sns.heatmap(corr, annot=True, fmt='.2f')
rotx = a.set_xticklabels(a.get_xticklabels(), rotation=90)
roty = a.set_yticklabels(a.get_yticklabels(), rotation=30)
plt.show()

In [None]:
# Bivariate analysis: distribution of price for each make.
# The figure size is set through rcParams, so it also applies to figures created later.
plt.rcParams.update({'figure.figsize': (23, 10)})
ax = sns.boxplot(data=automobile, x="make", y="price")

In [None]:
# Scatter plot with a fitted regression line: price (x) against engine size (y).
# x, y and data are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
g = sns.lmplot(x='price', y="engine-size", data=automobile);

In [None]:
# Scatter plot with a fitted regression line: normalized losses (x) against symboling (y).
# x, y and data are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
g = sns.lmplot(x='normalized-losses', y="symboling", data=automobile);

In [None]:
# Scatter plot of engine size (x) against peak RPM (y)
plt.scatter(x=automobile['engine-size'], y=automobile['peak-rpm'])
plt.gca().set(xlabel='Engine size', ylabel='Peak RPM')
plt.show()

In [None]:
# Scatter plot of city MPG (x) against curb weight (y), coloured by make.
# fit_reg=False leaves out the regression line. x, y and data are passed by keyword:
# seaborn >= 0.12 no longer accepts them positionally.
g = sns.lmplot(x='city-mpg', y="curb-weight", data=automobile, hue="make", fit_reg=False);

In [None]:
# Scatter plot of highway MPG (x) against curb weight (y), coloured by make.
# fit_reg=False leaves out the regression line. x, y and data are passed by keyword:
# seaborn >= 0.12 no longer accepts them positionally.
g = sns.lmplot(x='highway-mpg', y="curb-weight", data=automobile, hue="make", fit_reg=False);

In [None]:
# Bar chart of the mean city MPG for each drive-wheel configuration
city_mpg_by_drive = automobile.groupby('drive-wheels')['city-mpg'].mean()
city_ax = city_mpg_by_drive.plot(kind='bar', color='peru')
city_ax.set_title("Drive wheels City MPG")
city_ax.set_ylabel('City MPG')
city_ax.set_xlabel('Drive wheels')
plt.show()

In [None]:
# Bar chart of the mean highway MPG for each drive-wheel configuration
highway_mpg_by_drive = automobile.groupby('drive-wheels')['highway-mpg'].mean()
highway_ax = highway_mpg_by_drive.plot(kind='bar', color='peru')
highway_ax.set(
    title="Drive wheels Highway MPG",
    ylabel='Highway MPG',
    xlabel='Drive wheels',
)
plt.show()

In [None]:
# Distribution of price for each drive-wheel configuration.
# The figure size is set through rcParams, so it also applies to figures created later.
plt.rcParams.update({'figure.figsize': (10, 5)})
ax = sns.boxplot(data=automobile, x="drive-wheels", y="price")

In [None]:
# Bar chart of the mean normalized losses per (body style, number of doors) pair;
# pivot_table aggregates with the mean by default.
losses_by_body_doors = automobile.pivot_table(
    index=['body-style', 'num-of-doors'],
    values='normalized-losses',
)
body_doors_ax = losses_by_body_doors.plot(kind='bar', color='purple')
body_doors_ax.set_title("Normalized losses based on body style and no. of doors")
body_doors_ax.set_ylabel('Normalized losses')
body_doors_ax.set_xlabel('Body style and No. of doors')
plt.show()