Import the necessary libraries

In [157]:
import pandas as pd
import numpy as np
import copy as copy
from scipy.stats import zscore
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

Step1 :- Data pre-processing

In [120]:
#Read the csv file
vehicle_df = pd.read_csv('vehicle-1.csv')

In [121]:
#copy the vehicle data
vehicle_copy_df = copy.deepcopy(vehicle_df)

In [122]:
#Display the first five records from the vehicle file
vehicle_copy_df.head()

Unnamed: 0,compactness,circularity,distance_circularity,radius_ratio,pr.axis_aspect_ratio,max.length_aspect_ratio,scatter_ratio,elongatedness,pr.axis_rectangularity,max.length_rectangularity,scaled_variance,scaled_variance.1,scaled_radius_of_gyration,scaled_radius_of_gyration.1,skewness_about,skewness_about.1,skewness_about.2,hollows_ratio,class
0,95,48.0,83.0,178.0,72.0,10,162.0,42.0,20.0,159,176.0,379.0,184.0,70.0,6.0,16.0,187.0,197,van
1,91,41.0,84.0,141.0,57.0,9,149.0,45.0,19.0,143,170.0,330.0,158.0,72.0,9.0,14.0,189.0,199,van
2,104,50.0,106.0,209.0,66.0,10,207.0,32.0,23.0,158,223.0,635.0,220.0,73.0,14.0,9.0,188.0,196,car
3,93,41.0,82.0,159.0,63.0,9,144.0,46.0,19.0,143,160.0,309.0,127.0,63.0,6.0,10.0,199.0,207,van
4,85,44.0,70.0,205.0,103.0,52,149.0,45.0,19.0,144,241.0,325.0,188.0,127.0,9.0,11.0,180.0,183,bus


In [123]:
#Display the datatypes of each variable
vehicle_copy_df.dtypes

compactness                      int64
circularity                    float64
distance_circularity           float64
radius_ratio                   float64
pr.axis_aspect_ratio           float64
max.length_aspect_ratio          int64
scatter_ratio                  float64
elongatedness                  float64
pr.axis_rectangularity         float64
max.length_rectangularity        int64
scaled_variance                float64
scaled_variance.1              float64
scaled_radius_of_gyration      float64
scaled_radius_of_gyration.1    float64
skewness_about                 float64
skewness_about.1               float64
skewness_about.2               float64
hollows_ratio                    int64
class                           object
dtype: object

In [124]:
#print the shape of the dataset
vehicle_copy_df.shape

(846, 19)

In [125]:
#print the information about the dataset
vehicle_copy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 19 columns):
compactness                    846 non-null int64
circularity                    841 non-null float64
distance_circularity           842 non-null float64
radius_ratio                   840 non-null float64
pr.axis_aspect_ratio           844 non-null float64
max.length_aspect_ratio        846 non-null int64
scatter_ratio                  845 non-null float64
elongatedness                  845 non-null float64
pr.axis_rectangularity         843 non-null float64
max.length_rectangularity      846 non-null int64
scaled_variance                843 non-null float64
scaled_variance.1              844 non-null float64
scaled_radius_of_gyration      844 non-null float64
scaled_radius_of_gyration.1    842 non-null float64
skewness_about                 840 non-null float64
skewness_about.1               845 non-null float64
skewness_about.2               845 non-null float64
hollows_ratio    

In [126]:
#Check for the null values 
print(vehicle_copy_df.isna().sum())
print(vehicle_copy_df.isna().any())

compactness                    0
circularity                    5
distance_circularity           4
radius_ratio                   6
pr.axis_aspect_ratio           2
max.length_aspect_ratio        0
scatter_ratio                  1
elongatedness                  1
pr.axis_rectangularity         3
max.length_rectangularity      0
scaled_variance                3
scaled_variance.1              2
scaled_radius_of_gyration      2
scaled_radius_of_gyration.1    4
skewness_about                 6
skewness_about.1               1
skewness_about.2               1
hollows_ratio                  0
class                          0
dtype: int64
compactness                    False
circularity                     True
distance_circularity            True
radius_ratio                    True
pr.axis_aspect_ratio            True
max.length_aspect_ratio        False
scatter_ratio                   True
elongatedness                   True
pr.axis_rectangularity          True
max.length_rectangularity  

Inference: Most of the features have null values contained. But the total count of null values is small.
So, the null values can be dropped.

In [127]:
#dropping the null values
vehicle_copy_df.dropna(axis=0,inplace=True)

In [128]:
#check the info after dropping null values
vehicle_copy_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 813 entries, 0 to 845
Data columns (total 19 columns):
compactness                    813 non-null int64
circularity                    813 non-null float64
distance_circularity           813 non-null float64
radius_ratio                   813 non-null float64
pr.axis_aspect_ratio           813 non-null float64
max.length_aspect_ratio        813 non-null int64
scatter_ratio                  813 non-null float64
elongatedness                  813 non-null float64
pr.axis_rectangularity         813 non-null float64
max.length_rectangularity      813 non-null int64
scaled_variance                813 non-null float64
scaled_variance.1              813 non-null float64
scaled_radius_of_gyration      813 non-null float64
scaled_radius_of_gyration.1    813 non-null float64
skewness_about                 813 non-null float64
skewness_about.1               813 non-null float64
skewness_about.2               813 non-null float64
hollows_ratio    

In [129]:
#check for null values after dropping na
vehicle_copy_df.isna().sum()

compactness                    0
circularity                    0
distance_circularity           0
radius_ratio                   0
pr.axis_aspect_ratio           0
max.length_aspect_ratio        0
scatter_ratio                  0
elongatedness                  0
pr.axis_rectangularity         0
max.length_rectangularity      0
scaled_variance                0
scaled_variance.1              0
scaled_radius_of_gyration      0
scaled_radius_of_gyration.1    0
skewness_about                 0
skewness_about.1               0
skewness_about.2               0
hollows_ratio                  0
class                          0
dtype: int64

In [130]:
#print the five point summary of the data
vehicle_copy_df.describe()

Unnamed: 0,compactness,circularity,distance_circularity,radius_ratio,pr.axis_aspect_ratio,max.length_aspect_ratio,scatter_ratio,elongatedness,pr.axis_rectangularity,max.length_rectangularity,scaled_variance,scaled_variance.1,scaled_radius_of_gyration,scaled_radius_of_gyration.1,skewness_about,skewness_about.1,skewness_about.2,hollows_ratio
count,813.0,813.0,813.0,813.0,813.0,813.0,813.0,813.0,813.0,813.0,813.0,813.0,813.0,813.0,813.0,813.0,813.0,813.0
mean,93.656827,44.803198,82.04305,169.098401,61.774908,8.599016,168.563346,40.98893,20.558426,147.891759,188.377614,438.382534,174.252153,72.399754,6.351784,12.687577,188.97909,195.729397
std,8.233751,6.146659,15.78307,33.615402,7.973,4.677174,33.082186,7.80338,2.573184,14.504648,31.165873,175.270368,32.332161,7.475994,4.921476,8.926951,6.153681,7.398781
min,73.0,33.0,40.0,104.0,47.0,2.0,112.0,26.0,17.0,118.0,130.0,184.0,109.0,59.0,0.0,0.0,176.0,181.0
25%,87.0,40.0,70.0,141.0,57.0,7.0,146.0,33.0,19.0,137.0,167.0,318.0,149.0,67.0,2.0,6.0,184.0,191.0
50%,93.0,44.0,79.0,167.0,61.0,8.0,157.0,43.0,20.0,146.0,179.0,364.0,173.0,71.0,6.0,11.0,189.0,197.0
75%,100.0,49.0,98.0,195.0,65.0,10.0,198.0,46.0,23.0,159.0,217.0,586.0,198.0,75.0,9.0,19.0,193.0,201.0
max,119.0,59.0,112.0,333.0,138.0,55.0,265.0,61.0,29.0,188.0,320.0,1018.0,268.0,135.0,22.0,41.0,206.0,211.0


Inference: Most of the columns have mean and median almost same except the scatter_ratio,scaled_variance and scaled_variance.1, which could also indicate presence of skewness.
The outliers and skewness will be checked further.

Outlier check:

In [131]:
#create a dataframe which has the lower and upper bound for all the features
columns = vehicle_copy_df.columns[:-1]
class1 = ['bus','car','van']
outlier_range_df = pd.DataFrame([])
lower_bound = []
upper_bound = []
features = []
veh_type = []
for i in columns:
    describe = vehicle_copy_df.groupby('class')[i].describe().reset_index()  
    for c in class1:
        first = int(describe[describe['class'] == c]['25%'].values)
        third = int(describe[describe['class'] == c]['75%'].values)
        IQR = third - first
        lower = first - (1.5 * IQR)
        upper = third + (1.5 * IQR)
        lower_bound.append(lower)
        upper_bound.append(upper)
        features.append(i)
        veh_type.append(c)
#combining all the values in the dataframe
outlier_range_df['features'] = features
outlier_range_df['veh_type'] = veh_type
outlier_range_df['lower_bound'] = lower_bound
outlier_range_df['upper_bound'] = upper_bound

In [132]:
#find the count of outliers correspond to each feature based on the class
columns = vehicle_copy_df.columns[:-1]
class1 = ['bus','car','van']
for i in columns:
    for c in class1:
        lower = float(outlier_range_df[(outlier_range_df['features'] == i) &
                                       (outlier_range_df['veh_type'] == c)]['lower_bound'].values)
        upper = float(outlier_range_df[(outlier_range_df['features'] == i) &
                                       (outlier_range_df['veh_type'] == c)]['upper_bound'].values)
        bool_lower = vehicle_copy_df[vehicle_copy_df['class'] == c][i] < lower
        index_lower = bool_lower[bool_lower == True].index
        
        bool_upper = vehicle_copy_df[vehicle_copy_df['class'] == c][i] > upper
        index_upper = bool_upper[bool_upper == True].index
                
        #bring the values that exceed the limits into the IQR boundary
        vehicle_copy_df.drop(index_lower,axis=0,inplace=True)
        vehicle_copy_df.drop(index_upper,axis=0,inplace=True)

In [133]:
X= vehicle_copy_df.drop('class',axis=1)
y = vehicle_copy_df['class']

In [134]:
y.value_counts()

car    373
van    186
bus    162
Name: class, dtype: int64

In [135]:
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.25, random_state=100)

In [136]:
train_xscaled = train_x.apply(zscore)
test_xscaled = test_x.apply(zscore)

In [140]:
logisticregression_lasso = SelectFromModel(LogisticRegression())

In [141]:
logisticregression_lasso.fit(train_xscaled, train_y)



SelectFromModel(estimator=LogisticRegression(C=1.0, class_weight=None,
                                             dual=False, fit_intercept=True,
                                             intercept_scaling=1, l1_ratio=None,
                                             max_iter=100, multi_class='warn',
                                             n_jobs=None, penalty='l2',
                                             random_state=None, solver='warn',
                                             tol=0.0001, verbose=0,
                                             warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [146]:
select_feat = logisticregression_lasso.get_support()

In [148]:
final_columns = train_xscaled.columns[select_feat]

In [149]:
final_columns

Index(['radius_ratio', 'pr.axis_aspect_ratio', 'elongatedness',
       'max.length_rectangularity', 'scaled_radius_of_gyration.1',
       'skewness_about.2'],
      dtype='object')

In [150]:
svc = SVC()
svc.fit(train_xscaled[final_columns], train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [151]:
svc.score(train_xscaled[final_columns], train_y)

0.9629629629629629

In [153]:
svc.score(test_xscaled[final_columns], test_y)

0.9116022099447514

In [154]:
predict_y = svc.predict(test_xscaled[final_columns])

In [158]:
confusion_matrix(test_y, predict_y)

array([[38,  1,  2],
       [ 1, 81,  3],
       [ 5,  4, 46]], dtype=int64)