## Predict Glass type using Linear SVM:

Table of contents :
1. Importing Libraries
2. Importing Dataset
3. Peek into Data
4. Split data into train and test dataframes.
5. Finding accuracy of the Linear SVM model
6. Performing correlation
7. Finding accuracy again

#### 1.Importing libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, r2_score

#### 2.Importing Dataset

In [2]:
# Load the glass dataset; the relative path means 'glass.csv' must sit in the
# notebook's working directory.
glass_df = pd.read_csv('glass.csv')

#### 3.Peek into Data
##### a.glass_df.columns.values -- gives the column names
##### b.glass_df.Type.value_counts() -- gives the number of rows for each unique value of the Type column
##### c.glass_df.head() -- gives the first five rows of the data frame
##### d.glass_df.describe() -- gives the count, mean, standard deviation, min, quartiles and max for the numeric columns
##### e.glass_df.isnull().sum() -- gives the total number of null values in each column

In [3]:
# Show the column names as an array (nine measurements plus the 'Type' label).
glass_df.columns.values

array(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type'],
      dtype=object)

In [4]:
# Rows per glass type, most frequent first -- a quick class-balance check.
glass_df['Type'].value_counts()

2    76
1    70
7    29
3    17
5    13
6     9
Name: Type, dtype: int64

In [5]:
# Preview the first five rows of the dataset.
glass_df.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
glass_df.describe()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
count,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0
mean,1.518365,13.40785,2.684533,1.444907,72.650935,0.497056,8.956963,0.175047,0.057009,2.780374
std,0.003037,0.816604,1.442408,0.49927,0.774546,0.652192,1.423153,0.497219,0.097439,2.103739
min,1.51115,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0,1.0
25%,1.516523,12.9075,2.115,1.19,72.28,0.1225,8.24,0.0,0.0,1.0
50%,1.51768,13.3,3.48,1.36,72.79,0.555,8.6,0.0,0.0,2.0
75%,1.519157,13.825,3.6,1.63,73.0875,0.61,9.1725,0.0,0.1,3.0
max,1.53393,17.38,4.49,3.5,75.41,6.21,16.19,3.15,0.51,7.0


In [7]:
# Count the missing values in each column.
glass_df.isna().sum()

RI      0
Na      0
Mg      0
Al      0
Si      0
K       0
Ca      0
Ba      0
Fe      0
Type    0
dtype: int64

#### 4.Split data into train and test dataframes
`train_test_split` divides the data into a training set (80% of the rows) and a test set (20% of the rows).

In [8]:
# The 'Type' column is the target; every other column is a predictor.
glass_df_y = glass_df['Type']
glass_df_x = glass_df.drop(columns=['Type'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    glass_df_x,
    glass_df_y,
    test_size=0.2,
    random_state=100,
)

separator = '*' * 50
print('Shape of training data')
print(x_train.shape, y_train.shape)
print(separator)
print('Shape of test data')
print(x_test.shape, y_test.shape)

Shape of training data
(171, 9) (171,)
**************************************************
Shape of test data
(43, 9) (43,)


#### 5.Finding accuracy of the Linear SVM model

In [9]:
# Fit a linear-kernel SVM on the training split and evaluate it on the test split.
model = SVC(kernel='linear', C=1).fit(x_train, y_train)
y_pred = model.predict(x_test)

print('confusion matrix : ', confusion_matrix(y_test, y_pred))
print('*'*50)
print('accuracy score :', round(accuracy_score(y_test, y_pred)*100, 2), '%')
# normalize=False returns the number of correct predictions instead of the fraction.
print('number of correctly classified samples: ', accuracy_score(y_test, y_pred, normalize=False))
print('*'*50)
# A class that is never predicted has undefined precision. zero_division=0
# reports it as 0.00 (the same value as the default) without emitting
# UndefinedMetricWarning into the cell output.
print('classification report : ', classification_report(y_test, y_pred, zero_division=0))

confusion matrix :  [[ 9  1  0  0  0  0]
 [ 8 11  0  0  0  0]
 [ 2  1  0  0  0  0]
 [ 0  0  0  4  0  1]
 [ 0  0  0  0  1  0]
 [ 0  1  0  0  0  4]]
**************************************************
accuracy score : 67.44 %
number of correctly classified samples:  29
**************************************************
classification report :                precision    recall  f1-score   support

           1       0.47      0.90      0.62        10
           2       0.79      0.58      0.67        19
           3       0.00      0.00      0.00         3
           5       1.00      0.80      0.89         5
           6       1.00      1.00      1.00         1
           7       0.80      0.80      0.80         5

    accuracy                           0.67        43
   macro avg       0.68      0.68      0.66        43
weighted avg       0.69      0.67      0.66        43



  _warn_prf(average, modifier, msg_start, len(result))


##### Since the accuracy is low (67.44%), we will keep only the most highly correlated features and try running the model again

#### 6.Performing correlation 

In [10]:
# For each feature, print the mean 'Type' of the rows sharing each distinct
# feature value, ordered by that value. Note that this is a grouped mean, not
# a correlation coefficient; the printed labels are kept verbatim.
features_to_inspect = ['Ba', 'Fe', 'RI', 'Na', 'Mg']

for position, feature in enumerate(features_to_inspect):
    # Separator goes between sections only, never before the first or after the last.
    if position > 0:
        print('*'*50)
    print('Correlation betwween {} and Type'.format(feature))
    mean_type_by_value = (
        glass_df[[feature, 'Type']]
        .groupby([feature], as_index=False)
        .mean()
        # `ascending` must be a real boolean: the string 'True' only worked by
        # being truthy, and newer pandas versions reject it with a ValueError.
        .sort_values(by=feature, ascending=True)
    )
    print(mean_type_by_value)

Correlation betwween Ba and Type
      Ba    Type
0   0.00  2.1875
1   0.06  2.0000
2   0.09  1.5000
3   0.11  1.5000
4   0.14  2.0000
5   0.15  3.0000
6   0.24  5.0000
7   0.27  2.0000
8   0.40  7.0000
9   0.53  7.0000
10  0.54  7.0000
11  0.56  7.0000
12  0.61  7.0000
13  0.63  7.0000
14  0.64  7.0000
15  0.66  7.0000
16  0.67  7.0000
17  0.69  1.0000
18  0.76  7.0000
19  0.81  7.0000
20  1.06  7.0000
21  1.19  7.0000
22  1.38  7.0000
23  1.55  7.0000
24  1.57  7.0000
25  1.59  7.0000
26  1.63  7.0000
27  1.64  7.0000
28  1.67  7.0000
29  1.68  7.0000
30  1.71  7.0000
31  2.20  5.0000
32  2.88  7.0000
33  3.15  2.0000
**************************************************
Correlation betwween Fe and Type
      Fe      Type
0   0.00  3.048611
1   0.01  7.000000
2   0.03  1.000000
3   0.05  7.000000
4   0.06  1.000000
5   0.07  3.000000
6   0.08  4.500000
7   0.09  3.666667
8   0.10  2.000000
9   0.11  1.000000
10  0.12  2.000000
11  0.14  1.666667
12  0.15  2.000000
13  0.16  1.000000
14 

#### We can see many zero values in Ba and Fe (these are zeros, not nulls -- no column has missing values), so we will remove those two columns and try running the Linear SVM again

In [11]:
# Look at the first rows again before dropping the Ba and Fe columns.
glass_df.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [12]:
# Same target as before, but the predictors now exclude the Ba and Fe columns.
columns_to_drop = ['Type', 'Ba', 'Fe']
glass_df_corr_y = glass_df['Type']
glass_df_corr_x = glass_df.drop(columns=columns_to_drop)

# Same 80/20 split and seed as the first experiment. This rebinds x_train,
# x_test, y_train and y_test, which the following cell relies on.
x_train, x_test, y_train, y_test = train_test_split(
    glass_df_corr_x,
    glass_df_corr_y,
    test_size=0.2,
    random_state=100,
)

print('Shape of training data after correlation')
print(x_train.shape, y_train.shape)
print('*' * 50)
print('Shape of test data after correlation')
print(x_test.shape, y_test.shape)



Shape of training data after correlation
(171, 7) (171,)
**************************************************
Shape of test data after correlation
(43, 7) (43,)


#### 7.Finding Accuracy again

In [13]:
# Fit the same linear-kernel SVM on the reduced feature set. The model is
# fitted once; a second fit on identical data would only repeat the work.
model = SVC(kernel='linear', C=1).fit(x_train, y_train)
y_pred = model.predict(x_test)

print('confusion matrix after correlation: ', confusion_matrix(y_test, y_pred))
print('accuracy score after correlation:', round(accuracy_score(y_test, y_pred)*100, 2), '%')
# normalize=False returns the number of correct predictions instead of the fraction.
print('number of correctly classified samples: ', accuracy_score(y_test, y_pred, normalize=False))
print('*'*50)
# A class that is never predicted has undefined precision. zero_division=0
# reports it as 0.00 (the same value as the default) without emitting
# UndefinedMetricWarning into the cell output.
print('classification report after correlation: ', classification_report(y_test, y_pred, zero_division=0))

confusion matrix after correlation:  [[ 9  1  0  0  0  0]
 [ 8 11  0  0  0  0]
 [ 2  1  0  0  0  0]
 [ 0  1  0  3  0  1]
 [ 0  0  0  0  1  0]
 [ 0  1  0  0  0  4]]
accuracy score after correlation: 65.12 %
number of correctly classified samples:  28
**************************************************
classification report after correlation:                precision    recall  f1-score   support

           1       0.47      0.90      0.62        10
           2       0.73      0.58      0.65        19
           3       0.00      0.00      0.00         3
           5       1.00      0.60      0.75         5
           6       1.00      1.00      1.00         1
           7       0.80      0.80      0.80         5

    accuracy                           0.65        43
   macro avg       0.67      0.65      0.64        43
weighted avg       0.67      0.65      0.63        43



  _warn_prf(average, modifier, msg_start, len(result))


#### Accuracy decreased slightly, from 67.44% to 65.12%, after removing Ba and Fe