In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

from sklearn.svm import LinearSVC,SVC

from sklearn.model_selection import GridSearchCV

%matplotlib inline

### 1. Load the data from “college.csv” that has attributes collected about private and public colleges for a particular year. We will try to predict the private/public status of the college from other attributes 

In [14]:
data = pd.read_csv('College.csv')

In [15]:
data.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 18 columns):
Private        777 non-null object
Apps           777 non-null int64
Accept         777 non-null int64
Enroll         777 non-null int64
Top10perc      777 non-null int64
Top25perc      777 non-null int64
F.Undergrad    777 non-null int64
P.Undergrad    777 non-null int64
Outstate       777 non-null int64
Room.Board     777 non-null int64
Books          777 non-null int64
Personal       777 non-null int64
PhD            777 non-null int64
Terminal       777 non-null int64
S.F.Ratio      777 non-null float64
perc.alumni    777 non-null int64
Expend         777 non-null int64
Grad.Rate      777 non-null int64
dtypes: float64(1), int64(16), object(1)
memory usage: 109.3+ KB


In [17]:
data.shape

(777, 18)

In [18]:
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Apps,777.0,3001.638353,3870.201484,81.0,776.0,1558.0,3624.0,48094.0
Accept,777.0,2018.804376,2451.113971,72.0,604.0,1110.0,2424.0,26330.0
Enroll,777.0,779.972973,929.17619,35.0,242.0,434.0,902.0,6392.0
Top10perc,777.0,27.558559,17.640364,1.0,15.0,23.0,35.0,96.0
Top25perc,777.0,55.796654,19.804778,9.0,41.0,54.0,69.0,100.0
F.Undergrad,777.0,3699.907336,4850.420531,139.0,992.0,1707.0,4005.0,31643.0
P.Undergrad,777.0,855.298584,1522.431887,1.0,95.0,353.0,967.0,21836.0
Outstate,777.0,10440.669241,4023.016484,2340.0,7320.0,9990.0,12925.0,21700.0
Room.Board,777.0,4357.526384,1096.696416,1780.0,3597.0,4200.0,5050.0,8124.0
Books,777.0,549.380952,165.10536,96.0,470.0,500.0,600.0,2340.0


### 2. Use LabelEncoder to encode the target variable in to numerical form and split the data such that 20% of the data is set aside for testing. 

In [30]:
encoder = LabelEncoder()
data['Private'] = encoder.fit_transform(data['Private'])
data['Private'].value_counts()

1    565
0    212
Name: Private, dtype: int64

### 3. Fit a linear svm from scikit learn and observe the accuracy. [Hint: Use Linear SVC] 

In [31]:
x = data.drop(['Private'],axis=1)
y = data['Private']

x_train, x_test , y_train, y_test = train_test_split(x,y,train_size=0.8)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(621, 17)
(156, 17)
(621,)
(156,)


In [23]:
linear_svc = LinearSVC()

linear_svc = linear_svc.fit(x_train,y_train)
y_predict = linear_svc.predict(x_test)

print('Accuracy:',accuracy_score(y_test,y_predict))
print('classification report:', classification_report(y_test,y_predict))
print('confusion_matrix:', confusion_matrix(y_test,y_predict))

Accuracy: 0.8974358974358975
classification report:               precision    recall  f1-score   support

           0       0.93      0.64      0.76        39
           1       0.89      0.98      0.93       117

    accuracy                           0.90       156
   macro avg       0.91      0.81      0.85       156
weighted avg       0.90      0.90      0.89       156

confusion_matrix: [[ 25  14]
 [  2 115]]




### 4. Preprocess the data using StandardScalar and fit the same model again and observe the change in accuracy. [Hint: Refer to scikitlearn’s preprocessing methods]

In [32]:
x.columns

Index(['Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F.Undergrad',
       'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'PhD',
       'Terminal', 'S.F.Ratio', 'perc.alumni', 'Expend', 'Grad.Rate'],
      dtype='object')

In [34]:
scaler = StandardScaler()
x_scale = scaler.fit_transform(x)
x_scale = pd.DataFrame(x_scale,columns=['Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F.Undergrad',
       'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'PhD',
       'Terminal', 'S.F.Ratio', 'perc.alumni', 'Expend', 'Grad.Rate'])
x_scale.head()

Unnamed: 0,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,-0.346882,-0.321205,-0.063509,-0.258583,-0.191827,-0.168116,-0.209207,-0.746356,-0.964905,-0.602312,1.270045,-0.163028,-0.115729,1.013776,-0.867574,-0.50191,-0.318252
1,-0.210884,-0.038703,-0.288584,-0.655656,-1.353911,-0.209788,0.244307,0.457496,1.909208,1.21588,0.235515,-2.675646,-3.378176,-0.477704,-0.544572,0.16611,-0.551262
2,-0.406866,-0.376318,-0.478121,-0.315307,-0.292878,-0.549565,-0.49709,0.201305,-0.554317,-0.905344,-0.259582,-1.204845,-0.931341,-0.300749,0.585935,-0.17729,-0.667767
3,-0.668261,-0.681682,-0.692427,1.840231,1.677612,-0.658079,-0.520752,0.626633,0.996791,-0.602312,-0.688173,1.185206,1.175657,-1.615274,1.151188,1.792851,-0.376504
4,-0.726176,-0.764555,-0.780735,-0.655656,-0.596031,-0.711924,0.009005,-0.716508,-0.216723,1.518912,0.235515,0.204672,-0.523535,-0.553542,-1.675079,0.241803,-2.939613


In [35]:
x_train, x_test , y_train, y_test = train_test_split(x_scale,y,train_size=0.8)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(621, 17)
(156, 17)
(621,)
(156,)


In [37]:
linear_svc = LinearSVC()

linear_svc = linear_svc.fit(x_train,y_train)
y_predict = linear_svc.predict(x_test)

print('Accuracy:',accuracy_score(y_test,y_predict))
print('classification report:', classification_report(y_test,y_predict))
print('confusion_matrix:', confusion_matrix(y_test,y_predict))

Accuracy: 0.9423076923076923
classification report:               precision    recall  f1-score   support

           0       0.90      0.88      0.89        42
           1       0.96      0.96      0.96       114

    accuracy                           0.94       156
   macro avg       0.93      0.92      0.93       156
weighted avg       0.94      0.94      0.94       156

confusion_matrix: [[ 37   5]
 [  4 110]]




In [38]:
# The accuracy of the model has increased significantly from 0.89 to 0.94 after scaling the data. Thus in support vector classifier
# scaling the attributes plays an important role

### 5.Use scikit learn’s gridsearch to select the best hyperparameter for a non-linear SVM,identify the model with best score and its parameters.[Hint: Refer to model_selection module of Scikit learn] 

In [51]:
# BAse model without hyper parameter tuning:

SVC_base = SVC()

SVC_base = SVC_base.fit(x_train,y_train)
y_predict = SVC_base.predict(x_test)

print('Accuracy:',accuracy_score(y_test,y_predict))
print('classification report:', classification_report(y_test,y_predict))
print('confusion_matrix:', confusion_matrix(y_test,y_predict))

Accuracy: 0.9294871794871795
classification report:               precision    recall  f1-score   support

           0       0.92      0.81      0.86        42
           1       0.93      0.97      0.95       114

    accuracy                           0.93       156
   macro avg       0.93      0.89      0.91       156
weighted avg       0.93      0.93      0.93       156

confusion_matrix: [[ 34   8]
 [  3 111]]


In [42]:
param ={'C': [0.1, 1, 100, 1000],
        'gamma': [0.0001, 0.001, 0.005, 0.1, 1, 3, 5],
        'kernel':['poly', 'rbf', 'sigmoid']
        }

grid = GridSearchCV(estimator=SVC(), param_grid = param , cv=3, scoring='accuracy', verbose=1, n_jobs=-1)

grid_result = grid.fit(x_train, y_train)
best_params = grid_result.best_params_
pred = grid_result.predict(x_test)
cm = confusion_matrix(y_test, pred)

Fitting 3 folds for each of 84 candidates, totalling 252 fits


In [49]:
# best parameters:
best_params

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}

In [44]:
pred

array([1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1], dtype=int64)

In [45]:
cm

array([[ 35,   7],
       [  4, 110]], dtype=int64)

In [48]:
# Building the svc model with best hyper parameters:
svc = SVC(C=1,gamma=0.1,kernel='rbf')

svc = svc.fit(x_train,y_train)
y_predict = svc.predict(x_test)

print('Accuracy:',accuracy_score(y_test,y_predict))
print('classification report:', classification_report(y_test,y_predict))
print('confusion_matrix:', confusion_matrix(y_test,y_predict))

Accuracy: 0.9294871794871795
classification report:               precision    recall  f1-score   support

           0       0.90      0.83      0.86        42
           1       0.94      0.96      0.95       114

    accuracy                           0.93       156
   macro avg       0.92      0.90      0.91       156
weighted avg       0.93      0.93      0.93       156

confusion_matrix: [[ 35   7]
 [  4 110]]


In [52]:
# Hyper parameter tuning increases the accuracy of the model