In [1]:
#import warnings
#warnings.filterwarnings('ignore')

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

In [2]:
salary = pd.read_csv('SalaryData_Train(1).csv')
salary_test = pd.read_csv('SalaryData_Test(1).csv')
salary

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30156,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
30157,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
30158,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
30159,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [3]:
salary.shape

(30161, 14)

In [4]:
salary.dtypes

age               int64
workclass        object
education        object
educationno       int64
maritalstatus    object
occupation       object
relationship     object
race             object
sex              object
capitalgain       int64
capitalloss       int64
hoursperweek      int64
native           object
Salary           object
dtype: object

In [5]:
salary.isna().sum()

age              0
workclass        0
education        0
educationno      0
maritalstatus    0
occupation       0
relationship     0
race             0
sex              0
capitalgain      0
capitalloss      0
hoursperweek     0
native           0
Salary           0
dtype: int64

### Data Preparation

In [6]:
salary2 = salary.copy()

In [7]:
le = LabelEncoder()
salary2['workclass'] = le.fit_transform(salary2['workclass'])
salary2['education'] = le.fit_transform(salary2['education'])
salary2['maritalstatus'] = le.fit_transform(salary2['maritalstatus'])
salary2['occupation'] = le.fit_transform(salary2['occupation'])
salary2['relationship'] = le.fit_transform(salary2['relationship'])
salary2['race'] = le.fit_transform(salary2['race'])
salary2['sex'] = le.fit_transform(salary2['sex'])
salary2['native'] = le.fit_transform(salary2['native'])
salary2['Salary'] = le.fit_transform(salary2['Salary'])

salary_test['workclass'] = le.fit_transform(salary_test['workclass'])
salary_test['education'] = le.fit_transform(salary_test['education'])
salary_test['maritalstatus'] = le.fit_transform(salary_test['maritalstatus'])
salary_test['occupation'] = le.fit_transform(salary_test['occupation'])
salary_test['relationship'] = le.fit_transform(salary_test['relationship'])
salary_test['race'] = le.fit_transform(salary_test['race'])
salary_test['sex'] = le.fit_transform(salary_test['sex'])
salary_test['native'] = le.fit_transform(salary_test['native'])
salary_test['Salary'] = le.fit_transform(salary_test['Salary'])
salary2

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,5,9,13,4,0,1,4,1,2174,0,40,37,0
1,50,4,9,13,2,3,0,4,1,0,0,13,37,0
2,38,2,11,9,0,5,1,4,1,0,0,40,37,0
3,53,2,1,7,2,5,0,2,1,0,0,40,37,0
4,28,2,9,13,2,9,5,2,0,0,0,40,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30156,27,2,7,12,2,12,5,4,0,0,0,38,37,0
30157,40,2,11,9,2,6,0,4,1,0,0,40,37,1
30158,58,2,11,9,6,0,4,4,0,0,0,40,37,0
30159,22,2,11,9,4,0,3,4,1,0,0,20,37,0


### Model Building

In [8]:
x = salary2.drop(labels='Salary', axis=1)
y = salary2[['Salary']]
x_test = salary_test.drop(labels='Salary', axis=1)
y_test = salary_test[['Salary']]
y.shape, y_test.shape

((30161, 1), (15060, 1))

In [9]:
columns = x.columns
sc = StandardScaler()
x_scaled = sc.fit_transform(x)
x_scaled = pd.DataFrame(x_scaled, columns=columns)
y2 = y.copy()
y2 = y2.values.ravel()

### Model Training | Testing | Evaluation

In [10]:
%%time
svm = SVC(kernel='linear')
svm.fit(x_scaled,y2)
y_pred = svm.predict(x_test)

print('Accuracy           :', round(accuracy_score(y_test,y_pred),4))
print('Precision          :', round(precision_score(y_test,y_pred),4))
print('Recall             :', round(recall_score(y_test,y_pred),4))
print('Confusion matrix   :\n', confusion_matrix(y_test,y_pred))

Accuracy           : 0.2457
Precision          : 0.2457
Recall             : 1.0
Confusion matrix   :
 [[    0 11360]
 [    0  3700]]
Wall time: 37.3 s


In [None]:
gridsearch = GridSearchCV(estimator  = svm,
                          param_grid = {'kernel' :['linear','rbf','poly'],
                                        'C'      :[0.1,0.5,1,2],
                                        'gamma'  :[0.1,0.01,0.5,1,2],
                                        'degree' :[1,3,5]},
                          cv         = 5)
gridsearch.fit(x_scaled,y2)

In [None]:
gridsearch.best_params_

#### Kernel : poly

In [None]:
%%time
svm = SVC(kernel='poly', C=2, degree=3, gamma=2)
svm.fit(x_scaled,y2)
y_pred = svm.predict(x_test)

print('Accuracy           :', round(accuracy_score(y_test,y_pred),4))
print('Precision          :', round(precision_score(y_test,y_pred),4))
print('Recall             :', round(recall_score(y_test,y_pred),4))
print('Confusion matrix   :\n', confusion_matrix(y_test,y_pred))

#### Kernel : rbf

In [14]:
%%time
svm = SVC(kernel='rbf', C=2, gamma=2)
svm.fit(x,y2)
y_pred = svm.predict(x_test)

print('Accuracy           :', round(accuracy_score(y_test,y_pred),4))
print('Precision          :', round(precision_score(y_test,y_pred),4))
print('Recall             :', round(recall_score(y_test,y_pred),4))
print('Confusion matrix   :\n', confusion_matrix(y_test,y_pred))

Accuracy           : 0.7627
Precision          : 0.6097
Recall             : 0.0954
Confusion matrix   :
 [[11134   226]
 [ 3347   353]]
Wall time: 6min 4s
