## Importing the necessary libraries

In [1]:
# SVM Classification
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

## Data Collection

In [2]:
salary_train = pd.read_csv('SalaryData_Train(1).csv')
salary_test = pd.read_csv('SalaryData_Test(1).csv')

In [3]:
salary_train.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
salary_train.shape

(18645, 14)

In [5]:
salary_test.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


In [6]:
salary_test.shape

(15060, 14)

In [7]:
salary_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18645 entries, 0 to 18644
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            18645 non-null  int64 
 1   workclass      18645 non-null  object
 2   education      18645 non-null  object
 3   educationno    18645 non-null  int64 
 4   maritalstatus  18645 non-null  object
 5   occupation     18645 non-null  object
 6   relationship   18645 non-null  object
 7   race           18645 non-null  object
 8   sex            18645 non-null  object
 9   capitalgain    18645 non-null  int64 
 10  capitalloss    18645 non-null  int64 
 11  hoursperweek   18645 non-null  int64 
 12  native         18645 non-null  object
 13  Salary         18644 non-null  object
dtypes: int64(5), object(9)
memory usage: 2.0+ MB


In [8]:
salary_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15060 entries, 0 to 15059
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            15060 non-null  int64 
 1   workclass      15060 non-null  object
 2   education      15060 non-null  object
 3   educationno    15060 non-null  int64 
 4   maritalstatus  15060 non-null  object
 5   occupation     15060 non-null  object
 6   relationship   15060 non-null  object
 7   race           15060 non-null  object
 8   sex            15060 non-null  object
 9   capitalgain    15060 non-null  int64 
 10  capitalloss    15060 non-null  int64 
 11  hoursperweek   15060 non-null  int64 
 12  native         15060 non-null  object
 13  Salary         15060 non-null  object
dtypes: int64(5), object(9)
memory usage: 1.6+ MB


In [9]:
salary_train.describe()

Unnamed: 0,age,educationno,capitalgain,capitalloss,hoursperweek
count,18645.0,18645.0,18645.0,18645.0,18645.0
mean,38.490695,10.124001,1073.642692,88.679539,40.900885
std,13.179845,2.530582,7374.919691,403.399003,11.906083
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,47.0,13.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


In [10]:
salary_test.describe()

Unnamed: 0,age,educationno,capitalgain,capitalloss,hoursperweek
count,15060.0,15060.0,15060.0,15060.0,15060.0
mean,38.768327,10.112749,1120.301594,89.041899,40.951594
std,13.380676,2.558727,7703.181842,406.283245,12.062831
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,48.0,13.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,3770.0,99.0


In [11]:
# Merging Train and Test Data
df = salary_train.append(salary_test)
df.reset_index(inplace=True,drop=True)
df

  df = salary_train.append(salary_test)


Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33700,33,Private,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K
33701,39,Private,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
33702,38,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
33703,44,Private,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K


In [12]:
df.shape

(33705, 14)

# EDA

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33705 entries, 0 to 33704
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            33705 non-null  int64 
 1   workclass      33705 non-null  object
 2   education      33705 non-null  object
 3   educationno    33705 non-null  int64 
 4   maritalstatus  33705 non-null  object
 5   occupation     33705 non-null  object
 6   relationship   33705 non-null  object
 7   race           33705 non-null  object
 8   sex            33705 non-null  object
 9   capitalgain    33705 non-null  int64 
 10  capitalloss    33705 non-null  int64 
 11  hoursperweek   33705 non-null  int64 
 12  native         33705 non-null  object
 13  Salary         33704 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.6+ MB


In [14]:
df.describe()

Unnamed: 0,age,educationno,capitalgain,capitalloss,hoursperweek
count,33705.0,33705.0,33705.0,33705.0,33705.0
mean,38.614746,10.118973,1094.490728,88.841448,40.923543
std,13.270476,2.543165,7523.286548,404.684302,11.976223
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,47.0,13.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


In [15]:
df.isna().sum()

age              0
workclass        0
education        0
educationno      0
maritalstatus    0
occupation       0
relationship     0
race             0
sex              0
capitalgain      0
capitalloss      0
hoursperweek     0
native           0
Salary           1
dtype: int64

In [16]:
df.dtypes

age               int64
workclass        object
education        object
educationno       int64
maritalstatus    object
occupation       object
relationship     object
race             object
sex              object
capitalgain       int64
capitalloss       int64
hoursperweek      int64
native           object
Salary           object
dtype: object

In [17]:
#X_train = df.iloc[:salary_train.shape[0], :-1]
#X_train

In [18]:
#Y_train = df.iloc[:salary_train.shape[0], -1]
#Y_train

## Data Preprocessing

In [19]:
numerical_features = df.select_dtypes(include = 'number').columns
numerical_features

# Standardization
SS = StandardScaler()

df[numerical_features] = SS.fit_transform(df[numerical_features])

In [20]:
categorical_features = df.select_dtypes(include = 'object').columns
categorical_features

# Label Encoding
LE = LabelEncoder()
for feature in categorical_features:
  df[feature] = LE.fit_transform(df[feature])

## Splitting the data as **X** and **Y**

In [21]:
X = df.drop('Salary', axis = 1)
X

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native
0,0.029031,5,9,1.132868,4,0,1,4,1,0.143491,-0.219536,-0.077116,38
1,0.857951,4,9,1.132868,2,3,0,4,1,-0.145483,-0.219536,-2.331616,38
2,-0.046325,2,11,-0.439999,0,5,1,4,1,-0.145483,-0.219536,-0.077116,38
3,1.084021,2,1,-1.226432,2,5,0,2,1,-0.145483,-0.219536,-0.077116,38
4,-0.799889,2,9,1.132868,2,9,5,2,0,-0.145483,-0.219536,-0.077116,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
33700,-0.423107,2,9,1.132868,4,9,3,4,1,-0.145483,-0.219536,-0.077116,38
33701,0.029031,2,9,1.132868,0,9,1,4,0,-0.145483,-0.219536,-0.411116,38
33702,-0.046325,2,9,1.132868,2,9,0,4,1,-0.145483,-0.219536,0.757884,38
33703,0.405813,2,9,1.132868,0,0,3,1,1,0.579610,-0.219536,-0.077116,38


In [22]:
Y = df['Salary']
Y

0        0
1        0
2        0
3        0
4        0
        ..
33700    0
33701    0
33702    0
33703    0
33704    1
Name: Salary, Length: 33705, dtype: int64

## Data Partition

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 42)

# Building SVC models **without** Hyperparameter Tuning


## 1.Radial Basis Function (RBF)

In [24]:
model_rbf = SVC(kernel = 'rbf')
model_rbf.fit(X_train, Y_train)

In [25]:
Y_train_pred = model_rbf.predict(X_train)
Y_test_pred = model_rbf.predict(X_test)

In [26]:
print('Training accuracy :', accuracy_score(Y_train, Y_train_pred).round(3))
print('Test Accuracy :', accuracy_score(Y_test, Y_test_pred).round(3))

Training accuracy : 0.802
Test Accuracy : 0.799


## 2.Linear

In [27]:
model_linear = SVC(kernel = 'linear')
model_linear.fit(X_train, Y_train)

In [28]:
Y_train_pred = model_linear.predict(X_train)
Y_test_pred = model_linear.predict(X_test)

In [29]:
print('Training accuracy :', accuracy_score(Y_train, Y_train_pred).round(3))
print('Test Accuracy :', accuracy_score(Y_test, Y_test_pred).round(3))

Training accuracy : 0.807
Test Accuracy : 0.804


## 3. Polynomial

In [30]:
model_poly = SVC(kernel = 'poly')
model_poly.fit(X_train, Y_train)

In [31]:
Y_train_pred = model_poly.predict(X_train)
Y_test_pred = model_poly.predict(X_test)

In [32]:
print('Training accuracy :', accuracy_score(Y_train, Y_train_pred).round(3))
print('Test Accuracy :', accuracy_score(Y_test, Y_test_pred).round(3))

Training accuracy : 0.808
Test Accuracy : 0.807


## 4.Sigmoid

In [33]:
model_sigmoid = SVC(kernel = 'sigmoid')
model_sigmoid.fit(X_train, Y_train)

In [34]:
Y_train_pred = model_sigmoid.predict(X_train)
Y_test_pred = model_sigmoid.predict(X_test)

In [35]:
print('Training accuracy :', accuracy_score(Y_train, Y_train_pred).round(3))
print('Test Accuracy :', accuracy_score(Y_test, Y_test_pred).round(3))

Training accuracy : 0.759
Test Accuracy : 0.765


# Building SVC model **with** Hyperparameter Tuning

##1. Hyperparameter Tuning : **Kernel**, **C**, **Gamma**, **Degree**

In [None]:
svm_model = SVC(random_state = 42)

# Define the hyperparameter grid
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'gamma': ['scale', 'auto', 0.1, 0.5, 1, 10, 5, 50],
    'degree': [2, 3, 4]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator = svm_model, param_grid = param_grid, cv=3, scoring = 'accuracy', n_jobs = -1, verbose = 1)
grid_search.fit(X_train, Y_train)

In [None]:
# Print the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

In [None]:
optimal_params = list(best_params.values())
optimal_params

In [None]:
optimal_C = optimal_params[0]
optimal_degree = optimal_params[1]
optimal_gamma = optimal_params[2]
optimal_kernel = optimal_params[3]

In [None]:
optimal_dict = {'Parameters' : ['C', 'Degree', 'Gamma', 'Kernel'],
                'Opt.Values' : [optimal_C, optimal_degree, optimal_gamma, optimal_kernel,]}

optimal_parameters = pd.DataFrame(optimal_dict)
optimal_parameters

## 2.Building SVC model with optimum values of Hyperparameters

In [None]:
svc = SVC(kernel = optimal_kernel,
          degree = optimal_degree,
          gamma = optimal_gamma,
          C = optimal_C,
          random_state = 42
)

svc.fit(X_train, Y_train)

In [None]:
Y_train_pred = svc.predict(X_train)
Y_test_pred = svc.predict(X_test)

In [None]:
print('Training accuracy :', accuracy_score(Y_train, Y_train_pred).round(3))
print('Test Accuracy :', accuracy_score(Y_test, Y_test_pred).round(3))

## Model Predictions