In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


In [2]:
import pandas as pd
X_train = pd.read_csv("SalaryData_Train.csv",encoding = "ISO-8859-1")   ##Train data
X_test = pd.read_csv("SalaryData_Test.csv",encoding = "ISO-8859-1")     ##Test data

In [3]:
df = pd.concat([X_train,X_test])

# Explore Pre-Processing

In [4]:
df.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
#the main vision here is we don't need to label encoding the education column the number is already mentioned in it
#To capture the relation we have to see correlation matrix.

In [6]:
df.shape

(45221, 14)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45221 entries, 0 to 15059
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            45221 non-null  int64 
 1   workclass      45221 non-null  object
 2   education      45221 non-null  object
 3   educationno    45221 non-null  int64 
 4   maritalstatus  45221 non-null  object
 5   occupation     45221 non-null  object
 6   relationship   45221 non-null  object
 7   race           45221 non-null  object
 8   sex            45221 non-null  object
 9   capitalgain    45221 non-null  int64 
 10  capitalloss    45221 non-null  int64 
 11  hoursperweek   45221 non-null  int64 
 12  native         45221 non-null  object
 13  Salary         45221 non-null  object
dtypes: int64(5), object(9)
memory usage: 5.2+ MB


In [8]:
df.isnull().sum()

age              0
workclass        0
education        0
educationno      0
maritalstatus    0
occupation       0
relationship     0
race             0
sex              0
capitalgain      0
capitalloss      0
hoursperweek     0
native           0
Salary           0
dtype: int64

In [9]:
df1 = df.copy()

In [10]:
# display categorical variables
categorical = [col for col in X_train.columns if X_train[col].dtypes == 'O']
categorical

['workclass',
 'education',
 'maritalstatus',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native',
 'Salary']

In [11]:
#Above col have more unique variable then it will form more number of column so we apply encoding technique on it
#and extract some features
#Education column aready encoded so discard it

In [12]:
df1 = df1.drop(["education"],axis = 1)

In [13]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45221 entries, 0 to 15059
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            45221 non-null  int64 
 1   workclass      45221 non-null  object
 2   educationno    45221 non-null  int64 
 3   maritalstatus  45221 non-null  object
 4   occupation     45221 non-null  object
 5   relationship   45221 non-null  object
 6   race           45221 non-null  object
 7   sex            45221 non-null  object
 8   capitalgain    45221 non-null  int64 
 9   capitalloss    45221 non-null  int64 
 10  hoursperweek   45221 non-null  int64 
 11  native         45221 non-null  object
 12  Salary         45221 non-null  object
dtypes: int64(5), object(8)
memory usage: 4.8+ MB


In [14]:
import category_encoders as ce
# encode remaining variables with one-hot encoding
encoder = ce.OneHotEncoder(cols=['workclass',
 'maritalstatus',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native'])
df1 = encoder.fit_transform(df1)

  elif pd.api.types.is_categorical(cols):


In [15]:
df1.head()

Unnamed: 0,age,workclass_1,workclass_2,workclass_3,workclass_4,workclass_5,workclass_6,workclass_7,educationno,maritalstatus_1,...,native_32,native_33,native_34,native_35,native_36,native_37,native_38,native_39,native_40,Salary
0,39,1,0,0,0,0,0,0,13,1,...,0,0,0,0,0,0,0,0,0,<=50K
1,50,0,1,0,0,0,0,0,13,0,...,0,0,0,0,0,0,0,0,0,<=50K
2,38,0,0,1,0,0,0,0,9,0,...,0,0,0,0,0,0,0,0,0,<=50K
3,53,0,0,1,0,0,0,0,7,0,...,0,0,0,0,0,0,0,0,0,<=50K
4,28,0,0,1,0,0,0,0,13,0,...,0,0,0,0,0,0,0,0,0,<=50K


In [16]:
X=df1.drop(['Salary'], axis=1)
y =df1['Salary']

In [17]:
#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=20)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(20,'Score'))  #print 10 best features

              Specs         Score
43      capitalgain  1.130196e+08
44      capitalloss  1.853151e+06
0               age  1.151555e+04
45     hoursperweek  8.221133e+03
10  maritalstatus_2  4.811024e+03
31   relationship_2  4.327760e+03
8       educationno  3.225929e+03
9   maritalstatus_1  3.126615e+03
33   relationship_4  1.913133e+03
17     occupation_2  1.716744e+03
42            sex_2  1.420945e+03
19     occupation_4  1.317871e+03
30   relationship_1  1.282016e+03
20     occupation_5  1.106025e+03
34   relationship_5  8.768964e+02
6       workclass_6  8.279557e+02
11  maritalstatus_3  6.987953e+02
41            sex_1  6.839639e+02
32   relationship_3  6.355799e+02
16     occupation_1  3.687133e+02


In [18]:
#As we see in top 20 these are the best feature from data base after one hot encoding 
#So wasting of analysis time we drop the unnecessary column are they:
#education already labelled feature
#race column
#native

In [19]:
df

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15055,33,Private,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K
15056,39,Private,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
15057,38,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
15058,44,Private,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K


In [20]:
df=df.drop(['education','race','native'],axis =1)

In [21]:
df.head()

Unnamed: 0,age,workclass,educationno,maritalstatus,occupation,relationship,sex,capitalgain,capitalloss,hoursperweek,Salary
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,Male,2174,0,40,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,Male,0,0,13,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,Male,0,0,40,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Male,0,0,40,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Female,0,0,40,<=50K


In [22]:
# display categorical variables
categorical = [col for col in df.columns if df[col].dtypes == 'O']
categorical

['workclass', 'maritalstatus', 'occupation', 'relationship', 'sex', 'Salary']

In [23]:
#apply OneHotEncoding


In [24]:
# encode remaining variables with one-hot encoding
encoder = ce.OneHotEncoder(cols=['workclass','maritalstatus', 'occupation', 'relationship', 'sex'])
df = encoder.fit_transform(df)

  elif pd.api.types.is_categorical(cols):


In [25]:
df.head()

Unnamed: 0,age,workclass_1,workclass_2,workclass_3,workclass_4,workclass_5,workclass_6,workclass_7,educationno,maritalstatus_1,...,relationship_3,relationship_4,relationship_5,relationship_6,sex_1,sex_2,capitalgain,capitalloss,hoursperweek,Salary
0,39,1,0,0,0,0,0,0,13,1,...,0,0,0,0,1,0,2174,0,40,<=50K
1,50,0,1,0,0,0,0,0,13,0,...,0,0,0,0,1,0,0,0,13,<=50K
2,38,0,0,1,0,0,0,0,9,0,...,0,0,0,0,1,0,0,0,40,<=50K
3,53,0,0,1,0,0,0,0,7,0,...,0,0,0,0,1,0,0,0,40,<=50K
4,28,0,0,1,0,0,0,0,13,0,...,1,0,0,0,0,1,0,0,40,<=50K


In [26]:
X = df1.drop(['Salary'],axis = 1)
y = df1['Salary']

In [27]:
cols = X.columns

In [28]:
#To get top feature apply KBestFeature method
#apply SelectKBest class to extract top 12 best features
bestfeatures = SelectKBest(score_func=chi2, k=21)
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(21,'Score'))  #print 26 best features

              Specs         Score
43      capitalgain  1.130196e+08
44      capitalloss  1.853151e+06
0               age  1.151555e+04
45     hoursperweek  8.221133e+03
10  maritalstatus_2  4.811024e+03
31   relationship_2  4.327760e+03
8       educationno  3.225929e+03
9   maritalstatus_1  3.126615e+03
33   relationship_4  1.913133e+03
17     occupation_2  1.716744e+03
42            sex_2  1.420945e+03
19     occupation_4  1.317871e+03
30   relationship_1  1.282016e+03
20     occupation_5  1.106025e+03
34   relationship_5  8.768964e+02
6       workclass_6  8.279557e+02
11  maritalstatus_3  6.987953e+02
41            sex_1  6.839639e+02
32   relationship_3  6.355799e+02
16     occupation_1  3.687133e+02
18     occupation_3  3.630119e+02


# FEATURE SCALING

In [29]:
# Scale the data to be between -1 and 1
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [30]:
X= pd.DataFrame(X, columns=[cols])

In [31]:

X

Unnamed: 0,age,workclass_1,workclass_2,workclass_3,workclass_4,workclass_5,workclass_6,workclass_7,educationno,maritalstatus_1,...,native_31,native_32,native_33,native_34,native_35,native_36,native_37,native_38,native_39,native_40
0,0.034190,4.715710,-0.302714,-1.671915,-0.179135,-0.271289,-0.194355,-0.021555,1.128739,1.448435,...,-0.031561,-0.022062,-0.021035,-0.023985,-0.032935,-0.032597,-0.042881,-0.024891,-0.028226,-0.019955
1,0.866399,-0.212057,3.303453,-1.671915,-0.179135,-0.271289,-0.194355,-0.021555,1.128739,-0.690400,...,-0.031561,-0.022062,-0.021035,-0.023985,-0.032935,-0.032597,-0.042881,-0.024891,-0.028226,-0.019955
2,-0.041466,-0.212057,-0.302714,0.598117,-0.179135,-0.271289,-0.194355,-0.021555,-0.438118,-0.690400,...,-0.031561,-0.022062,-0.021035,-0.023985,-0.032935,-0.032597,-0.042881,-0.024891,-0.028226,-0.019955
3,1.093365,-0.212057,-0.302714,0.598117,-0.179135,-0.271289,-0.194355,-0.021555,-1.221546,-0.690400,...,-0.031561,-0.022062,-0.021035,-0.023985,-0.032935,-0.032597,-0.042881,-0.024891,-0.028226,-0.019955
4,-0.798019,-0.212057,-0.302714,0.598117,-0.179135,-0.271289,-0.194355,-0.021555,1.128739,-0.690400,...,-0.031561,-0.022062,-0.021035,-0.023985,-0.032935,-0.032597,-0.042881,-0.024891,-0.028226,-0.019955
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45216,-0.419742,-0.212057,-0.302714,0.598117,-0.179135,-0.271289,-0.194355,-0.021555,1.128739,1.448435,...,-0.031561,-0.022062,-0.021035,-0.023985,-0.032935,-0.032597,-0.042881,-0.024891,-0.028226,-0.019955
45217,0.034190,-0.212057,-0.302714,0.598117,-0.179135,-0.271289,-0.194355,-0.021555,1.128739,-0.690400,...,-0.031561,-0.022062,-0.021035,-0.023985,-0.032935,-0.032597,-0.042881,-0.024891,-0.028226,-0.019955
45218,-0.041466,-0.212057,-0.302714,0.598117,-0.179135,-0.271289,-0.194355,-0.021555,1.128739,-0.690400,...,-0.031561,-0.022062,-0.021035,-0.023985,-0.032935,-0.032597,-0.042881,-0.024891,-0.028226,-0.019955
45219,0.412467,-0.212057,-0.302714,0.598117,-0.179135,-0.271289,-0.194355,-0.021555,1.128739,-0.690400,...,-0.031561,-0.022062,-0.021035,-0.023985,-0.032935,-0.032597,-0.042881,-0.024891,-0.028226,-0.019955


In [32]:
final_X = X[['age', 'workclass_6','educationno', 'maritalstatus_1', 'maritalstatus_2','maritalstatus_3',
        'occupation_1','occupation_2','occupation_3','occupation_4',
       'occupation_5',
       'relationship_1','relationship_2','relationship_3','relationship_4','relationship_5',
       'sex_1','sex_2', 'capitalgain',
       'capitalloss', 'hoursperweek']]

In [33]:
final_X

Unnamed: 0,age,workclass_6,educationno,maritalstatus_1,maritalstatus_2,maritalstatus_3,occupation_1,occupation_2,occupation_3,occupation_4,...,relationship_1,relationship_2,relationship_3,relationship_4,relationship_5,sex_1,sex_2,capitalgain,capitalloss,hoursperweek
0,0.034190,-0.194355,1.128739,1.448435,-0.933416,-0.402215,2.676310,-0.390524,-0.217689,-0.391426,...,1.692449,-0.838402,-0.220185,-0.414343,-0.344119,0.693790,-0.693790,0.142884,-0.218728,-0.078121
1,0.866399,-0.194355,1.128739,-0.690400,1.071334,-0.402215,-0.373649,2.560661,-0.217689,-0.391426,...,-0.590860,1.192745,-0.220185,-0.414343,-0.344119,0.693790,-0.693790,-0.146735,-0.218728,-2.326714
2,-0.041466,-0.194355,-0.438118,-0.690400,-0.933416,2.486233,-0.373649,-0.390524,4.593708,-0.391426,...,1.692449,-0.838402,-0.220185,-0.414343,-0.344119,0.693790,-0.693790,-0.146735,-0.218728,-0.078121
3,1.093365,-0.194355,-1.221546,-0.690400,1.071334,-0.402215,-0.373649,-0.390524,4.593708,-0.391426,...,-0.590860,1.192745,-0.220185,-0.414343,-0.344119,0.693790,-0.693790,-0.146735,-0.218728,-0.078121
4,-0.798019,-0.194355,1.128739,-0.690400,1.071334,-0.402215,-0.373649,-0.390524,-0.217689,2.554760,...,-0.590860,-0.838402,4.541640,-0.414343,-0.344119,-1.441359,1.441359,-0.146735,-0.218728,-0.078121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45216,-0.419742,-0.194355,1.128739,1.448435,-0.933416,-0.402215,-0.373649,-0.390524,-0.217689,2.554760,...,-0.590860,-0.838402,-0.220185,2.413458,-0.344119,0.693790,-0.693790,-0.146735,-0.218728,-0.078121
45217,0.034190,-0.194355,1.128739,-0.690400,-0.933416,2.486233,-0.373649,-0.390524,-0.217689,2.554760,...,1.692449,-0.838402,-0.220185,-0.414343,-0.344119,-1.441359,1.441359,-0.146735,-0.218728,-0.411246
45218,-0.041466,-0.194355,1.128739,-0.690400,1.071334,-0.402215,-0.373649,-0.390524,-0.217689,2.554760,...,-0.590860,1.192745,-0.220185,-0.414343,-0.344119,0.693790,-0.693790,-0.146735,-0.218728,0.754691
45219,0.412467,-0.194355,1.128739,-0.690400,-0.933416,2.486233,2.676310,-0.390524,-0.217689,-0.391426,...,-0.590860,-0.838402,-0.220185,2.413458,-0.344119,0.693790,-0.693790,0.579976,-0.218728,-0.078121


In [34]:
y

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
15055     <=50K
15056     <=50K
15057     <=50K
15058     <=50K
15059      >50K
Name: Salary, Length: 45221, dtype: object

# Splitting dataset into training set and testing

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [36]:
from sklearn.svm import SVC
from sklearn import metrics
svc=SVC() #Default hyperparameters
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
print('Accuracy Score:')
print(metrics.accuracy_score(y_test,y_pred))

Accuracy Score:
0.8538828940385635


In [37]:
svc=SVC(kernel='rbf')
svc.fit(X_train,y_train)  #train on rbf kernal
y_pred=svc.predict(X_test)
print('Accuracy Score:')
print(metrics.accuracy_score(y_test,y_pred))

Accuracy Score:
0.8538828940385635


In [38]:
svc=SVC(kernel='linear')
svc.fit(X_train,y_train)    #train on linear kernal gives good accuracy than rbf
y_pred=svc.predict(X_test)
print('Accuracy Score:')
print(metrics.accuracy_score(y_test,y_pred))

Accuracy Score:
0.8515832301432867


In [39]:
svc=SVC(kernel='poly')
svc.fit(X_train,y_train)        #train on poly kernal
y_pred=svc.predict(X_test)
print('Accuracy Score:')
print(metrics.accuracy_score(y_test,y_pred))

Accuracy Score:
0.8360162745444897


# CV on Linear kernel

In [None]:
#Performing K-fold cross validation

In [None]:
from sklearn.model_selection import cross_val_score
svc=SVC(kernel='linear')
scores = cross_val_score(svc, X, y, cv=6, scoring='accuracy') #cv is cross validation
print(scores)

In [None]:
print(scores.mean())

In [None]:
#Performing K-fold cross validation on rbf kernal

In [None]:
from sklearn.model_selection import cross_val_score
svc=SVC(kernel='rbf')
scores = cross_val_score(svc, X, y, cv=10, scoring='accuracy') #cv is cross validation
print(scores)

In [None]:
print(scores.mean())

In [None]:
#it gives worst accuracy than linear
#So without finding an gamma value move to taking hyperparameter of class

In [None]:
svc=SVC(kernel='linear', C= 1)
svc.fit(X_train,y_train)    #train on linear kernal gives ggod accuracy than rbf
y_pred=svc.predict(X_test)
print('Accuracy Score:')
print(metrics.accuracy_score(y_test,y_pred))

In [None]:
#So as we see we come up with linear kernal with c=1 shows best accuracy i.e, 85%