# Support Vector Machine


In [274]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix , classification_report

In [275]:
# Load the raw income-evaluation CSV from the working directory and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer="income_evaluation.csv")
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [276]:
# Per-column count of missing values in the freshly loaded frame.
df.isna().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
dtype: int64

In [277]:
# Summarise the raw frame: row count, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1    workclass       32561 non-null  object
 2    fnlwgt          32561 non-null  int64 
 3    education       32561 non-null  object
 4    education-num   32561 non-null  int64 
 5    marital-status  32561 non-null  object
 6    occupation      32561 non-null  object
 7    relationship    32561 non-null  object
 8    race            32561 non-null  object
 9    sex             32561 non-null  object
 10   capital-gain    32561 non-null  int64 
 11   capital-loss    32561 non-null  int64 
 12   hours-per-week  32561 non-null  int64 
 13   native-country  32561 non-null  object
 14   income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [278]:
# Remove exact duplicate rows and report the count before and after.
# A bare expression is only rendered when it is the last line of a cell, so the
# "before" count is printed explicitly instead of being silently discarded.
print(f"duplicate rows before removal: {df.duplicated().sum()}")

df.drop_duplicates(inplace=True)

# Should now be 0.
df.duplicated().sum()

np.int64(0)

In [279]:
# Inspect the raw header names; as the output shows, every header except 'age'
# carries a leading space.
df.columns

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income'],
      dtype='object')

In [280]:
# Sorted distinct target labels (column name and values both keep the CSV's
# leading space at this point).
np.unique(df[' income'].to_numpy())

array([' <=50K', ' >50K'], dtype=object)

In [281]:
# Drop the ' fnlwgt' column (header still has its leading space here).
# `errors="ignore"` keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError. Reassigning avoids mutating the
# frame in place.
df = df.drop(columns=" fnlwgt", errors="ignore")

In [282]:
# Clean column names (no leading spaces), listed in the order of the frame's
# columns after ' fnlwgt' has been removed.
col_name = [
    'age',
    'workclass',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

In [283]:
# Normalise the headers by stripping the whitespace padding carried over from
# the raw CSV. Deriving the names from the actual headers (rather than
# overwriting them positionally) means a changed column order or count cannot
# silently mislabel the data.
df.columns = df.columns.str.strip()

# Guard the schema the rest of the notebook relies on.
assert list(df.columns) == col_name, f"unexpected columns: {list(df.columns)}"

df.columns

Index(['age', 'workclass', 'education', 'education-num', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', 'income'],
      dtype='object')

In [284]:
# The literal string ' ?' (with leading space) is treated as a missing-value
# placeholder here; convert it to a real NaN.
df.loc[df['workclass'].eq(' ?'), 'workclass'] = np.nan

In [285]:
# Same ' ?' -> NaN conversion for the remaining categorical columns that are
# checked for the placeholder.
for column in ('education', 'occupation', 'native-country'):
    df.loc[df[column].eq(' ?'), column] = np.nan


In [286]:
# Show how many NaNs each column holds before removal. Only the last expression
# of a cell is rendered, so the "before" counts are printed explicitly rather
# than being discarded.
print(df.isnull().sum())

# Drop every row that contains at least one NaN.
df.dropna(inplace=True)

# All counts should now be 0.
df.isnull().sum()

age               0
workclass         0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [287]:
# Encode the target as 0/1. The raw labels keep the CSV's leading space.
income_codes = {' <=50K': 0, ' >50K': 1}

# Only encode while the column still holds the raw string labels: re-running
# `.map` on the already-encoded 0/1 values would turn every row into NaN.
if not pd.api.types.is_numeric_dtype(df['income']):
    income_encoded = df['income'].map(income_codes)

    # `.map` yields NaN for any label missing from the mapping; fail loudly
    # instead of letting NaN targets flow into the model.
    unmapped = income_encoded.isna()
    if unmapped.any():
        unknown_labels = sorted(df.loc[unmapped, 'income'].astype(str).unique())
        raise ValueError(f"unmapped income labels: {unknown_labels}")

    df['income'] = income_encoded.astype('int64')

# Class balance of the encoded target.
df['income'].value_counts()

income
0    22633
1     7506
Name: count, dtype: int64

In [288]:
# Categorical features: every column still stored with object dtype.
cat_col = df.select_dtypes(include=['object'])
cat_col.head(n=5)

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [289]:
# Everything that is not object dtype; as the preview shows, this includes the
# encoded 'income' target alongside the numeric features.
num_col = df.select_dtypes(exclude=['object'])
num_col.head(n=5)

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,income
0,39,13,2174,0,40,0
1,50,13,0,0,13,0
2,38,9,0,0,40,0
3,53,7,0,0,40,0
4,28,13,0,0,40,0


In [290]:
# Label encoding: earlier draft, left commented out (superseded by the LabelEncoder cell further down)

# from sklearn.preprocessing import LabelEncoder

# def label_encode():
#     le = LabelEncoder()
#     df[a] = le.fit_transform(df[a])
  

In [291]:
# cat_col.columns
# label_list = ['workclass', 'education', 'marital-status', 'occupation',
#        'relationship', 'race', 'sex', 'native-country']

# for a in label_list:
#     label_encode()

# df.head()


In [292]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each categorical column.
# One encoder is fitted per column and kept in `label_encoders`, so the
# category <-> code mapping of every column stays available (e.g. for
# `inverse_transform` or for encoding new data). Re-fitting a single shared
# encoder column after column would only retain the mapping of the last one.
label_encoders = {
    column: LabelEncoder().fit(cat_col[column]) for column in cat_col.columns
}
cat_col_encoded = cat_col.apply(
    lambda values: label_encoders[values.name].transform(values)
)

# Backward compatibility: `le` used to end up fitted on the last column.
le = label_encoders[cat_col.columns[-1]]

cat_col_encoded.head()


Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,5,9,4,0,1,4,1,38
1,4,9,2,3,0,4,1,38
2,2,11,0,5,1,4,1,38
3,2,1,2,5,0,2,1,38
4,2,9,2,9,5,2,0,4


In [293]:
# Put the numeric columns and the integer-encoded categoricals side by side
# (aligned on the shared row index).
final_df = pd.concat(objs=[num_col, cat_col_encoded], axis="columns")

In [294]:
# Separate the feature matrix from the 'income' target.
x = final_df.drop(columns=['income'])
y = final_df.loc[:, 'income']

In [295]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified on y — confirm that is intended
# given the class imbalance shown by value_counts above.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=43,
)

In [296]:
# Standardise the features. The scaler learns its statistics from the training
# rows only and the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [297]:
# Summarise the assembled modelling frame (dtypes and non-null counts).
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30139 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             30139 non-null  int64
 1   education-num   30139 non-null  int64
 2   capital-gain    30139 non-null  int64
 3   capital-loss    30139 non-null  int64
 4   hours-per-week  30139 non-null  int64
 5   income          30139 non-null  int64
 6   workclass       30139 non-null  int64
 7   education       30139 non-null  int64
 8   marital-status  30139 non-null  int64
 9   occupation      30139 non-null  int64
 10  relationship    30139 non-null  int64
 11  race            30139 non-null  int64
 12  sex             30139 non-null  int64
 13  native-country  30139 non-null  int64
dtypes: int64(14)
memory usage: 3.4 MB
