In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
df = pd.read_csv("income_evaluation.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1    workclass       32561 non-null  object
 2    fnlwgt          32561 non-null  int64 
 3    education       32561 non-null  object
 4    education-num   32561 non-null  int64 
 5    marital-status  32561 non-null  object
 6    occupation      32561 non-null  object
 7    relationship    32561 non-null  object
 8    race            32561 non-null  object
 9    sex             32561 non-null  object
 10   capital-gain    32561 non-null  int64 
 11   capital-loss    32561 non-null  int64 
 12   hours-per-week  32561 non-null  int64 
 13   native-country  32561 non-null  object
 14   income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
df.columns

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income'],
      dtype='object')

In [5]:
np.unique(df[' income'])

array([' <=50K', ' >50K'], dtype=object)

In [6]:
df.columns = df.columns.str.strip()  # Remove leading/trailing spaces from column names
np.unique(df['income'])

array([' <=50K', ' >50K'], dtype=object)

In [7]:
df.drop("fnlwgt", axis=1, inplace=True, errors='ignore')

In [8]:
col_name=['age', ' workclass', ' education', ' education-num',
        ' marital-status', ' occupation', ' relationship', ' race', ' sex',
        ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
        ' income']
df.colums=col_name
df.columns

  df.colums=col_name


Index(['age', 'workclass', 'education', 'education-num', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', 'income'],
      dtype='object')

In [9]:
df.loc[df['workclass']=='?']

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income


In [10]:
df.loc[df['workclass']=='?','workclass'] = np.nan
df.loc[df["occupation"]=="?","occupation"] = np.nan
df.loc[df["native-country"]=="?","native-country"] = np.nan

In [11]:
df.isnull().sum()

age               0
workclass         0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [12]:
df.dropna(inplace=True) #dropping null values

In [13]:
cat_col = df.select_dtypes(include="object")
cat_col.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,income
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K


In [14]:
num_col = df.select_dtypes(exclude="object")
num_col.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
0,39,13,2174,0,40
1,50,13,0,0,13
2,38,9,0,0,40
3,53,7,0,0,40
4,28,13,0,0,40


In [15]:
#to change our income column datatype to int
df['income'] = df['income'].map({' <=50K':0, ' >50K':1})
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [16]:
df["income"].value_counts()

income
0    24720
1     7841
Name: count, dtype: int64

In [17]:
#balance income column with equal number of 0 and 1
df = df.groupby('income').apply(lambda x: x.sample(df.income.value_counts().min(), replace=False)).reset_index(drop=True)
df.income.value_counts()


  df = df.groupby('income').apply(lambda x: x.sample(df.income.value_counts().min(), replace=False)).reset_index(drop=True)


income
0    7841
1    7841
Name: count, dtype: int64

In [18]:
cat_col = df.select_dtypes(include="object")
cat_col.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,Private,HS-grad,Never-married,Other-service,Own-child,White,Male,Mexico
1,Private,12th,Never-married,Sales,Own-child,White,Male,United-States
2,Private,Assoc-voc,Divorced,Machine-op-inspct,Unmarried,White,Male,United-States
3,Private,Some-college,Widowed,Exec-managerial,Not-in-family,Amer-Indian-Eskimo,Female,United-States
4,Private,HS-grad,Separated,Other-service,Unmarried,Black,Female,United-States


In [19]:
num_col = df.select_dtypes(exclude="object")
num_col.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,income
0,24,9,0,0,40,0
1,17,8,0,0,20,0
2,31,11,0,0,43,0
3,42,10,0,0,40,0
4,36,9,0,0,35,0


In [20]:
##label encoding 
# from sklearn.preprocessing import LabelEncoder
# def label_encoder(a):
#     le = LabelEncoder()
#     df[a] = le.fit_transform(df[a])

#label encoding on cat col 
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cat_col_encoded = cat_col.apply(le.fit_transform)
cat_col_encoded

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,4,11,4,8,3,4,1,26
1,4,2,4,12,3,4,1,39
2,4,8,0,7,4,4,1,39
3,4,15,6,4,1,0,0,39
4,4,11,5,8,4,2,0,39
...,...,...,...,...,...,...,...,...
15677,4,9,2,10,0,4,1,21
15678,4,9,4,12,1,4,1,39
15679,4,9,4,1,1,4,1,39
15680,7,12,2,1,0,0,1,39


In [21]:
final_df = pd.concat([cat_col_encoded, num_col], axis=1)

In [22]:
##x and y 
x = final_df.drop("income",axis=1)
y = final_df["income"]

In [23]:
##train test split 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, 
                                                    y,
                                                        test_size=0.2,
                                                        random_state=42)

In [24]:
##standard scaler -- normalise -- range 
##mean - 0 std - 1
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [25]:
#svc
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_scaled,y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [26]:
svc.score(x_test_scaled,y_test)

0.8221230474976092

In [27]:
svc.score(x_train_scaled,y_train)

0.8286966919091271

In [28]:
## hyper parameter tuning 
from sklearn.model_selection import GridSearchCV
grid = {
    "C" : [0.01,0.1,1,10],
    "kernel" : ["linear","rbf","ploy","sigmoid"],
    "degree" : [1,3,5,7],
    "gamma" : [0.01,1]
}
svm = SVC()
svm_cv = GridSearchCV(svm,grid,cv=5)
svm_cv.fit(x_train_scaled,y_train)

KeyboardInterrupt: 

In [None]:
## svm_cv.best_params_
## svm_cv.best_score_
