In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('adult.csv')
data.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'Target']
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Target
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,?,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [3]:
data.shape

(32560, 15)

In [4]:
data.isna().sum().sum()

0

In [5]:
(data.values == '?').sum()

17

In [6]:
data = data.replace('?', np.nan)
data.dropna(inplace=True)

In [7]:
data.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
Target            0
dtype: int64

In [8]:
data.shape

(32543, 15)

In [9]:
# data['Target'].unique()
data['marital-status'].unique()
# data['sex'].unique()

array([' Married-civ-spouse', ' Never-married', ' Divorced', ' Separated',
       ' Married-AF-spouse', ' Married-spouse-absent', ' Widowed'],
      dtype=object)

In [10]:
data = data.replace(' <=50K', 1)
data = data.replace(' >50K', 2)

data = data.replace(' Male', 1)
data = data.replace(' Female', 2)

In [11]:
data['Target'].unique()

array([1, 2], dtype=int64)

In [12]:
data_t = data[['age', 'fnlwgt', 'sex', 'hours-per-week', 'Target']]
data_t.head()

Unnamed: 0,age,fnlwgt,sex,hours-per-week,Target
0,50,83311,1,13,1
2,53,234721,1,40,1
3,28,338409,2,40,1
4,37,284582,2,40,1
6,52,209642,1,45,2


## **Error correcting (Outlier Detection and Removal)**



In [13]:
data_t = data_t.astype(float)
data_t.shape

(32543, 5)

In [14]:
def remove_outliers_zscore(data_t, threshold=3):
  zscore = np.abs((data_t - data_t.mean()) / data_t.std())
  out = zscore > 3
  data_t = data_t[~out.any(axis=1)]

  return data_t

filtered_data = remove_outliers_zscore(data_t)

In [15]:
filtered_data.head()

Unnamed: 0,age,fnlwgt,sex,hours-per-week,Target
0,50.0,83311.0,1.0,13.0,1.0
2,53.0,234721.0,1.0,40.0,1.0
3,28.0,338409.0,2.0,40.0,1.0
4,37.0,284582.0,2.0,40.0,1.0
6,52.0,209642.0,1.0,45.0,2.0


In [16]:
filtered_data.shape

(31654, 5)

In [17]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [18]:
filtered_data.loc[filtered_data['Target'] == 1, 'Target'] = 'Y'
filtered_data.loc[filtered_data['Target'] == 2, 'Target'] = 'N'

le = LabelEncoder()
le.fit(filtered_data['Target'])
le.classes_

  filtered_data.loc[filtered_data['Target'] == 1, 'Target'] = 'Y'


array(['N', 'Y'], dtype=object)

In [19]:
filtered_data['Target'] = le.transform(filtered_data['Target'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Target'] = le.transform(filtered_data['Target'])


In [20]:
filtered_data.head()

Unnamed: 0,age,fnlwgt,sex,hours-per-week,Target
0,50.0,83311.0,1.0,13.0,1
2,53.0,234721.0,1.0,40.0,1
3,28.0,338409.0,2.0,40.0,1
4,37.0,284582.0,2.0,40.0,1
6,52.0,209642.0,1.0,45.0,0


In [21]:
filtered_data['Target'].unique()
data1 = filtered_data

In [22]:
X = data1.drop('Target', axis=1)
y = data1['Target']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
reg = LogisticRegression()
reg.fit(x_train, y_train)

In [24]:
y_pred_reg = reg.predict(x_test)
print('Accuracy - Logistic Regression : ', accuracy_score(y_test, y_pred_reg))

Accuracy - Logistic Regression :  0.7690849742023796


In [25]:
navi = GaussianNB()
navi.fit(x_train, y_train)

In [26]:
y_pred_navi = navi.predict(x_test)
print('Accuracy - Naive Bayes : ', accuracy_score(y_test, y_pred_navi))

Accuracy - Naive Bayes :  0.7744550910813941


In [27]:

print('Accuracy - Logistic Regression : ', accuracy_score(y_test, y_pred_reg))
print('Accuracy - Naive Bayes : ', accuracy_score(y_test, y_pred_navi))

Accuracy - Logistic Regression :  0.7690849742023796
Accuracy - Naive Bayes :  0.7744550910813941
