### Importing Libraries

In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

### Reading Data

In [None]:
# Training dataset

In [None]:
# Load the training split.
# The file is assumed to have no header row (the previous code overwrote every
# column name right after loading). Passing header=None together with `names`
# keeps the first record as data; with the default header=0 that record was
# consumed as the header and then discarded by the rename.
data = pd.read_csv(
    'adult.data',
    sep=",",
    header=None,
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capitial-gain', 'capitial-loss', 'hours-per-week', 'native-country',
        'income',
    ],
)
# data

In [None]:
# Test dataset

In [None]:
# Load the test split with the same column names as the training split.
# header=None + `names` keeps the first record as data; with the default
# header=0 that record was consumed as the header and lost on rename.
# NOTE(review): the stock UCI adult.test reportedly starts with a banner line
# beginning with '|'; comment='|' skips such a line if it is present and has no
# effect otherwise - confirm against the actual file.
test = pd.read_csv(
    'adult.test',
    sep=",",
    header=None,
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capitial-gain', 'capitial-loss', 'hours-per-week', 'native-country',
        'income',
    ],
    comment='|',
)
test

In [None]:
# columns and their data type

In [None]:
# Show the dtype pandas assigned to each column of the training frame.
data.dtypes

In [None]:
# shape of data

In [None]:
# (number of rows, number of columns) of the training frame.
data.shape

### Getting unique values from each column

In [None]:
# unique values in marital-status

In [None]:
# Distinct raw labels in marital-status (these are grouped into coarser classes later).
data['marital-status'].unique()

In [None]:
# unique value in income

In [None]:
# Distinct raw labels in the target column.
data['income'].unique()

In [None]:
# unique value in education

In [None]:
# Distinct raw labels in education (these are grouped into coarser levels later).
data['education'].unique()

In [None]:
# dropping null and duplicate rows, if any

In [None]:
# Remove every row that has at least one missing value, then remove exact
# duplicate rows. The ' ?' placeholders replaced further down are ordinary
# strings, so this step does not touch them.
data = data.dropna(axis=0, how='any').drop_duplicates()
# data

In [None]:
# checking for any remaining null values

In [None]:
# True if at least one cell anywhere in the training frame is still null.
data.isnull().values.any()

In [None]:
# checking for null values in each column

In [None]:
# Number of null cells in each column of the training frame.
data.isnull().sum()

In [None]:
# frequency of values in workclass

In [None]:
# Row count per workclass label; a ' ?' entry here is the placeholder replaced later.
data['workclass'].value_counts()

In [None]:
# frequency of values in occupation

In [None]:
# Row count per occupation label; a ' ?' entry here is the placeholder replaced later.
data['occupation'].value_counts()

In [None]:
# frequency of values in native-country

In [None]:
# Row count per native-country label; a ' ?' entry here is the placeholder replaced later.
data['native-country'].value_counts()

In [None]:
# frequency of values in marital-status

In [None]:
# Row count per raw marital-status label (before grouping).
data['marital-status'].value_counts()

In [None]:
# frequency of values in sex

In [None]:
# Row count per sex label.
data['sex'].value_counts()

In [None]:
# frequency of values in race

In [None]:
# Row count per race label.
data['race'].value_counts()

In [None]:
# frequency of values in income

In [None]:
# Row count per target label; shows how balanced the two income classes are.
data['income'].value_counts()

In [None]:
# frequency of values in education

In [None]:
# Row count per raw education label (before grouping).
data['education'].value_counts()

### Replacing ? values

In [None]:
# Replace the ' ?' placeholder in the training frame with a fixed default label.
# Raw labels in this file are matched with a leading space everywhere (' ?',
# ' Preschool', ' Bachelors', ...), so the replacement carries one as well.
# Without it, 'Private' would be a new category distinct from ' Private'
# instead of merging into it.
data['workclass'] = data['workclass'].replace(' ?', ' Private')
data['occupation'] = data['occupation'].replace(' ?', ' Prof-specialty')
data['native-country'] = data['native-country'].replace(' ?', ' United-States')
# data

In [None]:
# Replace the ' ?' placeholder in the test frame with a fixed default label.
# Raw labels in this file are matched with a leading space everywhere (' ?',
# ' Preschool', ' Bachelors', ...), so the replacement carries one as well.
# Without it, 'Private' would be a new category distinct from ' Private'
# instead of merging into it.
test['workclass'] = test['workclass'].replace(' ?', ' Private')
test['occupation'] = test['occupation'].replace(' ?', ' Prof-specialty')
test['native-country'] = test['native-country'].replace(' ?', ' United-States')

### Normalization

In [None]:
# Normalizing values in education

In [None]:
# Collapse the raw education labels of the training frame into coarser levels.
# Each entry is (new level, raw labels folded into it); applied in this order.
education_groups_train = [
    ('school', [' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th',
                ' 10th', ' 11th', ' 12th']),
    ('high school', ' HS-grad'),
    ('higher', [' Assoc-voc', ' Assoc-acdm', ' Prof-school', ' Some-college']),
    ('undergrad', ' Bachelors'),
    ('grad', ' Masters'),
    ('doc', ' Doctorate'),
]
for level_name, raw_labels in education_groups_train:
    data['education'] = data['education'].replace(raw_labels, level_name)

In [None]:
# Collapse the raw education labels of the test frame into the same coarser
# levels used for the training frame.
# Each entry is (new level, raw labels folded into it); applied in this order.
education_groups_test = [
    ('school', [' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th',
                ' 10th', ' 11th', ' 12th']),
    ('high school', ' HS-grad'),
    ('higher', [' Assoc-voc', ' Assoc-acdm', ' Prof-school', ' Some-college']),
    ('undergrad', ' Bachelors'),
    ('grad', ' Masters'),
    ('doc', ' Doctorate'),
]
for level_name, raw_labels in education_groups_test:
    test['education'] = test['education'].replace(raw_labels, level_name)

In [None]:
# Normalizing values in marital-status

In [None]:
# Fold the raw marital-status labels of the training frame into three classes:
# 'married', 'not-married' and 'other'.
marital_groups_train = (
    ('married', [' Married-civ-spouse', ' Married-AF-spouse']),
    ('not-married', [' Never-married']),
    ('other', [' Divorced', ' Separated', ' Widowed', ' Married-spouse-absent']),
)
for status_name, raw_labels in marital_groups_train:
    data['marital-status'] = data['marital-status'].replace(raw_labels, status_name)

In [None]:
# Fold the raw marital-status labels of the test frame into the same three
# classes used for the training frame: 'married', 'not-married' and 'other'.
marital_groups_test = (
    ('married', [' Married-civ-spouse', ' Married-AF-spouse']),
    ('not-married', [' Never-married']),
    ('other', [' Divorced', ' Separated', ' Widowed', ' Married-spouse-absent']),
)
for status_name, raw_labels in marital_groups_test:
    test['marital-status'] = test['marital-status'].replace(raw_labels, status_name)

In [None]:
# Normalizing values in income

In [None]:
# Turn the training target into a binary code: ' <=50K' -> 0, ' >50K' -> 1.
for raw_label, class_code in ((' <=50K', 0), (' >50K', 1)):
    data['income'] = data['income'].replace(raw_label, class_code)

In [None]:
# Turn the test target into a binary code: <=50K -> 0, >50K -> 1.
# replace() matches whole values exactly, so any spelling that differs from the
# training labels would be left untouched without an error.
# NOTE(review): the UCI adult.test labels reportedly end with a period
# (' <=50K.' / ' >50K.'); both spellings are mapped here - verify against the file.
test_income_codes = {
    ' <=50K': 0,
    ' >50K': 1,
    ' <=50K.': 0,
    ' >50K.': 1,
}
test.income = test.income.replace(test_income_codes)

In [None]:
# Now checking values in marital-status after normalization

In [None]:
# Row count per marital-status class after grouping; only the grouped names should remain.
data['marital-status'].value_counts()

In [None]:
# Now checking values in education after normalization

In [None]:
# Row count per education level after grouping; only the grouped names should remain.
data['education'].value_counts()

In [None]:
# Now checking values in occupation after removing ? values

In [None]:
# Row count per occupation label after the ' ?' replacement; ' ?' should no longer appear.
data['occupation'].value_counts()

### Getting data ready for model 

In [None]:
# storing all the values other than income in x
# storing the income column in y

In [None]:
# Separate the training target from the features (still un-encoded here).
# Both names are rebound to the encoded and scaled versions in the scaling
# cell further down.
y_train = data['income']
x_train = data.drop(columns=['income'])

In [None]:
# Separate the test target from the features (still un-encoded here).
# Both names are rebound to the encoded and scaled versions in the scaling
# cell further down.
y_test = test['income']
x_test = test.drop(columns=['income'])

In [None]:
# duplicating the current dataset and transforming it into numeric values

In [None]:
# Build numeric copies of both frames for the models (train and test are
# handled in one cell because they must share the same encoding).
#
# Each categorical feature is encoded with a mapping learned from the TRAINING
# frame and reused for the test frame, so a label always gets the same code in
# both. Fitting an independent LabelEncoder per frame assigns codes from each
# frame's own sorted label set, which shifts the codes whenever the sets differ.
# Numeric features are left as they are: label-encoding them replaced every
# value by its rank within that one frame, which is not comparable across
# frames. The StandardScaler applied afterwards handles their scale.
data1 = data.copy()
test1 = test.copy()

for column in data1.columns:
    if column == 'income':
        # Target: encoded per frame so it works whether the labels are still
        # strings or already 0/1. The classes sort the same way in each frame.
        # NOTE(review): assumes both frames contain both income classes.
        data1[column] = LabelEncoder().fit_transform(data1[column])
        test1[column] = LabelEncoder().fit_transform(test1[column])
    elif not pd.api.types.is_numeric_dtype(data1[column]):
        encoder = LabelEncoder().fit(data1[column])
        label_codes = {label: code for code, label in enumerate(encoder.classes_)}
        data1[column] = encoder.transform(data1[column])
        # Labels that never occurred in training get -1 instead of raising.
        test1[column] = test1[column].map(label_codes).fillna(-1).astype(int)

# Preview the first rows of both encoded frames in a single table.
pd.concat([data1.head(), test1.head()], keys=['train', 'test'])

In [None]:
# scaling the data

In [None]:
# Fit the scaler on the encoded training features only, then scale them.
# `ss` is reused for the test features so both splits share one mean/variance.
train_features = data1.drop(columns='income')
ss = StandardScaler()
ss.fit(train_features)
x_train = ss.transform(train_features)
y_train = data1['income']
# print(x)

In [None]:
# Scale the encoded test features with the scaler fitted on the training data
# (no re-fitting on the test split).
test_features = test1.drop(columns='income')
x_test = ss.transform(test_features)
y_test = test1['income']

## Logistic Regression

In [None]:
# Logistic-regression classifier with scikit-learn's default hyper-parameters.
lr_model = LogisticRegression()

In [None]:
# Train on the scaled training features. fit() returns the fitted estimator,
# so `model` and `lr_model` refer to the same object.
model = lr_model.fit(x_train, y_train)

In [None]:
# Predicting the test class

In [None]:
# Predicted income class for every row of the scaled test features.
prediction_lr = model.predict(x_test)

In [None]:
# Mean accuracy of the logistic-regression model on each split, 3 decimals.
train_accuracy_lr = lr_model.score(x_train, y_train)
test_accuracy_lr = lr_model.score(x_test, y_test)
print(f"Acc on training data: {train_accuracy_lr:,.3f}")
print(f"Acc on test data: {test_accuracy_lr:,.3f}")

In [None]:
# Confusion matrix for logistic regression: rows are true classes, columns are
# predicted classes, both in sorted label order.
confusion_matrix(y_test, prediction_lr)

In [None]:
# Precision = tp / (tp + fp)
# Computed from the live confusion matrix rather than from literals copied out
# of an earlier run, so the figure cannot go stale when data or model change.
# With labels=[0, 1], ravel() yields (tn, fp, fn, tp): label 1 is the positive class.
# NOTE(review): the removed hard-coded figures may have been read with label 0
# as the positive class - confirm which class should be reported.
tn_lr, fp_lr, fn_lr, tp_lr = confusion_matrix(y_test, prediction_lr, labels=[0, 1]).ravel()
print('Precision =', tp_lr / (tp_lr + fp_lr))

In [None]:
# Recall = tp / (tp + fn)
# Computed from the live confusion matrix rather than from literals copied out
# of an earlier run, so the figure cannot go stale when data or model change.
# With labels=[0, 1], ravel() yields (tn, fp, fn, tp): label 1 is the positive class.
# NOTE(review): the removed hard-coded figures may have been read with label 0
# as the positive class - confirm which class should be reported.
tn_lr, fp_lr, fn_lr, tp_lr = confusion_matrix(y_test, prediction_lr, labels=[0, 1]).ravel()
print('Recall =', tp_lr / (tp_lr + fn_lr))

## knn algorithm

In [None]:
# k-nearest-neighbours classifier that votes over the 3 closest training points.
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Fit on the scaled training features and their income labels.
knn.fit(x_train, y_train)

In [None]:
# Predicted income class for every row of the scaled test features.
prediction_knn = knn.predict(x_test)
# Display the predictions as the cell output.
prediction_knn

In [None]:
# Mean accuracy of the k-NN model on each split, 3 decimals.
train_accuracy_knn = knn.score(x_train, y_train)
test_accuracy_knn = knn.score(x_test, y_test)
print(f"Acc on training data: {train_accuracy_knn:,.3f}")
print(f"Acc on test data: {test_accuracy_knn:,.3f}")

In [None]:
# Confusion matrix for k-NN: rows are true classes, columns are predicted
# classes, both in sorted label order.
confusion_matrix(y_test, prediction_knn)

In [None]:
# Precision = tp / (tp + fp)
# Computed from the live confusion matrix rather than from literals copied out
# of an earlier run, so the figure cannot go stale when data or model change.
# With labels=[0, 1], ravel() yields (tn, fp, fn, tp): label 1 is the positive class.
# NOTE(review): the removed hard-coded figures may have been read with label 0
# as the positive class - confirm which class should be reported.
tn_knn, fp_knn, fn_knn, tp_knn = confusion_matrix(y_test, prediction_knn, labels=[0, 1]).ravel()
print('Precision =', tp_knn / (tp_knn + fp_knn))

In [None]:
# Recall = tp / (tp + fn)
# Computed from the live confusion matrix rather than from literals copied out
# of an earlier run, so the figure cannot go stale when data or model change.
# With labels=[0, 1], ravel() yields (tn, fp, fn, tp): label 1 is the positive class.
# NOTE(review): the removed hard-coded figures may have been read with label 0
# as the positive class - confirm which class should be reported.
tn_knn, fp_knn, fn_knn, tp_knn = confusion_matrix(y_test, prediction_knn, labels=[0, 1]).ravel()
print('Recall =', tp_knn / (tp_knn + fn_knn))