In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#read the given csv file
df = pd.read_csv('adult.csv')
print(df.head())

   39          State-gov   77516   Bachelors   13        Never-married  \
0  50   Self-emp-not-inc   83311   Bachelors   13   Married-civ-spouse   
1  38            Private  215646     HS-grad    9             Divorced   
2  53            Private  234721        11th    7   Married-civ-spouse   
3  28            Private  338409   Bachelors   13   Married-civ-spouse   
4  37            Private  284582     Masters   14   Married-civ-spouse   

         Adm-clerical   Not-in-family   White     Male   2174   0   40  \
0     Exec-managerial         Husband   White     Male      0   0   13   
1   Handlers-cleaners   Not-in-family   White     Male      0   0   40   
2   Handlers-cleaners         Husband   Black     Male      0   0   40   
3      Prof-specialty            Wife   Black   Female      0   0   40   
4     Exec-managerial            Wife   White   Female      0   0   40   

    United-States   <=50K  
0   United-States   <=50K  
1   United-States   <=50K  
2   United-States   <=50K 

### 1)Rename the columns.

In [3]:
df.columns = [' Age','Workclass','Fnlwgt','Education','education_num','marital_status','occupation','relationship','race','sex','capital_gain','capital_loss','hourse_per_week','native_country','income']
df.head()

Unnamed: 0,Age,Workclass,Fnlwgt,Education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hourse_per_week,native_country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


### 2)Handling the null value

In [4]:
d1 = df.isnull().sum()
d1

 Age               0
Workclass          0
Fnlwgt             0
Education          0
education_num      0
marital_status     0
occupation         0
relationship       0
race               0
sex                0
capital_gain       0
capital_loss       0
hourse_per_week    0
native_country     0
income             0
dtype: int64

<b> In this data set there is no null value</b>

In [5]:
df.dtypes

 Age                int64
Workclass          object
Fnlwgt              int64
Education          object
education_num       int64
marital_status     object
occupation         object
relationship       object
race               object
sex                object
capital_gain        int64
capital_loss        int64
hourse_per_week     int64
native_country     object
income             object
dtype: object

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
lb = LabelEncoder()
df['Workclass'] = lb.fit_transform(df['Workclass'])
df['Education'] = lb.fit_transform(df['Education'])
df['marital_status'] = lb.fit_transform(df['marital_status'])
df['occupation'] = lb.fit_transform(df['occupation'])
df['relationship'] = lb.fit_transform(df['relationship'])
df['race'] = lb.fit_transform(df['race'])
df['sex'] = lb.fit_transform(df['sex'])
df['native_country'] = lb.fit_transform(df['native_country'])


In [8]:
df.head()

Unnamed: 0,Age,Workclass,Fnlwgt,Education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hourse_per_week,native_country,income
0,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,<=50K
1,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,<=50K
2,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,<=50K
3,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,<=50K
4,37,4,284582,12,14,2,4,5,4,0,0,0,40,39,<=50K


### 3)Split data into training and test data.

In [9]:
x = df.iloc[:,:-1]  # independent features 
y = df.iloc[:,-1]   # dependent feature
print(type(x))
print(type(y))
print(y)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
32555     <=50K
32556      >50K
32557     <=50K
32558     <=50K
32559      >50K
Name: income, Length: 32560, dtype: object


In [10]:
print(x.shape)
print(y.shape)

(32560, 14)
(32560,)


In [11]:
print(x.head())

    Age  Workclass  Fnlwgt  Education  education_num  marital_status  \
0    50          6   83311          9             13               2   
1    38          4  215646         11              9               0   
2    53          4  234721          1              7               2   
3    28          4  338409          9             13               2   
4    37          4  284582         12             14               2   

   occupation  relationship  race  sex  capital_gain  capital_loss  \
0           4             0     4    1             0             0   
1           6             1     4    1             0             0   
2           6             0     2    1             0             0   
3          10             5     2    0             0             0   
4           4             5     4    0             0             0   

   hourse_per_week  native_country  
0               13              39  
1               40              39  
2               40              39 

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(26048, 14)
(6512, 14)
(26048,)
(6512,)


<h2>4) Applying the classifiers</h2>
       <b>DecisionTree classifiers</b>,
       <b>RandomForest classifiers</b>,
       <b>KNN classifiers</b>,
       <b>Logestic Regression classifiers</b>,
       <b>SVM classifiers(with linear kernal)</b>

In [14]:
#predict the 

def apply_model(model,x_train,x_test,y_train,y_test):
    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.25)
    model.fit(x_train,y_train)
    print("training score",model.score(x_train,y_train))
    print("test score",model.score(x_test,y_test))
    y_pred = model.predict(x_test)
    print(y_pred )
    cm_model = confusion_matrix(y_test,y_pred)
    print(cm_model)
    print(classification_report(y_test,y_pred))



In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report

#### DecissionnTreeClassifier

In [18]:
m1 = DecisionTreeClassifier(criterion='gini',max_depth=10,min_samples_split=12)
apply_model(m1,x_train,x_test,y_train,y_test)

training score 0.8666257166257166
test score 0.8544226044226044
[' >50K' ' <=50K' ' <=50K' ... ' <=50K' ' <=50K' ' >50K']
[[5680  500]
 [ 685 1275]]
              precision    recall  f1-score   support

       <=50K       0.89      0.92      0.91      6180
        >50K       0.72      0.65      0.68      1960

    accuracy                           0.85      8140
   macro avg       0.81      0.78      0.79      8140
weighted avg       0.85      0.85      0.85      8140



<b>MISCLASSIFICATION PERCENTAGE</b>

In [20]:
per1 = (407+777)/(5786+407+777+1170)
per1 = (per1*100)
per1

14.545454545454545

In [21]:
m1.fit(x_train,y_train)
ypred_m1 = m1.predict(x_test)
print(ypred_m1)

[' <=50K' ' <=50K' ' <=50K' ... ' <=50K' ' <=50K' ' >50K']


<b>RandomForest</b>

In [22]:
# Random Forest
m2 = RandomForestClassifier(n_estimators=17,criterion='entropy',max_depth=10,min_samples_split=12)
apply_model(m2,x_train,x_test,y_train,y_test)

training score 0.8635544635544635
test score 0.8531941031941032
[' <=50K' ' <=50K' ' <=50K' ... ' <=50K' ' <=50K' ' <=50K']
[[5874  286]
 [ 909 1071]]
              precision    recall  f1-score   support

       <=50K       0.87      0.95      0.91      6160
        >50K       0.79      0.54      0.64      1980

    accuracy                           0.85      8140
   macro avg       0.83      0.75      0.77      8140
weighted avg       0.85      0.85      0.84      8140



<b>MISCLASSIFICATION PERCENTAGE</b>

In [24]:
per2 = (269+920)/(5871+269+920+1080)
per2 = (per2*100)
per2

14.606879606879609

In [25]:
m2.fit(x_train,y_train)
ypred_m2 = m2.predict(x_test)
print(ypred_m2)

[' <=50K' ' <=50K' ' <=50K' ... ' <=50K' ' <=50K' ' >50K']


<b>Logistic Regression</b>

In [26]:
# Logistic Regression
m3 = LogisticRegression(solver='liblinear')
apply_model(m3,x_train,x_test,y_train,y_test)

training score 0.7918099918099918
test score 0.7922604422604422
[' <=50K' ' <=50K' ' >50K' ... ' <=50K' ' >50K' ' <=50K']
[[5894  322]
 [1369  555]]
              precision    recall  f1-score   support

       <=50K       0.81      0.95      0.87      6216
        >50K       0.63      0.29      0.40      1924

    accuracy                           0.79      8140
   macro avg       0.72      0.62      0.64      8140
weighted avg       0.77      0.79      0.76      8140



<b>MISSCLASIFICATION PERCENTAGE</b>

In [27]:
per3 = (287+1313)/(5960+287+1313+580)
per3 = (per3*100)
per3

19.656019656019655

In [28]:
m3.fit(x_train,y_train)
ypred_m3 = m3.predict(x_test)
print(ypred_m3)

[' <=50K' ' <=50K' ' <=50K' ... ' <=50K' ' <=50K' ' >50K']


<b> KNN Classifier</b>

In [29]:
# KNN
m4 = KNeighborsClassifier(n_neighbors=19)
apply_model(m4,x_train,x_test,y_train,y_test)

training score 0.8053644553644553
test score 0.7943488943488943
[' <=50K' ' <=50K' ' <=50K' ... ' <=50K' ' <=50K' ' <=50K']
[[6031   98]
 [1576  435]]
              precision    recall  f1-score   support

       <=50K       0.79      0.98      0.88      6129
        >50K       0.82      0.22      0.34      2011

    accuracy                           0.79      8140
   macro avg       0.80      0.60      0.61      8140
weighted avg       0.80      0.79      0.75      8140



<b>MISCLASSIFICATION PERCENTAGE</b>

In [30]:
per4 = (123+1556)/(6048+123+1556+413)
per4 = (per4*100)
per4

20.626535626535627

In [31]:
m4.fit(x_train,y_train)
ypred_m4 = m4.predict(x_test)
print(ypred_m4)

[' <=50K' ' <=50K' ' <=50K' ... ' <=50K' ' <=50K' ' >50K']


<b>SVC Classifier (with linear kernel)</b>

In [32]:
### SVM
m5 = SVC(kernel='linear',C=0.1)
apply_model(m5,x_train,x_test,y_train,y_test)

training score 0.7959459459459459
test score 0.7948402948402948
[' <=50K' ' <=50K' ' <=50K' ... ' <=50K' ' <=50K' ' <=50K']
[[5857  291]
 [1379  613]]
              precision    recall  f1-score   support

       <=50K       0.81      0.95      0.88      6148
        >50K       0.68      0.31      0.42      1992

    accuracy                           0.79      8140
   macro avg       0.74      0.63      0.65      8140
weighted avg       0.78      0.79      0.76      8140



m5.fit(x_train,y_train)
ypred_m5 = m5.predict(x_test)
print(ypred_m5)

In [None]:
<b>MISCLASSIFICATION PERCENTAGE</b>

In [2]:
per5 = (291+1379)/(5857+291+1379+613)
per5 = (per5*100)
pre5

0.20515970515970516