# Census Income Project

In [1]:
# importing the libraries

import pandas as pd
import numpy as np


In [2]:
# load the census income CSV into a DataFrame

csv_path = "census-income .csv"
df = pd.read_csv(csv_path)

In [3]:
# display the loaded frame (the notebook view is truncated to the first and last rows)
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,annual_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
# basic info from dataset: row count, column names, non-null counts and dtypes

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  annual_income   32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


**Data Preprocessing**


1.   Checking null values
2.   Checking duplicates
3.   Handling '?' placeholder values
4.   Statistical analysis of data
5.   Encoding of categorical columns


In [5]:
# check null values: number of NaN entries in each column

df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
annual_income     0
dtype: int64

In [6]:
# number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

24

In [7]:
# Remove exact duplicate rows. Rebinding the result instead of using inplace=True
# avoids mutating the frame in place across cells; the original row labels are
# kept, so the index has gaps where rows were dropped.
df = df.drop_duplicates()

In [8]:
# re-check: no duplicate rows should remain after the drop
df.duplicated().sum()

0

In [9]:
# confirm the row count after removing duplicates
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32537 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32537 non-null  int64 
 1   workclass       32537 non-null  object
 2   fnlwgt          32537 non-null  int64 
 3   education       32537 non-null  object
 4   education-num   32537 non-null  int64 
 5   marital-status  32537 non-null  object
 6   occupation      32537 non-null  object
 7   relationship    32537 non-null  object
 8   race            32537 non-null  object
 9   sex             32537 non-null  object
 10  capital-gain    32537 non-null  int64 
 11  capital-loss    32537 non-null  int64 
 12  hours-per-week  32537 non-null  int64 
 13  native-country  32537 non-null  object
 14  annual_income   32537 non-null  object
dtypes: int64(6), object(9)
memory usage: 4.0+ MB


In [14]:
# checking unique values in workclass column
# NOTE(review): the recorded output already lists nan although the '?' -> NaN
# replacement cell comes later in the notebook, and the execution counter jumps
# from 9 to 14 -- the cells were presumably run out of order; confirm with
# Restart Kernel -> Run All.

df['workclass'].unique()

array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', nan, 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)

In [15]:
# distinct education levels present in the data
df['education'].unique()

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
       '5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)

In [16]:
# cells whose value is exactly '?' are turned into NaN so that pandas' null
# handling (isna / dropna) treats them as missing
df= df.replace('?', np.nan)

In [17]:
# null counts per column now that '?' has been converted to NaN
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     582
annual_income        0
dtype: int64

In [18]:
df= df.dropna()  # drops every row that contains at least one null value

In [19]:
# verify that no null values remain
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
annual_income     0
dtype: int64

In [20]:
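# Alternative to dropping rows: imputation.
# When dropping would lose too much data (e.g. 50 nulls out of 300 rows),
# use imputers instead:

# replace each missing value with the column's mean / median / mode

# numerical columns -> mean (roughly normal distribution) / median (skewed distribution)
# object columns    -> mode (most frequent value)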

**statistical analysis of numeric columns**

In [21]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,30139.0,30139.0,30139.0,30139.0,30139.0,30139.0
mean,38.44172,189795.0,10.122532,1092.841202,88.439928,40.934703
std,13.131426,105658.6,2.548738,7409.110596,404.445239,11.978753
min,17.0,13769.0,1.0,0.0,0.0,1.0
25%,28.0,117627.5,9.0,0.0,0.0,40.0
50%,37.0,178417.0,10.0,0.0,0.0,40.0
75%,47.0,237604.5,13.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


**statistical analysis of object data type columns**

In [22]:
# summary of the object columns: count, number of unique values,
# most frequent value and how often it occurs
df.describe(include='O')

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,annual_income
count,30139,30139,30139,30139,30139,30139,30139,30139,30139
unique,7,16,7,14,6,5,2,41,2
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K
freq,22264,9834,14059,4034,12457,25912,20366,27487,22633


In [23]:
# same summary transposed: one row per object column
df.describe(include='O').T

Unnamed: 0,count,unique,top,freq
workclass,30139,7,Private,22264
education,30139,16,HS-grad,9834
marital-status,30139,7,Married-civ-spouse,14059
occupation,30139,14,Prof-specialty,4034
relationship,30139,6,Husband,12457
race,30139,5,White,25912
sex,30139,2,Male,20366
native-country,30139,41,United-States,27487
annual_income,30139,2,<=50K,22633


**Label Encoding**

In [24]:
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per categorical column, keyed by column name.
# Refitting a single shared encoder would overwrite its classes_ on every
# column, leaving only the last column's code -> label mapping recoverable.
# With this dict every mapping stays available:
#   encoders[col].classes_                    -> labels, position == integer code
#   encoders[col].inverse_transform(codes)    -> original labels
encoders = {}
temp=[]  # not used by any visible cell; kept so a later reference still resolves

for col in df.columns:
  if df[col].dtype == 'object':
    # fresh encoder for this column; after the loop `le` is the encoder of the
    # last object column, as before
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le


In [25]:
# preview the first rows after encoding: the former object columns now hold integer codes
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,annual_income
0,39,5,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,4,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,2,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,2,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,2,338409,9,13,2,9,5,2,0,0,0,40,4,0


In [26]:
# confirm that every column is numeric after label encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30139 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             30139 non-null  int64
 1   workclass       30139 non-null  int32
 2   fnlwgt          30139 non-null  int64
 3   education       30139 non-null  int32
 4   education-num   30139 non-null  int64
 5   marital-status  30139 non-null  int32
 6   occupation      30139 non-null  int32
 7   relationship    30139 non-null  int32
 8   race            30139 non-null  int32
 9   sex             30139 non-null  int32
 10  capital-gain    30139 non-null  int64
 11  capital-loss    30139 non-null  int64
 12  hours-per-week  30139 non-null  int64
 13  native-country  30139 non-null  int32
 14  annual_income   30139 non-null  int32
dtypes: int32(9), int64(6)
memory usage: 2.6 MB


In [28]:
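# Disabled helper: print the integer code -> original label mapping of the encoder.
# Note: `le` is fitted once per object column, so it only holds the classes of the
# last column it encoded.
# o_l = le.classes_
# for label,value in enumerate(o_l):
#   print(label,value)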

**Separate Independent and Dependent Features**

In [29]:
# every column except the last is a feature; the last column is the target
feature_cols = df.columns[:-1]
target_col = df.columns[-1]

X = df[feature_cols]   # independent features
Y = df[target_col]     # target column

In [30]:
from sklearn.model_selection import train_test_split

# hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

In [31]:
# inspect the training features
x_train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
473,30,2,303990,11,9,4,13,1,4,1,0,0,60,38
5925,57,4,437281,15,10,2,12,0,4,1,0,0,38,38
7098,32,2,182975,15,10,4,12,1,4,0,0,0,20,38
9692,37,2,330826,8,11,2,9,5,4,0,0,0,30,38
6530,40,2,321758,11,9,2,11,0,4,1,0,0,40,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14174,22,0,209131,7,12,4,7,3,4,1,0,0,20,38
21214,47,0,44257,9,13,3,3,1,4,1,0,0,60,38
10650,30,2,291951,11,9,2,2,0,2,1,0,0,40,38
11693,51,5,167065,11,9,2,3,0,4,1,0,0,40,38


In [32]:
# inspect the training target
y_train

473      0
5925     1
7098     0
9692     0
6530     1
        ..
14174    0
21214    1
10650    0
11693    0
2976     0
Name: annual_income, Length: 21097, dtype: int32

In [33]:
# inspect the test features
x_test

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
5161,32,2,194987,11,9,2,0,5,2,0,0,0,40,38
32031,35,2,195516,5,4,2,12,0,4,1,0,0,40,25
17545,32,2,168906,7,12,4,0,1,4,0,0,0,40,38
3383,40,2,327573,8,11,2,2,0,4,1,0,0,40,38
22961,46,4,140121,11,9,0,2,3,4,1,0,0,50,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13559,25,2,193701,9,13,4,7,1,4,0,0,0,38,38
16508,51,5,105943,11,9,2,0,5,4,0,3908,0,40,38
9287,42,2,179524,9,13,5,7,1,4,0,0,0,50,38
8843,46,2,231515,7,12,4,0,1,4,0,0,0,47,4


In [34]:
# inspect the test target
y_test

5161     0
32031    0
17545    0
3383     0
22961    0
        ..
13559    0
16508    0
9287     0
8843     0
7736     0
Name: annual_income, Length: 9042, dtype: int32

**Standardization**

In [35]:
from sklearn.preprocessing import StandardScaler

# Learn each feature's mean and standard deviation from the training rows only,
# then apply that same scaling to both splits.
sc = StandardScaler()
sc.fit(x_train)

x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [36]:
# x_train now holds the standardized values
x_train

array([[-0.64300854, -0.21235596,  1.07265763, ..., -0.21833557,
         1.60003861,  0.26243854],
       [ 1.41560108,  1.89294181,  2.33155256, ..., -0.21833557,
        -0.23685982,  0.26243854],
       [-0.49051894, -0.21235596, -0.07029403, ..., -0.21833557,
        -1.73977672,  0.26243854],
       ...,
       [-0.64300854, -0.21235596,  0.95895276, ..., -0.21833557,
        -0.06986906,  0.26243854],
       [ 0.95813227,  2.94559069, -0.22055938, ..., -0.21833557,
        -0.06986906,  0.26243854],
       [-0.33802934, -0.21235596, -0.07783091, ..., -0.21833557,
        -0.48734597,  0.26243854]])

In [37]:
# x_test scaled with the statistics fitted on the training set
x_test

array([[-0.49051894, -0.21235596,  0.04315583, ..., -0.21833557,
        -0.06986906,  0.26243854],
       [-0.26178454, -0.21235596,  0.04815208, ..., -0.21833557,
        -0.06986906, -1.90540241],
       [-0.49051894, -0.21235596, -0.20317167, ..., -0.21833557,
        -0.06986906,  0.26243854],
       ...,
       [ 0.27192907, -0.21235596, -0.10288773, ..., -0.21833557,
         0.76508478,  0.26243854],
       [ 0.57690827, -0.21235596,  0.38815222, ..., -0.21833557,
         0.51459863, -5.40729933],
       [-0.26178454, -0.21235596,  0.48806782, ..., -0.21833557,
        -0.06986906,  0.26243854]])

**Model Building - 1: Logistic Regression**

In [38]:
from sklearn.linear_model import LogisticRegression

# logistic regression constructed with no arguments (library defaults),
# fitted on the standardized training features
model1 = LogisticRegression()
model1.fit(x_train, y_train)

In [39]:
# predicted class label for every row of the test set
y_pred1= model1.predict(x_test)

In [40]:
# peek at the predictions
y_pred1

array([0, 0, 0, ..., 0, 0, 0])

In [41]:
# Import only the metrics the notebook uses instead of `import *`, so it is clear
# where each name comes from and nothing else leaks into the namespace.
# (Add further names here if a later cell needs another metric.)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# share of test rows whose predicted class equals the true class, as a percentage
accuracy1 = accuracy_score(y_test, y_pred1)
accuracy1*100

81.79606281796062

In [42]:
# confusion matrix on the test set: rows are the true classes, columns the predicted classes
confusion_matrix(y_test, y_pred1)

array([[6336,  456],
       [1190, 1060]], dtype=int64)

In [43]:
# per-class precision, recall and F1 score on the test set
print(classification_report(y_test, y_pred1))

              precision    recall  f1-score   support

           0       0.84      0.93      0.89      6792
           1       0.70      0.47      0.56      2250

    accuracy                           0.82      9042
   macro avg       0.77      0.70      0.72      9042
weighted avg       0.81      0.82      0.80      9042



**Model Building - 2: Random Forest**

In [44]:
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and candidate features at random, so
# without a fixed seed every run produces slightly different scores.
# random_state=0 matches the seed already used for train_test_split.
model2= RandomForestClassifier(random_state=0)
model2.fit(x_train, y_train)

In [45]:
# random forest predictions for the test set
y_pred2 = model2.predict(x_test)

In [46]:
# test-set accuracy of the random forest, as a percentage
accuracy2 = accuracy_score(y_test, y_pred2)
accuracy2*100

85.05861535058615

In [47]:
# confusion matrix of the random forest: rows are the true classes, columns the predicted classes
confusion_matrix(y_test, y_pred2)

array([[6281,  511],
       [ 840, 1410]], dtype=int64)

In [48]:
# per-class precision, recall and F1 score of the random forest on the test set
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.88      0.92      0.90      6792
           1       0.73      0.63      0.68      2250

    accuracy                           0.85      9042
   macro avg       0.81      0.78      0.79      9042
weighted avg       0.85      0.85      0.85      9042



**Model Building - 3: Decision Tree**

In [49]:
from sklearn.tree import DecisionTreeClassifier

# scikit-learn permutes the features at random at each split, so ties between
# equally good splits can resolve differently from run to run; fixing the seed
# makes the fitted tree and its scores reproducible.
# random_state=0 matches the seed already used for train_test_split.
model3 = DecisionTreeClassifier(random_state=0)
model3.fit(x_train, y_train)

In [50]:
# decision tree predictions for the test set
y_pred3 = model3.predict(x_test)

In [51]:
# test-set accuracy of the decision tree, as a percentage
accuracy3 = accuracy_score(y_test, y_pred3)
accuracy3*100

80.35832780358328