In [1]:
import numpy as np
import pandas as pd
import io
import requests
import seaborn as sns
from matplotlib import pyplot as plt
import pickle
import os
from pandas.api.types import CategoricalDtype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [22]:
def load_dataset(path, urls):
    if not os.path.exists(path):
        os.mkdir(path)

    for url in urls:
        data = requests.get(url).content
        filename = os.path.join(path, os.path.basename(url))
        with open(filename, "wb") as file:
            file.write(data)

We will create a data folder in the current working directory and store the content of the URLs.

In [21]:
urls = ["http://archive.ics.uci.edu/ml/machine-learning-  databases/adult/adult.data",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names",
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"]
load_dataset('data', urls)

Next, we load the data into a pandas dataframe using the read_csv function.

In [6]:
columns = ["age", "workClass", "fnlwgt", "education", "education-num","marital-status", "occupation", "relationship",
          "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
train_data = pd.read_csv('data/adult.data', names=columns, 
             sep=' *, *', na_values='?')
test_data  = pd.read_csv('data/adult.test', names=columns, 
             sep=' *, *', skiprows=1, na_values='?')

  train_data = pd.read_csv('data/adult.data', names=columns,
  test_data  = pd.read_csv('data/adult.test', names=columns,


There are some whitespaces before and after the data values. To trim all the whitespaces we use the separator ‘ *, *’. The test dataset has a weird first line, hence we skip the line using skiprows=1. The missing values in the dataset are indicated by ?
Next, we will explore the data. This is an important step before going building the model.

Exploratory Data Analysis

In [None]:
RangeIndex: 32561 entries
Data columns      (total 15 columns)
age               32561 non-null int64 
workClass         30725 non-null object 
fnlwgt            32561 non-null int64 
education         32561 non-null object 
education-num     32561 non-null int64 
marital-status    32561 non-null object 
occupation        30718 non-null object 
relationship      32561 non-null object 
race              32561 non-null object 
sex               32561 non-null object 
capital-gain      32561 non-null int64 
capital-loss      32561 non-null int64 
hours-per-week    32561 non-null int64 
native-country    31978 non-null object 
income            32561 non-null object

Observations
There are 32561 samples in the training dataset
There are both categorical and numerical columns in the dataset
The columns workClass, occupation, native-country have missing values
Similarly, for the test dataset
There are 16281 samples
There are no missing values
Let’s look the numerical and the categorical data with the help of some visualizations.
Handling Numerical Columns
We select the numerical columns using the select_dtypes function.

In [17]:
num_attributes = train_data.select_dtypes(include=['int'])
print(num_attributes.columns)
['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',        'hours-per-week']

Index([], dtype='object')


['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week']

The variables age, hours-per-week are self-explanatory.
fnlwgt: sampling weight
education-num: number of years of education in total
capital-gain/capital-loss: income from investment sources other than salary/wages
fnlwgt is not related to the target variable income and will be removed before building the model

# Data Visualizations

In [None]:
num_attributes.hist(figsize=(10,10))

In [19]:
train_data.describe()

Unnamed: 0,workClass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,,,,,,,,,,,,,,
std,,,,,,,,,,,,,,
min,,,,,,,,,,,,,,
25%,,,,,,,,,,,,,,
50%,,,,,,,,,,,,,,
75%,,,,,,,,,,,,,,
max,,,,,,,,,,,,,,


Observations
None of the numerical attributes have missing values
The values are on different scales. Many machine learning models require the values to be on the same scale. We will use StandardScaler from the sklearn library to scale the features.