First we will import the required libraries

In [13]:
#import all libraries
import pandas as pd
import numpy as np
import io
import requests
import seaborn as sns
from matplotlib import pyplot as plt 
import pickle
import os
from pandas.api.types import CategoricalDtype

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score

Downloading the data

In [14]:
# fetching the data into variables
df = pd.read_csv("https://raw.githubusercontent.com/dsrscientist/dataset1/master/census_income.csv")
df.head()

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education_num,Marital_status,Occupation,Relationship,Race,Sex,Capital_gain,Capital_loss,Hours_per_week,Native_country,Income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


Downloading the certain data

In [15]:
def load_dataset(path, urls):
    if not os.path.exists(path):
        os.mkdir(path)
        
        for url in urls:
            data = requests.get(url).content
            filename = os.path.join(path, os.path.basename(url))
            with open(filename, "wb") as file:
                file.write(data)
        

We will create a data folders in the current working directory and store the content of urls

In [16]:
urls = ["http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"]

load_dataset('data', urls)

Next, we load the data into pandas dataframe using the read_csv function

In [17]:
columns = ["age", "workClass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", 
           "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]

train_data = pd.read_csv('data/adult.data', names=columns, sep=' *, *', na_values='?')

test_data = pd.read_csv('data/adult.test', names=columns, sep=' *, *', skiprows=1, na_values='?')

  train_data = pd.read_csv('data/adult.data', names=columns, sep=' *, *', na_values='?')
  test_data = pd.read_csv('data/adult.test', names=columns, sep=' *, *', skiprows=1, na_values='?')


Next, we will explore teh data.. thsi is very important step going to build the model

Exploratory Data Analysis

In [18]:
#lets get into more info about training data using
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             9 non-null      object 
 1   workClass       0 non-null      float64
 2   fnlwgt          0 non-null      float64
 3   education       0 non-null      float64
 4   education-num   0 non-null      float64
 5   marital-status  0 non-null      float64
 6   occupation      0 non-null      float64
 7   relationship    0 non-null      float64
 8   race            0 non-null      float64
 9   sex             0 non-null      float64
 10  capital-gain    0 non-null      float64
 11  capital-loss    0 non-null      float64
 12  hours-per-week  0 non-null      float64
 13  native-country  0 non-null      float64
 14  income          0 non-null      float64
dtypes: float64(14), object(1)
memory usage: 1.2+ KB


Observation:

*there are 32561 samples in training dataset
*there are both categorial and numerical columns in data set
*the columns workclass, occupation, native-country have missing values

similarly for the best dataset,

*there are 16281 samples
*there are no missing values

lets look on to some numerical and categorial values to help some visualization

Handling Numerical columns

We select the numerical columns using the select_dtypes function

In [19]:
num_attributes = train_data.select_dtypes(include=['int'])
print (num_attributes.columns)

['age', 'fnlwgt', 'education-name', 'captal=gain', 'capital-loss', 'hours-per-week']

Index([], dtype='object')


['age',
 'fnlwgt',
 'education-name',
 'captal=gain',
 'capital-loss',
 'hours-per-week']

The variables age,hours-per-week are self-explanatory
*fnlwgt:sampling weight
*education-num: number of years of education in total
*capital-gain and capital-loss:income from investment sources other than salary/wages


Now, Data Visualization

In [20]:
num_attributes.hist(figsize=(10,10))

ValueError: hist method requires numerical columns, nothing to plot.

More information about the data can be gathered by using:

In [29]:
train_data.describe()

Unnamed: 0,workClass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,,,,,,,,,,,,,,
std,,,,,,,,,,,,,,
min,,,,,,,,,,,,,,
25%,,,,,,,,,,,,,,
50%,,,,,,,,,,,,,,
75%,,,,,,,,,,,,,,
max,,,,,,,,,,,,,,


Observations:

*None of the numerical attributes having missing values
*the values are in diff scales. many machine learning models require the values to be on same scale. 
we will use standardScaler from sklearn library to scale the features

Handling Categorial columns

In [26]:
cat_attributes = train_data.select_dtypes(include=['object'])
print(cat_attributes.columns)

['WorkClass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']

Index(['age'], dtype='object')


['WorkClass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country',
 'income']

Data Visualization:

we will use countplot from seaborn package.

In [27]:
sns.countplot(y='workclass', hue='income', data = cat_attributes)

ValueError: Could not interpret input 'workclass'

In [28]:
sns.countplot(y='occupation', hue='income', data = cat_attributes)

ValueError: Could not interpret input 'occupation'

Creating Pipelines:

we will using fit and transform now,
fit= should rteurn an instance of self
transform = the transformation logic can be added here

from columnSelector pipeline,


In [1]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    
    def_init_(self, type):
        self.type = type
        
    def fit(self, X, y=None):
        return self
    
    def transform(self,X):
        return X.select_dtypes(include=[self.type])

SyntaxError: invalid syntax (<ipython-input-1-f13789253c36>, line 3)

Numerical data pipeline

We select the numerical attributes using the ColumnsSelector transformer defined above and then scale the values using StandardScaler

In [2]:
num_pipeline = Pipeline(steps=[("num_attr_selector", ColumnsSelector(type='int')),("scaler", StandardScaler())])

NameError: name 'Pipeline' is not defined

If we call the Fit and Transform methods for the num_pipeline it internally calls the fit and transform methods
for all transformers defined in the pipeline. in this case, the Columnsselector and StandardScaler transformers.

Categorial Data Pipeline

We need to replace missing values in categorial columns. sklearn comes with imputer to handle missing values.
However, Imputer works only with numerical values.

In [None]:
Class CategorialImputer(BaseEstimator, TransformerMixin):
    
    def_init_(self, columns = None, strategy='most_frequent'):
        self.columns = columns
        self.strategy = strategy
        
    def fit(self,X, y=None):
        if self.columns is None:
            self.columns = X.columns
            
        if self.strategy is 'most_frequent':
            self.fill = {column: X[column].value_counts().index[0] for column in self.columns}
            else:
                self.fill = {column: '0' for column in self.columns}
                
            retun self
            
        def transform(self,X):
            X_copy = X.copy()
            for column in self.columns:
                X_copy[column] = X_copy[column].fillna(self.fill[column])
            return X_copy

In [1]:
Class CategorialEncoder(BaseEstimator, TransformerMixin):
    
    def_init_(self, dropFirst=True):
        self.categories=dict()
        self.dropFirst=dropFirst
        
    def fit(self,X, y=None):
        join_df = pd.concat([train_data, test_data])
        join_df = join_df.select_dtypes(include=['object'])   
        if self.strategy is 'most_frequent':
        self.fill = {column: X[column].value_counts().index[0] for column in self.columns}
        
        else:
                self.fill = {column: '0' for column in self.columns}
                
            retun self
            
        def transform(self,X):
            X_copy = X.copy()
            for column in self.columns:
                X_copy[column] = X_copy[column].fillna(self.fill[column])
            return pd.get_dummies(x_copy, drop_First=self.dropFirst)

SyntaxError: invalid syntax (<ipython-input-1-0f2e3446a607>, line 1)

Complete categorial pipeline

In [None]:
cat_pipeline = pipeline(steps=[("cat_attr_selector", ColumnsSelector(type='Object')),
                              ("cat_imputer", CategorialImputer(columns=
                              ['workClass', 'Occupation', 'native-country']))("encoder",
                                CategorialEncoder(dropFirst=True))])

Complete Pipeline

In [None]:
full_pipeline = FeatureUnion([("num_pipe", num_pipeline), ("cat_pipeline", cat_pipeline)])

In [None]:
train_data.drop(['fnlwgt', 'education'], axis=1, inplace=True)
test_data.drop(['fnlwgt', 'education'], axis=1, inplace=True)

Training the model

In [None]:
train_copy = train_data.copy()
train_copy["income"] = train_copy["income"].apply(lamba x:0 
                                                 if x=='<=50k' else 1)

X_train = train_copy.drop('income', axis=1)
Y_train = train_copy['income']


Next, we pass the x_train to the full_pipeline we built

In [None]:
X_train_processed=full_pipeline.fit_transform(x_train)

model = LogisticRegression(random_state=0)
model.fit(X_train_processed, Y_train)

Testing the model

In [None]:
test_copy = test_data.copy()
test_copy["income"] = test_copy["income"].apply(lamda x:0 if
                                               x=='<=50K.' else 1)

X_test = test_copy.drop('income', axis=1)
Y_test = test_copy['income']

In [None]:
X_test_processed = full_pipeline.fit_transform(X_test)

Predicted_classes = model.predict(X_test_processed)

Model Evaluation

we will use accuracy score from sklearn to find accuracy of the model


In [2]:
accuracy_score(predicted_classes, Y_test.values)

NameError: name 'accuracy_' is not defined