In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# https://archive.ics.uci.edu/ml/datasets/Adult

In [2]:
# Column names for the UCI Adult files, which are read without a header row.
cols = ["Age", "Work", "FinalWeight", "Education", "EducationVal", "MarriageType",
        "Job", "Relationship", "Race", "Gender", "Gain", "Loss", "HoursWorked", "Country",
        "IncomeType"]

income_data = pd.read_csv('Data/adult.data', names=cols)
# The first line of the test file is not a data record. Skipping it while
# parsing (instead of slicing it off afterwards) keeps that line's text from
# influencing the dtypes pandas infers for the columns.
income_data_test = pd.read_csv('Data/adult.test', names=cols, skiprows=1)

# Combine UCI's train and test files into one frame with a fresh 0..n-1 index,
# so row labels are not duplicated between the two sources.
income_df = pd.concat([income_data, income_data_test], ignore_index=True)

# Labels in the test file carry a trailing '.', so both spellings are mapped.
# astype() returns a new Series, so its result has to be assigned back; it also
# fails loudly if a label was not covered by the mapping (NaN cannot be cast).
income_df['IncomeType'] = (
    income_df['IncomeType']
    .map({' <=50K': 1, ' <=50K.': 1, ' >50K': -1, ' >50K.': -1})
    .astype('int32')
)
income_df['IncomeType'].value_counts()

 1    37155
-1    11687
Name: IncomeType, dtype: int64

In [3]:
# Preview the first rows of the combined train + test frame.
income_df.head()

Unnamed: 0,Age,Work,FinalWeight,Education,EducationVal,MarriageType,Job,Relationship,Race,Gender,Gain,Loss,HoursWorked,Country,IncomeType
0,39,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,1
1,50,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,1
2,38,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,1
3,53,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,1
4,28,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,1


In [4]:
# Report (rows, columns) of the combined frame.
print("Shape of income dataset: ", income_df.shape)

Shape of income dataset:  (48842, 15)


### Data Preprocessing

In [5]:
# True if at least one cell anywhere in the frame is null/NaN.
print("Any null values for income set: ", income_df.isnull().values.any())

Any null values for income set:  False


In [6]:
# Remove columns that are not used as features.
# "Education" is dropped as well -- presumably because "EducationVal" already
# carries the same information as a number (e.g. Bachelors -> 13.0 in the
# preview above); confirm before relying on this.
# A single non-mutating drop replaces three in-place calls, so any other
# reference to the original frame is left untouched.
income_df = income_df.drop(columns=["Gain", "Loss", "Education"])

In [7]:
# Preview the frame after Gain, Loss and Education were dropped.
income_df.head()

Unnamed: 0,Age,Work,FinalWeight,EducationVal,MarriageType,Job,Relationship,Race,Gender,HoursWorked,Country,IncomeType
0,39,State-gov,77516.0,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,40.0,United-States,1
1,50,Self-emp-not-inc,83311.0,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,13.0,United-States,1
2,38,Private,215646.0,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,40.0,United-States,1
3,53,Private,234721.0,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40.0,United-States,1
4,28,Private,338409.0,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40.0,Cuba,1


In [8]:
# List the column labels that remain.
income_df.columns

Index(['Age', 'Work', 'FinalWeight', 'EducationVal', 'MarriageType', 'Job',
       'Relationship', 'Race', 'Gender', 'HoursWorked', 'Country',
       'IncomeType'],
      dtype='object')

In [9]:
# # Check the categorical data
# print(income_df["Work"].value_counts())
# print('---')
# print(income_df["EducationVal"].value_counts())
# print('---')
# print(income_df["Job"].value_counts())
# print('---')
# print(income_df["Race"].value_counts())
# print('---')
# print(income_df["Gender"].value_counts())
# print('---')
# print(income_df["IncomeType"].value_counts())

In [10]:
# Single encoder instance shared by data_transform(); it is re-fitted for each
# column, so it only remembers the classes of the column fitted last.
le = LabelEncoder()

def data_transform(df):
    """Replace the categorical columns of ``df`` with integer codes.

    Each listed column is passed through ``le.fit_transform`` in turn. ``le``
    is the shared module-level encoder and is re-fitted on every column, so
    after the call it only holds the fit for the last column processed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all of the columns listed below. It is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    encoded_columns = ('Work', 'MarriageType', 'Job', 'Relationship',
                       'Race', 'Gender', 'IncomeType', 'Country')
    for column in encoded_columns:
        df[column] = le.fit_transform(df[column])
    return df

# Encode the categorical columns; data_transform() returns the frame it was given.
income_df = data_transform(income_df)
# Alternative that is not used here: one-hot encoding
# test = pd.get_dummies(income_df)

In [11]:
# Preview the frame after encoding: the categorical columns now hold integer codes.
income_df.head()

Unnamed: 0,Age,Work,FinalWeight,EducationVal,MarriageType,Job,Relationship,Race,Gender,HoursWorked,Country,IncomeType
0,39,7,77516.0,13.0,4,1,1,4,1,40.0,39,1
1,50,6,83311.0,13.0,2,4,0,4,1,13.0,39,1
2,38,4,215646.0,9.0,0,6,1,4,1,40.0,39,1
3,53,4,234721.0,7.0,2,6,0,2,1,40.0,39,1
4,28,4,338409.0,13.0,2,10,5,2,0,40.0,5,1


In [12]:
# Class counts after encoding (the earlier -1 / 1 labels were re-coded by data_transform).
income_df['IncomeType'].value_counts()

1    37155
0    11687
Name: IncomeType, dtype: int64

In [18]:
# Separate the target column from the feature columns.
# income_X_df keeps the labelled frame; income_X is its plain array form.
income_X_df = income_df.drop(columns="IncomeType")
income_X = income_X_df.values
# Target as a column vector of shape (n_rows, 1).
income_y = income_df['IncomeType'].values.reshape(-1, 1)

In [19]:
# Shape of the target array (a single column after the reshape above).
income_y.shape

(48842, 1)

In [20]:
# Shape of the feature matrix (rows, feature columns).
income_X.shape

(48842, 11)

In [21]:
# train_test_split is imported once, in the notebook's import cell.

# Features and target stacked side by side for a shuffling step; the split
# below does not use this array.
X_and_Y = np.hstack((income_X, income_y))

# Hold out 33% of the rows for testing; the fixed random_state makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    income_X, income_y, test_size=0.33, random_state=42
)

In [22]:
# Collapse each label array to one dimension with one entry per row.
y_train, y_test = (
    labels.reshape(len(labels)) for labels in (y_train, y_test)
)

# from sklearn.preprocessing import LabelEncoder
# label_encoder_X = LabelEncoder()
# Y = label_encoder_X.fit_transform(Y)

In [23]:
# svm and train_test_split come from the notebook's import cell; they are not
# re-imported here.

def calc_error(X, Y, classifier):
    """Return the misclassification rate of ``classifier`` on ``(X, Y)``.

    Parameters
    ----------
    X : array-like
        Samples handed to ``classifier.predict``.
    Y : array-like
        True labels for ``X``.
    classifier : object
        Fitted model exposing ``predict(X)``.

    Returns
    -------
    float
        ``1 - accuracy_score(Y, predictions)``.
    """
    return 1 - accuracy_score(Y, classifier.predict(X))

def draw_heatmap(training_errors, gamma_list, C_list,
                 title=r'Training error w.r.t $C$ and $\gamma$'):
    """Draw an annotated heatmap of errors over a (C, gamma) grid.

    Parameters
    ----------
    training_errors : array-like
        2-D grid of errors with shape (len(C_list), len(gamma_list)).
    gamma_list : list
        Gamma values, used as x tick labels (one per column).
    C_list : list
        C values, used as y tick labels (one per row).
    title : str, optional
        Figure title. Defaults to the training-error title used so far.
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(training_errors, annot=True, fmt='.3f',
                xticklabels=gamma_list, yticklabels=C_list, ax=ax)
    ax.collections[0].colorbar.set_label("error")
    # Raw strings: '\g' is not a valid escape sequence in a normal literal.
    ax.set(xlabel=r'$\gamma$', ylabel='$C$', title=title)
    # Pin the y-limits to the full grid with row 0 on top. The previous
    # "+/- 0.5" nudge only compensated for matplotlib releases that clipped
    # the outer rows by half a cell; elsewhere it added empty padding.
    ax.set_ylim(len(training_errors), 0)
    plt.show()

In [None]:
# Hyper-parameter grid for the RBF-kernel SVM.
C_list = [1, 10, 100, 1000, 10000]
gamma_list = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2]

# Quick way to try draw_heatmap() without fitting anything:
#    errors = np.random.random((len(C_list), len(gamma_list)))
#    draw_heatmap(errors, gamma_list, C_list)

# Best configuration seen so far: training error, fitted classifier, C, gamma.
# NOTE(review): the winner is picked by *training* error only, which favours
# the most flexible model -- confirm this is intended before using
# opt_classifier for anything beyond this heatmap.
opt_e_training, opt_classifier, opt_C, opt_gamma = 1.0, None, None, None

# training_errors[i, j] is the error for C_list[i] combined with gamma_list[j].
training_errors = np.zeros((len(C_list), len(gamma_list)))

for i, C in enumerate(C_list):
    for j, gamma in enumerate(gamma_list):
        # Build the RBF-kernel SVM and fit it on the training split
        # (fit() hands back the fitted estimator).
        classifier = svm.SVC(kernel='rbf', C=C, gamma=gamma).fit(X_train, y_train)

        # Record the error measured on the same data the model was fitted on.
        e_training = calc_error(X_train, y_train, classifier)
        training_errors[i, j] = e_training

        # Keep this model only if it strictly improves on the best so far.
        if e_training < opt_e_training:
            opt_e_training, opt_classifier, opt_C, opt_gamma = (
                e_training, classifier, C, gamma
            )

In [None]:
# from sklearn.utils import shuffle
# NOTE(review): the import above looks like a leftover for a shuffling step planned for later -- confirm or remove.
# Plot the grid of training errors: rows follow C_list, columns follow gamma_list.
draw_heatmap(training_errors, gamma_list, C_list)