# Classifying Income Data Using Support Vector Machines

**Support Vector Machines** are classifiers that are defined using a hyperplane that separates classes  SVMs are good for binary classification problems, but can also be used for $N$ classes with a little bit of creativity.

The data set comes from [here](https://archive.ics.uci.edu/ml/datasets/Census+Income).

In [1]:
# Imports

import numpy
from matplotlib import pyplot
from sklearn import preprocessing
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn import model_selection
import ipywidgets as widgets
from IPython.display import display

import sys

sys.path.append("../")

from common import common_widgets

## Loading Data

In [2]:
data_file = 'income_data.txt'

under50 = 0
over50 = 0
records = []
classes = []

with open(data_file, 'r') as file:
    for line in file.readlines():
        if '?' not in line: # Don't want unknown data points
            row = line[:-1].split(", ") # Exclude the newline at the end
            
            # Watch out for empty arrays at the end
            if row[-1] == "<=50K":
                records.append(row)
                under50 += 1
            elif row[-1] == ">50K":
                records.append(row)
                over50 += 1

data = numpy.array(records)
records = data

label_encoders = []
encoded_data = numpy.empty(data.shape)
for i, item in enumerate(data[0]):
    # For each feature...
    if item.isdigit():
        # If this feature is already an integer, just leave it alone
        # Dump that entire column into the new encoded array
        encoded_data[:, i] = data[:, i]
    else:
        # This is text data, we should assign a label to it
        label_encoders.append(preprocessing.LabelEncoder())
        encoded_data[:, i] = label_encoders[-1].fit_transform(data[:, i])

# The data points are every feature except the income
data = encoded_data[:, :-1].astype(int)

# And here's the actual income classification
classes = encoded_data[:, -1].astype(int)

## Training the Model

In [3]:
split = model_selection.train_test_split(data, classes, test_size=0.2, random_state=0)
data_train, data_test = split[0], split[1]
class_train, class_test = split[2], split[3]
    
svc = LinearSVC(random_state=0)
classifier = OneVsOneClassifier(svc, n_jobs=-1)
classifier.fit(data_train, class_train)
    
# Make some guesses
predictions = classifier.predict(data_test)
    
f1 = model_selection.cross_val_score(classifier, data, classes, scoring="f1_weighted", cv=3)
print("F1 Score: {:.2f}%".format(f1.mean() * 100))

F1 Score: 71.35%


## Setting Up the Widgets

In [4]:
def dropdown(encoder, description):
    return widgets.Dropdown(
        options={value:index for index, value in enumerate(encoder.classes_)},
        description=description
    )

def intslider(data, description):
    min = data.min()
    return widgets.IntSlider(
        value=min,
        min=min,
        max=data.max(),
        description=description
    )

age = intslider(data[:, 0], "age")
workclass = dropdown(label_encoders[0], "workclass")
fnlwgt = intslider(data[:, 2], "fnlwgt")
education = dropdown(label_encoders[1], "education")
education_num = intslider(data[:, 4], "education-num")
marital_status = dropdown(label_encoders[2], "marital-status")
occupation = dropdown(label_encoders[3], "occupation")
relationship = dropdown(label_encoders[4], "relationship")
race = dropdown(label_encoders[5], "race")
sex = dropdown(label_encoders[6], "sex")
capital_gain = intslider(data[:, 10], "capital-gain")
capital_loss = intslider(data[:, 11], "capital-loss")
hours_per_week = intslider(data[:, 12], "hours-per-week")
native_country = dropdown(label_encoders[7], "native-country")

## Prediction

In [1]:
def predict(age, workclass, fnlwgt, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country):
    input_data = [age, workclass, fnlwgt, education, education_num, marital_status, occupation, relationship, race, sex, capital_gain, capital_loss, hours_per_week, native_country]
    encoded_input = [-1] * len(input_data)
    count = 0
    for i, feature in enumerate(input_data):
        if type(feature) == int:
            encoded_input[i] = feature
        else:
            encoded_input[i] = int(label_encoder[count].transform(input_data[i]))
            count += 1
    

    encoded_input = numpy.array(encoded_input)
    predicted_class = classifier.predict([encoded_input])
    predicted_class_label = label_encoders[-1].inverse_transform(predicted_class)[0]
    print(predicted_class_label)

widgets.interactive(
    predict,
    age=age,
    workclass=workclass,
    fnlwgt=fnlwgt,
    education=education,
    education_num=education_num,
    marital_status=marital_status,
    occupation=occupation,
    relationship=relationship,
    race=race,
    sex=sex,
    capital_gain=capital_gain,
    capital_loss=capital_loss,
    hours_per_week=hours_per_week,
    native_country=native_country
)

NameError: name 'widgets' is not defined