In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [2]:
# The file has no header row, so pandas labels the columns by position.
# Fields are separated by a comma followed by a space. A single-character
# separator plus skipinitialspace=True parses that with the default C engine;
# the multi-character sep=", " is interpreted as a regular expression and
# makes pandas fall back to the slower Python engine with a ParserWarning.
income_data = pd.read_csv(
    "../../input/income_data.txt", sep=",", skipinitialspace=True, header=None
)
income_data.iloc[10:15]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
10,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
12,23,Private,122272,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Female,0,0,30,United-States,<=50K
13,32,Private,205019,Assoc-acdm,12,Never-married,Sales,Not-in-family,Black,Male,0,0,50,United-States,<=50K
14,40,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,?,>50K


In [3]:
# "?" is treated as the missing-value marker: turn it into NaN so that
# dropna() removes every row with at least one unknown field.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
income_data = income_data.replace("?", np.nan)
income_data = income_data.dropna()
income_data.iloc[10:15]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
10,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
12,23,Private,122272,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Female,0,0,30,United-States,<=50K
13,32,Private,205019,Assoc-acdm,12,Never-married,Sales,Not-in-family,Black,Male,0,0,50,United-States,<=50K
15,34,Private,245487,7th-8th,4,Married-civ-spouse,Transport-moving,Husband,Amer-Indian-Eskimo,Male,0,0,45,Mexico,<=50K


In [4]:
# The last column is the label to predict; everything before it is a feature.
target_position = income_data.shape[1] - 1
X_raw = income_data.iloc[:, :target_position]
y_raw = income_data.iloc[:, target_position]

In [5]:
# Encoder to apply to each categorical column, keyed by the column's position
# in the raw file. The instances live at module level so that the encoders
# fitted on the training data can be reused to transform new samples later.
#
# OneHotEncoder is left at its default output format and densified inside
# encode_data(): the `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so passing either one breaks on some
# releases.
column_encoders = {
    1: OneHotEncoder(drop="first"),
    # NOTE(review): LabelEncoder assigns integer codes in sorted label order,
    # which is not a meaningful ordering for this column -- confirm intended.
    3: LabelEncoder(),
    5: OneHotEncoder(drop="first"),
    6: OneHotEncoder(drop="first"),
    7: OneHotEncoder(drop="first"),
    8: OneHotEncoder(drop="first"),
    9: OneHotEncoder(drop="first"),
    13: OneHotEncoder(drop="first"),
}


def _one_hot_feature_names(encoder, column):
    """Return the output column names of a fitted OneHotEncoder.

    Names are prefixed with the source column label (e.g. ``"1_Private"``)
    so that two source columns sharing a category value cannot produce
    duplicate output columns.
    """
    prefix = [str(column)]
    if hasattr(encoder, "get_feature_names_out"):  # scikit-learn >= 1.0
        return list(encoder.get_feature_names_out(prefix))
    return list(encoder.get_feature_names(prefix))  # older releases


def encode_data(data, train=False):
    """Encode the categorical columns of ``data`` using ``column_encoders``.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame whose column labels match the keys of
        ``column_encoders``. It is not modified.
    train : bool, default False
        If True the encoders are (re)fitted on ``data`` before transforming;
        if False the previously fitted encoders are reused.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. Columns without an encoder
        keep their position, label-encoded columns are replaced in place, and
        one-hot encoded columns are removed and their indicator columns
        appended at the end, in the order the source columns appear in
        ``data``. All column labels are strings, because recent scikit-learn
        releases reject frames that mix integer and string labels.

    Raises
    ------
    Whatever the underlying encoder raises, e.g. when ``train`` is False and
    the encoder has not been fitted or meets a category it has not seen.
    """
    # reset_index returns a new frame, so assignments below never touch the
    # caller's object, and the positional index lines up with the one-hot
    # frames built from plain arrays.
    encoded = data.reset_index(drop=True)
    one_hot_blocks = []

    for column in data.columns:
        encoder = column_encoders.get(column)
        if encoder is None:
            continue

        if isinstance(encoder, OneHotEncoder):
            values = encoded[column].to_numpy().reshape(-1, 1)
            matrix = encoder.fit_transform(values) if train else encoder.transform(values)
            if hasattr(matrix, "toarray"):  # default output is a scipy sparse matrix
                matrix = matrix.toarray()
            one_hot_blocks.append(
                pd.DataFrame(matrix, columns=_one_hot_feature_names(encoder, column))
            )
            encoded = encoded.drop(columns=[column])
        elif isinstance(encoder, LabelEncoder):
            encoded[column] = (
                encoder.fit_transform(encoded[column]) if train else encoder.transform(encoded[column])
            )

    # Concatenate once instead of once per encoded column.
    encoded = pd.concat([encoded, *one_hot_blocks], axis=1)
    encoded.columns = encoded.columns.astype(str)
    return encoded

# NOTE(review): the encoders are fitted on the full dataset here, before the
# train/test split made later in the notebook.
X = encode_data(X_raw, train=True)

In [6]:
# Map the income labels to integer class ids. The fitted encoder is kept so
# that predictions can be translated back to the original labels.
y_encoder = LabelEncoder()
y_codes = y_encoder.fit_transform(y_raw)
y = pd.Series(y_codes, name="14")

In [7]:
# Fixed seed so that the split -- and therefore every score computed from it --
# is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [8]:
# Linear SVM wrapped in a one-vs-one scheme, seeded for repeatable fits.
# NOTE(review): no feature scaling is applied before the SVM, and the target
# appears to have only two classes, in which case the one-vs-one wrapper adds
# nothing -- confirm both are intended.
base_svm = LinearSVC(random_state=0)
model = OneVsOneClassifier(base_svm)
model.fit(X_train, y_train)

OneVsOneClassifier(estimator=LinearSVC(random_state=0))

In [9]:
# Predicted integer class ids for the held-out rows; the bare name on the
# last line displays the array as the cell output.
y_pred = model.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 1])

In [10]:
# F1 on the held-out rows, averaged over classes weighted by class support.
f1_score(y_true=y_test, y_pred=y_pred, average="weighted")

0.7279946596522224

In [11]:
# Fraction of held-out rows whose predicted class matches the true class.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7865004641294258

In [12]:
# One unlabeled sample, in the same column order as the raw file (target omitted).
input_data = ['37', 'Private', '215646', 'HS-grad', '9', 'Never-married', 'Handlers-cleaners', 'Not-in-family', 'White', 'Male', '0', '0', '40', 'United-States']

# The sample stores numbers as text. Cast every column that has no entry in
# column_encoders (i.e. the non-categorical ones) to integers, so the frame
# carries real numeric values instead of relying on an implicit
# object-to-float conversion at predict time.
input_frame = pd.DataFrame([input_data])
numeric_columns = [column for column in input_frame.columns if column not in column_encoders]
input_frame = input_frame.astype({column: "int64" for column in numeric_columns})

# train defaults to False, so the encoders fitted on the training data are reused.
encoded_input_data = encode_data(input_frame)
prediction = model.predict(encoded_input_data)

In [13]:
# Translate the predicted class id back to the original income label; [0]
# picks the single sample that was predicted.
y_encoder.inverse_transform(prediction)[0]

'<=50K'