In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import itertools
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from auxiliar_func import *
from plot_func import *

In [2]:
# Name of the label column used by the modelling cells below.
target = 'income_50k'

# Read the full dataset, then hold out 30% of the rows as a test set.
# The fixed random_state keeps the split identical across re-runs.
df = pd.read_csv('Census-Income-KDD.csv')
df_tr, df_te = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

In [4]:
# Base classifier handed to the preprocessing sweep.
svm = LinearSVC(random_state=42, max_iter=10000, dual=False)

# Candidate preprocessing options; scaling and imputation are held fixed,
# so the sweep covers cat_age x target_freq (2 x 3 = 6 combinations, matching
# the six "Adjusting for ..." lines in the recorded output).
params = dict(
    scaling=[None],
    imputation=['mode'],
    cat_age=[False, True],
    target_freq=[0.8, 0.9, 1.0],
)

# NOTE(review): test_preprocess_params arrives via a star import and is not
# visible here; presumably it fits `svm` once per combination and returns a
# DataFrame of scores -- confirm against its definition.
results = test_preprocess_params(df_tr, svm, params)

# Persist the sweep results so they can be inspected without re-running it.
results.to_csv('results_svm.csv', index=False)

Adjusting for (None, 'mode', False, 0.8)
Adjusting for (None, 'mode', False, 0.9)
Adjusting for (None, 'mode', False, 1.0)
Adjusting for (None, 'mode', True, 0.8)
Adjusting for (None, 'mode', True, 0.9)
Adjusting for (None, 'mode', True, 1.0)


In [5]:
# Preprocess both splits with the same scaling / imputation / cat_age choices.
# Only the training call passes target_freq, and only the test call passes
# remove_duplicates=False.
# NOTE(review): `preprocessing` is opaque from this cell. If target_freq
# resamples the classes, the cross-validation score below is measured on a
# different class distribution than the test score -- confirm, since the
# recorded run shows a gap between the two (0.818 vs 0.757).
df_tr_pre = preprocessing(
    df_tr,
    scaling=None,
    imputation='mode',
    cat_age=False,
    target_freq=0.8,
)
df_te_pre = preprocessing(
    df_te,
    scaling=None,
    imputation='mode',
    cat_age=False,
    remove_duplicates=False,
)

# Force the test frame onto the training frame's columns: columns that exist
# only in train are added to test filled with 0, and columns that exist only
# in test are dropped.
df_tr_pre, df_te_pre = df_tr_pre.align(df_te_pre, join='left', axis=1, fill_value=0)

# Separate features from the label column for each split.
X_train = df_tr_pre.drop(columns=target)
y_train = df_tr_pre[target]
X_test = df_te_pre.drop(columns=target)
y_test = df_te_pre[target]

# Hyper-parameter grid for the linear SVM: regularisation strength, penalty
# type, and increasing weight on class 1 relative to class 0.
params = {
    'C': [0.05, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'class_weight': [{0: 1, 1: w} for w in (1, 1.5, 2, 2.5, 3)],
}

# dual=False is needed so that the 'l1' penalty entries in the grid are valid
# for LinearSVC with its default loss.
svm = GridSearchCV(
    LinearSVC(random_state=42, max_iter=10000, dual=False),
    params,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
)
svm.fit(X_train, y_train)

# Report the selected configuration, its mean CV score, and the held-out
# score; GridSearchCV.score uses the same 'f1_macro' scorer given above.
print('Best params: ', svm.best_params_)
print('Best score: ', svm.best_score_)
print('Test score: ', svm.score(X_test, y_test))

Best params:  {'C': 0.1, 'class_weight': {0: 1, 1: 1.5}, 'penalty': 'l1'}
Best score:  0.8180948891504711
Test score:  0.756865182625934
