In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import warnings

from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Suppress every warning raised through the `warnings` module from here on.
warnings.filterwarnings(action='ignore')

# Load the voting records from a CSV file in the current working directory.
voting_csv_path = 'congressional_voting_dataset.csv'
df = pd.read_csv(voting_csv_path)

# Bare last expression: rich display of the loaded frame.
df

Unnamed: 0,handicapped_infants,water_project_cost_sharing,adoption_of_the_budget_resolution,physician_fee_freeze,el_salvador_aid,religious_groups_in_schools,anti_satellite_test_ban,aid_to_nicaraguan_contras,mx_missile,immigration,synfuels_corporation_cutback,education_spending,superfund_right_to_sue,crime,duty_free_exports,export_administration_act_south_africa,political_party
0,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y,republican
1,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?,republican
2,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n,democrat
3,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y,democrat
4,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y,democrat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,n,n,y,y,y,y,n,n,y,y,n,y,y,y,n,y,republican
431,n,n,y,n,n,n,y,y,y,y,n,n,n,n,n,y,democrat
432,n,?,n,y,y,y,n,n,n,n,y,y,y,y,n,y,republican
433,n,n,n,y,y,y,?,?,?,?,n,y,y,y,n,y,republican


## Po namyśle uznaliśmy, że "?" nie należy traktować jako brak danych, ale jako oddzielną klasę.
#### Użyliśmy naturalnego dla nas mapowania na zbiór {-1, 0, 1}: y → 1, n → -1, ? → 0.
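#### Zamieniliśmy też od razu wartości kolumny political_party -> {republican, democrat} na {1,0}. Nazwa is_republican pojawia się tylko w wyświetlanej poniżej tabeli – w samym `df` kolumna nadal nazywa się political_party.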

In [2]:
# Numeric encoding for every raw value in the frame:
#   votes: 'y' -> 1, 'n' -> -1, '?' -> 0  ('?' is kept as its own category)
#   party: 'republican' -> 1, 'democrat' -> 0
d = {'y': 1, 'n': -1, '?': 0, 'democrat': 0, 'republican': 1}

# Encode only the columns that still hold raw (non-numeric) values.
# Series.map turns every value that is not a key of `d` into NaN, so mapping
# an already encoded column a second time would wipe it out; skipping numeric
# columns makes this cell safe to re-run.
for col in df.columns:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(d)

# Display-only preview with the target shown as `is_republican`. The renamed
# frame is not assigned, so `df` itself keeps the `political_party` column.
df.rename(columns={'political_party': 'is_republican'})

Unnamed: 0,handicapped_infants,water_project_cost_sharing,adoption_of_the_budget_resolution,physician_fee_freeze,el_salvador_aid,religious_groups_in_schools,anti_satellite_test_ban,aid_to_nicaraguan_contras,mx_missile,immigration,synfuels_corporation_cutback,education_spending,superfund_right_to_sue,crime,duty_free_exports,export_administration_act_south_africa,is_republican
0,-1,1,-1,1,1,1,-1,-1,-1,1,0,1,1,1,-1,1,1
1,-1,1,-1,1,1,1,-1,-1,-1,-1,-1,1,1,1,-1,0,1
2,0,1,1,0,1,1,-1,-1,-1,-1,1,-1,1,1,-1,-1,0
3,-1,1,1,-1,0,1,-1,-1,-1,-1,1,-1,1,-1,-1,1,0
4,1,1,1,-1,1,1,-1,-1,-1,-1,1,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,-1,-1,1,1,1,1,-1,-1,1,1,-1,1,1,1,-1,1,1
431,-1,-1,1,-1,-1,-1,1,1,1,1,-1,-1,-1,-1,-1,1,0
432,-1,0,-1,1,1,1,-1,-1,-1,-1,1,1,1,1,-1,1,1
433,-1,-1,-1,1,1,1,0,0,0,0,-1,1,1,1,-1,1,1


## Modelowanie
Podział na zbiór treningowy i testowy

In [3]:
from sklearn.model_selection import train_test_split
# Features are the first 16 columns of `df`; the target is `political_party`.
feature_columns = df.columns[:16]
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns],
    df['political_party'],
    test_size=0.3,    # 30% of the rows are held out for testing
    random_state=42,  # fixed seed for the shuffle
)

Wybór modeli

In [39]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
#from xgboost import XGBClassifier

# Candidate classifiers compared in the cells below.
#
# The estimator wrapped by each BaggingClassifier is passed positionally:
# scikit-learn 1.2 renamed the `base_estimator` keyword to `estimator` (the
# old name was removed in 1.4), while the first positional parameter is the
# wrapped estimator under both spellings.
models = [
    DecisionTreeClassifier(random_state=1, max_depth=3),
    KNeighborsClassifier(),
    LogisticRegression(random_state=1, max_iter=1000),
    BaggingClassifier(DecisionTreeClassifier(random_state=1, max_depth=3), n_estimators=10, random_state=1),
    BaggingClassifier(KNeighborsClassifier(), n_estimators=10, random_state=42),
    BaggingClassifier(LogisticRegression(random_state=1, max_iter=1000), n_estimators=10, random_state=42),
    RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_split=5, max_features=3, random_state=1, n_jobs=-1),
    AdaBoostClassifier(random_state=1),
    GradientBoostingClassifier(random_state=1, learning_rate=0.01),
    # Disabled together with the commented-out xgboost import:
    #XGBClassifier(random_state=1, learning_rate=0.01, booster='gbtree', max_depth=4, eval_metric="logloss", use_label_encoder=False)
]

# Metric names used as row labels of the results table.
indexes = ["AUC", "f1", "accuracy", "precision", "recall"]
# Starts with one row per metric and no columns; a column per model is added
# by the cross-validation cells through `scores.at[...]`.
scores = pd.DataFrame(0, index=indexes, columns=[])

In [40]:
# Soft-voting ensemble of logistic regression, bagged logistic regression
# and AdaBoost.
# The estimator wrapped by BaggingClassifier is passed positionally because
# the `base_estimator` keyword was renamed to `estimator` in scikit-learn 1.2
# and removed in 1.4; the positional form works with both.
model_soft = VotingClassifier(
    estimators=[
        ("LogisticRegression", LogisticRegression(random_state=1, max_iter=1000)),
        ("BaggingClassifier", BaggingClassifier(LogisticRegression(random_state=1, max_iter=1000), n_estimators=10, random_state=42)),
        ("AdaBoostClassifier", AdaBoostClassifier(random_state=1)),
    ],
    voting='soft',
)

# Remove any VotingClassifier appended by an earlier run of this cell before
# adding the fresh one, so re-running the cell does not duplicate the entry.
models[:] = [m for m in models if not isinstance(m, VotingClassifier)]
models.append(model_soft)

Kroswalidacja

In [41]:
from sklearn.model_selection import cross_val_score
# Mean F1 over cross-validation with cv=10 for every candidate model, stored
# in the "f1" row of `scores` under a "<position>. <class name>" column label.
for position, model in enumerate(models, start=1):
    model_name = type(model).__name__
    fold_scores = cross_val_score(model, X_train, y_train, scoring='f1', cv=10)
    mean_score = fold_scores.mean()
    scores.at["f1", f"{position}. {model_name}"] = mean_score
    print()
    print(model_name, mean_score)


DecisionTreeClassifier 0.91255534159882

KNeighborsClassifier 0.9337749287749286

LogisticRegression 0.9468454106280193

BaggingClassifier 0.9247829803047196

BaggingClassifier 0.9226752136752137

BaggingClassifier 0.9471417069243158

RandomForestClassifier 0.9429083364300757

AdaBoostClassifier 0.9468423138857922

GradientBoostingClassifier 0.9188052768487551

VotingClassifier 0.9468454106280193


In [42]:
# Mean ROC AUC over cross-validation with cv=10 for every candidate model,
# stored in the "AUC" row of `scores` under a "<position>. <class name>"
# column label.
for position, model in enumerate(models, start=1):
    model_name = type(model).__name__
    fold_scores = cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=10)
    mean_score = fold_scores.mean()
    scores.at["AUC", f"{position}. {model_name}"] = mean_score
    print()
    print(model_name, mean_score)


DecisionTreeClassifier 0.9670368496026391

KNeighborsClassifier 0.9737085769980507

LogisticRegression 0.9909356725146198

BaggingClassifier 0.9796024516419253

BaggingClassifier 0.9749147173489279

BaggingClassifier 0.9923001949317738

RandomForestClassifier 0.986802744039586

AdaBoostClassifier 0.9883472034787826

GradientBoostingClassifier 0.9779239766081871

VotingClassifier 0.991398635477583


In [43]:
# Mean accuracy over cross-validation with cv=10 for every candidate model,
# stored in the "accuracy" row of `scores` under a "<position>. <class name>"
# column label.
for position, model in enumerate(models, start=1):
    model_name = type(model).__name__
    fold_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=10)
    mean_score = fold_scores.mean()
    scores.at["accuracy", f"{position}. {model_name}"] = mean_score
    print()
    print(model_name, mean_score)


DecisionTreeClassifier 0.9307526881720432

KNeighborsClassifier 0.9438709677419356

LogisticRegression 0.956989247311828

BaggingClassifier 0.9373118279569892

BaggingClassifier 0.9339784946236559

BaggingClassifier 0.9570967741935485

RandomForestClassifier 0.953763440860215

AdaBoostClassifier 0.9570967741935485

GradientBoostingClassifier 0.9341935483870968

VotingClassifier 0.956989247311828


In [44]:
# Mean recall over cross-validation with cv=10 for every candidate model,
# stored in the "recall" row of `scores` under a "<position>. <class name>"
# column label.
for position, model in enumerate(models, start=1):
    model_name = type(model).__name__
    fold_scores = cross_val_score(model, X_train, y_train, scoring='recall', cv=10)
    mean_score = fold_scores.mean()
    scores.at["recall", f"{position}. {model_name}"] = mean_score
    print()
    print(model_name, mean_score)


DecisionTreeClassifier 0.908974358974359

KNeighborsClassifier 0.9749999999999999

LogisticRegression 0.9499999999999998

BaggingClassifier 0.9423076923076922

BaggingClassifier 0.9666666666666666

BaggingClassifier 0.9499999999999998

RandomForestClassifier 0.9499999999999998

AdaBoostClassifier 0.9499999999999998

GradientBoostingClassifier 0.9173076923076922

VotingClassifier 0.9499999999999998


In [45]:
# Mean precision over cross-validation with cv=10 for every candidate model,
# stored in the "precision" row of `scores` under a
# "<position>. <class name>" column label.
for position, model in enumerate(models, start=1):
    model_name = type(model).__name__
    fold_scores = cross_val_score(model, X_train, y_train, scoring='precision', cv=10)
    mean_score = fold_scores.mean()
    scores.at["precision", f"{position}. {model_name}"] = mean_score
    print()
    print(model_name, mean_score)


DecisionTreeClassifier 0.9217582417582418

KNeighborsClassifier 0.8980402930402931

LogisticRegression 0.9473076923076924

BaggingClassifier 0.9117765567765568

BaggingClassifier 0.8847802197802197

BaggingClassifier 0.9478571428571427

RandomForestClassifier 0.9381868131868132

AdaBoostClassifier 0.947069597069597

GradientBoostingClassifier 0.9242307692307692

VotingClassifier 0.9473076923076924


In [46]:
# Reorient the table to one row per model, but only while the metric names
# are still on the index. Without this guard a second run of the cell would
# transpose the table back to one row per metric.
if set(indexes).issubset(scores.index):
    scores = scores.transpose()

# Average the metric columns only, so a `mean` column left over from an
# earlier run of this cell is not folded into the new average.
scores['mean'] = scores[indexes].mean(axis=1)

# Show the models ranked from best to worst average score.
scores.sort_values("mean", ascending=False)

Unnamed: 0,AUC,f1,accuracy,precision,recall,mean
6. BaggingClassifier,0.9923,0.947142,0.957097,0.947857,0.95,0.958879
10. VotingClassifier,0.991399,0.946845,0.956989,0.947308,0.95,0.958508
3. LogisticRegression,0.990936,0.946845,0.956989,0.947308,0.95,0.958416
8. AdaBoostClassifier,0.988347,0.946842,0.957097,0.94707,0.95,0.957871
7. RandomForestClassifier,0.986803,0.942908,0.953763,0.938187,0.95,0.954332
2. KNeighborsClassifier,0.973709,0.933775,0.943871,0.89804,0.975,0.944879
4. BaggingClassifier,0.979602,0.924783,0.937312,0.911777,0.942308,0.939156
5. BaggingClassifier,0.974915,0.922675,0.933978,0.88478,0.966667,0.936603
9. GradientBoostingClassifier,0.977924,0.918805,0.934194,0.924231,0.917308,0.934492
1. DecisionTreeClassifier,0.967037,0.912555,0.930753,0.921758,0.908974,0.928215


In [18]:
# Fit each candidate on the training split, then print the value returned by
# its `score` method on the held-out test split.
for model in models:
    model.fit(X_train, y_train)
    model_name = type(model).__name__
    test_score = model.score(X_test, y_test)
    print()
    print(model_name, test_score)


DecisionTreeClassifier 0.9465648854961832

KNeighborsClassifier 0.9541984732824428

LogisticRegression 0.9847328244274809

BaggingClassifier 0.9618320610687023

BaggingClassifier 0.9541984732824428

BaggingClassifier 0.9847328244274809

RandomForestClassifier 0.9541984732824428

AdaBoostClassifier 0.9694656488549618

GradientBoostingClassifier 0.9770992366412213

XGBClassifier 0.9694656488549618


In [47]:
# Models kept for hyper-parameter tuning.
# The estimator wrapped by BaggingClassifier is passed positionally because
# the `base_estimator` keyword was renamed to `estimator` in scikit-learn 1.2
# and removed in 1.4; the positional form works with both.
final_models = [
    LogisticRegression(random_state=1),
    BaggingClassifier(LogisticRegression(random_state=1, max_iter=1000), random_state=42),
    AdaBoostClassifier(random_state=1),
]

In [53]:
# Grid of iteration limits and tolerances tried for logistic regression.
parametersRegression = dict(
    max_iter=[1000, 500, 3000, 2000],
    tol=[1e-4, 1e-6, 1e-3],
)

# Search over every combination in the grid with cv=4.
clf_grid = GridSearchCV(
    estimator=LogisticRegression(random_state=1),
    param_grid=parametersRegression,
    cv=4,
)

clf_grid.fit(X_train, y_train)

GridSearchCV(cv=4, estimator=LogisticRegression(random_state=1),
             param_grid={'max_iter': [1000, 500, 3000, 2000],
                         'tol': [0.0001, 1e-06, 0.001]})

In [54]:
# Widen displayed columns to 100 characters so the `params` dicts are readable.
pd.set_option('display.max_colwidth', 100)

# Parameter combinations of the latest grid search, best mean score first.
(
    pd.DataFrame(clf_grid.cv_results_)
    .loc[:, ['params', 'mean_test_score']]
    .sort_values(by='mean_test_score', ascending=False)
)

Unnamed: 0,params,mean_test_score
0,"{'max_iter': 1000, 'tol': 0.0001}",0.957237
1,"{'max_iter': 1000, 'tol': 1e-06}",0.957237
2,"{'max_iter': 1000, 'tol': 0.001}",0.957237
3,"{'max_iter': 500, 'tol': 0.0001}",0.957237
4,"{'max_iter': 500, 'tol': 1e-06}",0.957237
5,"{'max_iter': 500, 'tol': 0.001}",0.957237
6,"{'max_iter': 3000, 'tol': 0.0001}",0.957237
7,"{'max_iter': 3000, 'tol': 1e-06}",0.957237
8,"{'max_iter': 3000, 'tol': 0.001}",0.957237
9,"{'max_iter': 2000, 'tol': 0.0001}",0.957237


In [56]:
# Grid of ensemble size, sample/feature fractions and feature bootstrapping
# tried for the bagged logistic regression.
parametersBagging = {
    'n_estimators': [10, 5, 15, 3],
    'max_samples': [1.0, .5, .3, .6],
    'max_features': [1.0, .5, .3, .6],
    'bootstrap_features': [True, False]
}

# The wrapped estimator is passed positionally because the `base_estimator`
# keyword was renamed to `estimator` in scikit-learn 1.2 and removed in 1.4;
# the positional form works with both.
clf_grid = GridSearchCV(
    BaggingClassifier(LogisticRegression(random_state=1, max_iter=1000), random_state=42),
    parametersBagging,
    cv=4,
)

clf_grid.fit(X_train, y_train)

GridSearchCV(cv=4,
             estimator=BaggingClassifier(base_estimator=LogisticRegression(max_iter=1000,
                                                                           random_state=1),
                                         random_state=42),
             param_grid={'bootstrap_features': [True, False],
                         'max_features': [1.0, 0.5, 0.3, 0.6],
                         'max_samples': [1.0, 0.5, 0.3, 0.6],
                         'n_estimators': [10, 5, 15, 3]})

In [57]:
# Parameter combinations of the latest grid search, best mean score first.
(
    pd.DataFrame(clf_grid.cv_results_)
    .loc[:, ['params', 'mean_test_score']]
    .sort_values(by='mean_test_score', ascending=False)
)

Unnamed: 0,params,mean_test_score
3,"{'bootstrap_features': True, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 3}",0.963816
83,"{'bootstrap_features': False, 'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 3}",0.960526
60,"{'bootstrap_features': True, 'max_features': 0.6, 'max_samples': 0.6, 'n_estimators': 10}",0.960526
68,"{'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 10}",0.960526
70,"{'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 15}",0.960526
...,...,...
101,"{'bootstrap_features': False, 'max_features': 0.3, 'max_samples': 0.5, 'n_estimators': 5}",0.891447
45,"{'bootstrap_features': True, 'max_features': 0.3, 'max_samples': 0.6, 'n_estimators': 5}",0.888158
41,"{'bootstrap_features': True, 'max_features': 0.3, 'max_samples': 0.3, 'n_estimators': 5}",0.881579
33,"{'bootstrap_features': True, 'max_features': 0.3, 'max_samples': 1.0, 'n_estimators': 5}",0.881579


In [62]:
# Grid of ensemble sizes and learning rates tried for AdaBoost.
parametersAda = dict(
    n_estimators=[50, 35, 75, 100],
    learning_rate=[1.0, .5, .7],
)

# Search over every combination in the grid with cv=4.
clf_grid = GridSearchCV(
    estimator=AdaBoostClassifier(random_state=1),
    param_grid=parametersAda,
    cv=4,
)
clf_grid.fit(X_train, y_train)

GridSearchCV(cv=4, estimator=AdaBoostClassifier(random_state=1),
             param_grid={'learning_rate': [1.0, 0.5, 0.7],
                         'n_estimators': [50, 35, 75, 100]})

In [63]:
# Parameter combinations of the latest grid search, best mean score first.
(
    pd.DataFrame(clf_grid.cv_results_)
    .loc[:, ['params', 'mean_test_score']]
    .sort_values(by='mean_test_score', ascending=False)
)

Unnamed: 0,params,mean_test_score
9,"{'learning_rate': 0.7, 'n_estimators': 35}",0.960526
5,"{'learning_rate': 0.5, 'n_estimators': 35}",0.957237
0,"{'learning_rate': 1.0, 'n_estimators': 50}",0.953947
3,"{'learning_rate': 1.0, 'n_estimators': 100}",0.953947
4,"{'learning_rate': 0.5, 'n_estimators': 50}",0.953947
6,"{'learning_rate': 0.5, 'n_estimators': 75}",0.953947
7,"{'learning_rate': 0.5, 'n_estimators': 100}",0.953947
8,"{'learning_rate': 0.7, 'n_estimators': 50}",0.953947
10,"{'learning_rate': 0.7, 'n_estimators': 75}",0.953947
11,"{'learning_rate': 0.7, 'n_estimators': 100}",0.953947
