# Challenge 1
## Team name: Team Hilbert Space
### Michael Moen Allport & Jonas Sandberg
### Student IDs: 768687 + 747903

In [1]:
import pandas as pd
import sklearn as sk
import numpy as np

In [2]:
# Read the labelled training set. skipinitialspace=True makes the parser ignore
# blanks that directly follow a delimiter.
with open('datasets/challenge1_train.csv') as train_csv:
    df = pd.read_csv(train_csv, skipinitialspace=True)

# Columns that are not used as model inputs: the label itself, plus 'id' and 'f9'.
# NOTE(review): the reason for discarding 'f9' is not recorded in this notebook.
dropped_columns = ['target', 'id', 'f9']

labels = df['target']
features = df.drop(columns=dropped_columns)

In [3]:
# Column groups consumed when the preprocessor is built.
# Every group except `categorical_features` is routed through the numeric
# transformer (imputation + standardisation); `categorical_features` is one-hot encoded.

# Used as-is by the numeric transformer.
numeric_features = ['f2', 'f18']

# Base-16 strings; converted to integers with conv_hex before preprocessing.
hexadecimal_features = ['f0', 'f7', 'f15', 'f23', 'f24']

# Grouped as boolean by the authors; handled like numeric columns downstream.
boolean_features = ['f1', 'f10', 'f11', 'f13', 'f22']

# Grouped as ordinal by the authors; handled like numeric columns downstream.
ordinal_features = ['f4', 'f5', 'f6', 'f8', 'f12', 'f17', 'f19', 'f20']

# One-hot encoded, with missing values replaced by the constant 'missing'.
categorical_features = ['f3', 'f14', 'f16', 'f21']

In [4]:
def conv_hex(x):
    """Parse a base-16 string into an integer, or return NaN if that fails.

    Parameters
    ----------
    x : object
        Value to parse. Only strings that `int(x, 16)` accepts are converted.

    Returns
    -------
    int or float
        The parsed integer, or `np.nan` when `int(x, 16)` raises ValueError
        (string is not valid hexadecimal) or TypeError (value is not a string,
        e.g. None, NaN, or a number).

    Notes
    -----
    Because non-string input yields NaN, applying this function to values that
    were already converted to numbers turns them into NaN.
    """
    try:
        parsed = int(x, 16)
    except (ValueError, TypeError):
        return np.nan
    return parsed

# Convert the hexadecimal columns to integers (NaN where parsing fails).
# The values are read from the untouched raw frame `df` rather than from `features`,
# so this cell can be re-run safely: conv_hex returns NaN for anything that is not a
# string, which means re-applying it to the already-converted `features` columns
# would silently replace every value with NaN.
for f in hexadecimal_features:
    features[f] = df[f].apply(conv_hex)

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows, stratified on the target so both splits keep
# the same class ratio. random_state is fixed (as for every estimator in this
# notebook) so that Restart & Run All produces the same split every time.
X_train_pre, X_test_pre, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=0
)

In [6]:
from imblearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric branch: fill missing values with a model-based (iterative) imputer,
# then standardise each column.
numeric_transformer = make_pipeline(
    IterativeImputer(random_state=0),
    StandardScaler()
)

# Categorical branch: replace missing values with the literal category 'missing',
# then one-hot encode. Categories unseen during fit are ignored at transform time
# instead of raising.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore')
)

# Everything that is not one-hot encoded goes through the numeric branch.
scaled_columns = numeric_features + hexadecimal_features + boolean_features + ordinal_features

# sparse_threshold=0 forces a dense array as output. The one-hot encoder emits a
# sparse matrix, and with the default threshold (0.3) the column transformer would
# return a scipy sparse matrix whenever the stacked result is sparse enough; the
# cells below wrap the result in pd.DataFrame, which needs a dense 2-D array.
preprocessor = make_column_transformer(
    (numeric_transformer, scaled_columns),
    (categorical_transformer, categorical_features),
    sparse_threshold=0,
)

# Fit the preprocessor on the training split only and reuse the fitted statistics
# for the hold-out split, so nothing from the test rows leaks into training.
# The original row index is kept on both frames: without it they would get a fresh
# 0..n-1 index while y_train / y_test keep the shuffled index from the split, and
# any index-aligned pandas operation between X and y would silently misalign rows.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train_pre, y_train),
    index=X_train_pre.index,
)
X_test = pd.DataFrame(
    preprocessor.transform(X_test_pre),
    index=X_test_pre.index,
)


In [7]:
from sklearn.metrics import roc_auc_score, f1_score, classification_report

def evaluate_model(model, X, y):
    """Print ROC AUC, F1 and a classification report for `model` on `(X, y)`.

    Parameters
    ----------
    model : fitted estimator
        Must provide `predict` and `predict_proba`.
    X : array-like
        Feature matrix to score.
    y : array-like
        True labels for `X`.

    Returns
    -------
    None
        The metrics are printed, not returned.

    Notes
    -----
    AUC is computed from column 1 of `predict_proba`; this assumes a binary
    target whose second class is the positive one.
    """
    predicted_labels = model.predict(X)
    positive_scores = model.predict_proba(X)[:, 1]

    auc = roc_auc_score(y, positive_scores)
    print(f"AUC: {auc}")
    f1 = f1_score(y, predicted_labels)
    print(f"F1: {f1}")

    report = classification_report(y, predicted_labels)
    print(report)

In [8]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import AdaBoostClassifier

# Classifier pipeline: randomly oversample before boosting, then AdaBoost.
# In an imblearn pipeline the sampler is applied during fit only; predict and
# predict_proba pass the data straight to the classifier.
oversampler = RandomOverSampler(random_state=0)
booster = AdaBoostClassifier(random_state=0, n_estimators=1000, learning_rate=1)
model = make_pipeline(oversampler, booster)

# Train on the training split.
model.fit(X_train, y_train)

# Report performance on the held-out 20%.
evaluate_model(model, X_test, y_test)

AUC: 0.744491390947271
F1: 0.4368354658497026
              precision    recall  f1-score   support

           0       0.90      0.69      0.78      8165
           1       0.33      0.66      0.44      1835

    accuracy                           0.69     10000
   macro avg       0.61      0.68      0.61     10000
weighted avg       0.80      0.69      0.72     10000



## Fit the model using all available training data
This time we don't reserve any data for testing, to maximize the usage of available data.

In [9]:
# Refit the preprocessor on every labelled row and transform those rows.
all_matrix = preprocessor.fit_transform(features, labels)
X_all = pd.DataFrame(all_matrix)
y_all = labels

# Load the unlabeled data and drop the same non-feature columns as for training.
with open('datasets/challenge1_test.csv') as test_csv:
    df_test = pd.read_csv(test_csv, skipinitialspace=True)
df_test = df_test.drop(columns=['id', 'f9'])

# Apply the same hexadecimal-to-integer conversion that the training features received.
for hex_col in hexadecimal_features:
    df_test[hex_col] = df_test[hex_col].apply(conv_hex)

# Transform with the preprocessor fitted above (no refit on the unlabeled rows).
unlabeled_matrix = preprocessor.transform(df_test)
X_unlabeled = pd.DataFrame(unlabeled_matrix)

# Retrain the model on all labelled data.
model.fit(X_all, y_all)

# Over-optimistic sanity check: the model is scored on the very rows it was
# trained on, so these numbers are not an estimate of generalisation.
evaluate_model(model, X_all, y_all)

AUC: 0.7664749225634746
F1: 0.45447747908136016
              precision    recall  f1-score   support

           0       0.91      0.69      0.79     40826
           1       0.34      0.70      0.45      9174

    accuracy                           0.69     50000
   macro avg       0.62      0.69      0.62     50000
weighted avg       0.81      0.69      0.73     50000



## Predict on unlabeled data

In [12]:
# Class probabilities for the unlabeled rows; column 1 is kept as the prediction.
y_pred = model.predict_proba(X_unlabeled)

# Written without a header as "<row number>,<probability>". The first field is the
# positional row index of X_unlabeled, not the 'id' column of the test file (that
# column was dropped on load).
# NOTE(review): confirm the submission format expects positions rather than ids.
positive_proba = pd.DataFrame(y_pred[:, 1])
positive_proba.to_csv("predictions.txt", header=False)