<a href="https://colab.research.google.com/github/JVerbeek/AML/blob/main/AML_A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import random
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA, KernelPCA, IncrementalPCA
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesRegressor
from sklearn.tree import DecisionTreeClassifier
import sklearn.metrics as metrics
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
# One fixed seed for every global random number generator used in the notebook.
seed = 3141592
random.seed(seed)
# scikit-learn falls back to NumPy's global RNG whenever `random_state` is not
# passed, so seed that generator as well as Python's `random` module.
np.random.seed(seed)


In [None]:
# Colab-specific step: mount Google Drive under /content/drive. The CSV loaded
# below is read from a path inside that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the training table from the mounted Drive folder.
df = pd.read_csv("/content/drive/MyDrive/train_data.csv")
# Bare last expression: displayed as the cell output, as (rows, columns).
df.shape

(1593, 2651)

### Dealing with NAs
Many columns contain a large number of NAs. For now, we drop every column in which about half or more of the observations are NA (the cut-off is the number of rows divided by two, rounded down). This leaves us with 1037 features.

In [None]:
# Split the table into the feature matrix and the label column.
X = df.drop(columns=["target"])
y = df["target"]

In [None]:
# Sanity check: the feature table is expected to contain no duplicated rows.
# A bare expression that is not the last line of a cell displays nothing, so
# the check is written as an explicit assertion instead.
assert not X.duplicated().any(), "duplicate rows found in X"

# Hold out 10% of the rows as a test set, stratified on the target.
# Note: despite the `_np` suffix, the splits are used as pandas objects
# (prep() calls DataFrame methods on them).
X_train_np, X_test_np, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0, test_size=0.1)


In [None]:
def prep(X, keys):
  """Select feature columns and fill in their missing values.

  Parameters
  ----------
  X : pandas.DataFrame
    Feature table to prepare. It is not modified; a copy is returned.
  keys : sequence of column labels or None
    None selects "train mode": every column whose NA count is below
    int(n_rows / 2) is kept. Otherwise ("test mode") exactly the columns
    in `keys` are selected, in that order.

  Returns
  -------
  pandas.DataFrame
    The selected columns with missing values imputed. Columns with fewer
    than 10 distinct observed values are treated as categorical and filled
    with their most frequent value; all other columns are filled with their
    mean.

  Notes
  -----
  Column typing and all fill values are computed from the frame passed in.
  NOTE(review): in test mode this means the test split is imputed with its
  own statistics rather than the training ones -- confirm this is intended.
  """
  if keys is None:
    print("train mode")
    # Keep a column only when fewer than half of its rows are NA.
    few_na_mask = X.isna().sum() < int(X.shape[0]/2)
    df_few_na = X.loc[:, few_na_mask.values].copy()
  else:
    print("test mode")
    df_few_na = X[keys].copy()
  # The copies above keep the imputation from writing into a slice of the
  # caller's frame.

  categorical = []
  continuous = []
  for key in df_few_na.keys():
    # Inspect the frame being prepared (not a notebook-level global) so the
    # function is self-contained. nunique() ignores NA, so missing values do
    # not count as an extra category.
    if df_few_na[key].nunique() < 10:  # Fewer than 10 distinct values: probably categorical
      categorical.append(key)
    else:
      continuous.append(key)

  if continuous:
    # Mean imputation, column by column.
    column_means = df_few_na[continuous].mean()
    df_few_na[continuous] = df_few_na[continuous].fillna(column_means)
  for key in categorical:
    counts = df_few_na[key].value_counts()
    if counts.empty:
      # Column has no observed value at all: there is no mode to fill with.
      continue
    df_few_na[key] = df_few_na[key].fillna(counts.idxmax())  # Most frequent label
  return df_few_na


In [None]:
# Training split: keys=None puts prep() in train mode, where it decides which
# columns to keep before imputing.
X_train = prep(X_train_np, keys=None)


train mode


Check: did imputing NAs work as expected?

In [None]:
# No cell of the prepared training frame may still be missing.
assert not X_train.isna().any().any()

### Some rather uninteresting stuff, such as the model

In [None]:
from sklearn.multiclass import OneVsRestClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

# Standardise the features, then fit an entropy-criterion random forest.
pipe = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(criterion='entropy', random_state=0, n_jobs=-1),
)

# Single-candidate grid: the search below only cross-validates this one setting.
# Keys use the step name that make_pipeline derives from the class name.
parameters = {
    'randomforestclassifier__n_estimators': [500],
    'randomforestclassifier__max_depth': [1000],
    "randomforestclassifier__max_features": [0.5],
    "randomforestclassifier__min_samples_leaf": [1],
}

# 10-fold cross-validation; verbose=3 logs the score of each fold.
gridsearch = GridSearchCV(estimator=pipe, param_grid=parameters, cv=10, verbose=3)
gridsearch.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV 1/10] END randomforestclassifier__max_depth=1000, randomforestclassifier__max_features=0.5, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__n_estimators=500;, score=0.833 total time= 2.6min
[CV 2/10] END randomforestclassifier__max_depth=1000, randomforestclassifier__max_features=0.5, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__n_estimators=500;, score=0.833 total time= 2.4min
[CV 3/10] END randomforestclassifier__max_depth=1000, randomforestclassifier__max_features=0.5, randomforestclassifier__min_samples_leaf=1, randomforestclassifier__n_estimators=500;, score=0.817 total time= 2.4min


In [None]:
# Prepare the held-out split with exactly the columns kept for training, then
# score the fitted search on it.
X_test = prep(X_test_np, keys=X_train.columns)
y_pred = gridsearch.predict(X_test)

print(gridsearch.best_params_)
# Accuracy, confusion matrix and per-class report, printed in that order.
for report in (metrics.accuracy_score, metrics.confusion_matrix, metrics.classification_report):
  print(report(y_test, y_pred))