In [None]:
from google.colab import drive, files
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!pip install pyarrow



In [None]:
!pip install function-pipe



In [None]:
# Libraries

import os
import io
import math
import pandas as pd
import numpy as np
import numba as nb
import pyarrow as pa
import function_pipe as fpn
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from pyarrow import csv
from sklearn.tree import DecisionTreeClassifier
from copy import deepcopy
from abc import ABC, abstractmethod


# Declaration

In [None]:
# Path of the complaint CSV on the mounted Google Drive (read by the load cell).
DATASET_FILENAME = '/content/gdrive/MyDrive/NYPD_Complaint_Data_Historic.csv'
# Colormap object for plots. NOTE(review): the Analysis section re-assigns
# PALETTE to a different colormap before it is first used for plotting.
PALETTE = sns.color_palette('winter', as_cmap=True)
# NOTE(review): not referenced by the visible code; preprocess_data passes a
# literal 0 to its threshold helper instead — confirm whether this should be wired in.
THRESHOLD = 0
# Columns removed before any other cleaning step.
UNUSED_LABELS = ['STATION_NAME', 'TRANSIT_DISTRICT', 'PARKS_NM'] # note: these labels are completely or mostly empty
# Placeholder strings that preprocess_data converts to NaN.
UNKNOWN_VALUES = ['UNKNOWN', 'U', '']
# Columns kept for modelling; OFNS_DESC is the prediction target.
LABELS_TO_BE_USED = [
  'CMPLNT_NUM',
  'BORO_NM',
  'OFNS_DESC',
  'CMPLNT_FR_DT',
  'CMPLNT_FR_TM',
  'CMPLNT_TO_DT',
  'CMPLNT_TO_TM',
  'HOUSING_PSA',
  'LAW_CAT_CD',
  'SUSP_AGE_GROUP',
  'SUSP_RACE',
  'SUSP_SEX',
  'VIC_AGE_GROUP',
  'VIC_RACE',
  'VIC_SEX',
  'Latitude',
  'Longitude'
]

In [None]:
# Parse the CSV with pyarrow's reader into an Arrow table.
table = csv.read_csv(DATASET_FILENAME)
# split_blocks/self_destruct are pyarrow's memory-saving conversion options:
# the Arrow buffers are released while converting, so `table` must not be
# used afterwards (see the pyarrow to_pandas documentation).
raw_data = table.to_pandas(split_blocks=True, self_destruct=True)
# Drop the now-unusable Arrow table reference.
del table

# Pre-processing

In [None]:
def preprocess_data(raw_data):
  """Clean the raw NYPD complaint frame and encode every kept column as a number.

  Steps, in order: drop the mostly-empty UNUSED_LABELS columns, turn the
  placeholder strings in UNKNOWN_VALUES into NaN, drop incomplete rows, keep
  only LABELS_TO_BE_USED, then run the per-column encoders defined below.

  Args:
    raw_data: DataFrame holding the complaint CSV as loaded. It must contain
      the UNUSED_LABELS and LABELS_TO_BE_USED columns.

  Returns:
    A two-element list ``[CLASSES, data]``. CLASSES is the sorted array of
    offence descriptions that survived filtering (position == encoded
    OFNS_DESC value); data is the encoded DataFrame with every row that still
    contains a NaN removed.
  """
  def get_threshold(threshold):
    # Minimum number of non-NaN cells a row needs to survive `dropna`: all
    # columns that remain once UNUSED_LABELS are gone, minus `threshold`.
    return len(raw_data.columns)-len(UNUSED_LABELS)-threshold

  def sort_unique(arr):
    return np.sort(arr.unique())

  # Ordinal (label) encoding: each distinct value is replaced by its rank in
  # the sorted list of distinct values.
  def to_numeric_encoding(feature):
    return feature.replace({
      offence:i for (i, offence) in enumerate(sort_unique(feature))
    })

  @fpn.FunctionNode
  def preprocess_boro_nm(data):
    # Maria's code (probably? not mine tho)
    data['BORO_NM'] = data["BORO_NM"].replace({ "BROOKLYN": 0,"BRONX" : 1, "MANHATTAN" : 2, "QUEENS": 3, "STATEN ISLAND" : 4})
    return data

  @fpn.FunctionNode
  def preprocess_ofns_desc(data):
    # Maria's code (probably? not mine tho)
    # Discard offences with fewer than 1000 complaints.
    counts = data['OFNS_DESC'].value_counts()
    # .copy() makes the filtered frame independent, so the column assignment
    # below no longer raises pandas' SettingWithCopyWarning.
    data = data[~data['OFNS_DESC'].isin(counts[counts < 1000].index)].copy()
    # My code
    CLASSES = sort_unique(data['OFNS_DESC'])
    data['OFNS_DESC'] = to_numeric_encoding(data['OFNS_DESC'])
    return CLASSES, data

  @fpn.FunctionNode
  def preprocess_datetime(data):
    # CMPLNT_FR_TM is stored as minutes since midnight, while CMPLNT_TO_TM and
    # both date columns end up in nanoseconds; this converts between the two.
    NS_PER_MINUTE = 60 * 10**9

    def time_to_numeric(time):
      return time.hour * 60 + time.minute

    time_arr_to_numeric = np.vectorize(time_to_numeric)

    # Based on Maria's code (probably?)
    data['CMPLNT_FR_DT'] = pd.to_datetime(data['CMPLNT_FR_DT'], format='%m/%d/%Y')
    data['CMPLNT_FR_TM'] = time_arr_to_numeric(data['CMPLNT_FR_TM'])
    # Keep complaints from 2010 onwards only.
    data = data.drop( data[ data['CMPLNT_FR_DT'] < pd.Timestamp(2010,1,1) ].index)
    data['CMPLNT_TO_DT'] = pd.to_datetime(data['CMPLNT_TO_DT'], format='%m/%d/%Y')
    data['CMPLNT_TO_TM'] = pd.to_timedelta(data['CMPLNT_TO_TM'])
    # My code
    # Plain column assignment (instead of `.loc[:, col] = ...`) replaces the
    # column outright, so the numeric dtype is adopted on every pandas version
    # rather than being cast back into the old datetime/timedelta dtype.
    data['CMPLNT_FR_DT'] = pd.to_numeric(data['CMPLNT_FR_DT'])
    data['CMPLNT_TO_DT'] = pd.to_numeric(data['CMPLNT_TO_DT'])
    data['CMPLNT_FR_TM'] = pd.to_numeric(data['CMPLNT_FR_TM'])
    data['CMPLNT_TO_TM'] = pd.to_numeric(data['CMPLNT_TO_TM'])
    # Duration in nanoseconds. The start time is scaled from minutes to
    # nanoseconds first; subtracting the raw minute count from a nanosecond
    # value (as before) mixed units and made the time-of-day part meaningless.
    data['Duration'] = pd.to_numeric(data['CMPLNT_TO_DT'] - data['CMPLNT_FR_DT']) + (data['CMPLNT_TO_TM'] - data['CMPLNT_FR_TM'] * NS_PER_MINUTE)
    return data

  @fpn.FunctionNode
  def preprocess_housing_psa(data):
    # Maria's code (probably?)
    # remove HOUSING_PSA with less than 10
    counts = data.loc[:,'HOUSING_PSA'].value_counts()
    # NOTE(review): this does not remove rows directly. The filtered frame is
    # aligned back onto `data`, so rows with a rare HOUSING_PSA become all-NaN
    # and are only discarded by the final `.dropna()` of the pipeline.
    data.loc[:,:] = data[~data.loc[:,'HOUSING_PSA'].isin(counts[counts < 10].index)]
    # mine
    data['HOUSING_PSA'] = to_numeric_encoding(data['HOUSING_PSA'].astype(str))
    return data

  @fpn.FunctionNode
  def preprocess_law_cat_cd(data):
    # Maria's code (probably?)
    data['LAW_CAT_CD'] = data["LAW_CAT_CD"].replace({ "MISDEMEANOR": 0, "VIOLATION" : 1, "FELONY" : 2})
    return data

  @fpn.FunctionNode
  def preprocess_sex(data):
    # mine
    # Indicator column: 1 where the value is 'M', 0 for anything else.
    data['VIC_SEX'] = pd.get_dummies(data["VIC_SEX"])['M']
    data["SUSP_SEX"] = pd.get_dummies(data["SUSP_SEX"])['M']
    return data

  @fpn.FunctionNode
  def preprocess_age_group(data):
    # Maria's code (probably?)
    data['SUSP_AGE_GROUP'] = data["SUSP_AGE_GROUP"].replace({ "<18": 0,"18-24" : 1, "25-44" : 2, "45-64": 3, "65+" : 4})
    data['VIC_AGE_GROUP'] = data["VIC_AGE_GROUP"].replace({ "<18": 0,"18-24" : 1, "25-44" : 2, "45-64": 3, "65+" : 4})

    data['SUSP_AGE_GROUP'] = pd.to_numeric(data['SUSP_AGE_GROUP'])
    data['VIC_AGE_GROUP'] = pd.to_numeric(data['VIC_AGE_GROUP'])

    # Anything outside the five codes above is not a known age bracket.
    data = data.drop(data[(data['SUSP_AGE_GROUP'] > 4) | (data['SUSP_AGE_GROUP'] < 0)].index)
    data = data.drop(data[(data['VIC_AGE_GROUP'] > 4) | (data['VIC_AGE_GROUP'] < 0)].index)
    return data

  @fpn.FunctionNode
  def preprocess_race(data):
    def one_hot_race(feature):
      # One-hot encode, then fold the two "... HISPANIC" categories into a
      # shared HISPANIC flag plus the matching BLACK / WHITE flag.
      oh_feature = pd.get_dummies(feature)
      oh_feature['HISPANIC'] = oh_feature['BLACK HISPANIC'] | oh_feature['WHITE HISPANIC']
      oh_feature['BLACK'] |= oh_feature['BLACK HISPANIC']
      oh_feature['WHITE'] |= oh_feature['WHITE HISPANIC']
      return oh_feature.drop(columns=['BLACK HISPANIC','WHITE HISPANIC'])

    oh_vic_race = one_hot_race(data['VIC_RACE'])
    oh_susp_race = one_hot_race(data['SUSP_RACE'])

    for race in oh_vic_race:
      data['VIC_' + race] = oh_vic_race[race]

    for race in oh_susp_race:
      data['SUSP_' + race] = oh_susp_race[race]

    return data.drop(columns=['VIC_RACE','SUSP_RACE'])

  data = raw_data \
  .drop(columns=UNUSED_LABELS) \
  .replace({ val:np.nan for val in UNKNOWN_VALUES }) \
  .dropna(thresh=get_threshold(0)) \
  .loc[:, LABELS_TO_BE_USED]

  # The offence step also yields the class-name lookup, so it is run apart
  # from the remaining single-output steps.
  CLASSES, data = (preprocess_boro_nm >> preprocess_ofns_desc)(data)

  return [
    CLASSES,
    (
        preprocess_datetime >>
        preprocess_housing_psa >>
        preprocess_law_cat_cd >>
        preprocess_sex >>
        preprocess_age_group >>
        preprocess_race

    )(data).dropna()
  ]

CLASSES, data = preprocess_data(raw_data)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
data

Unnamed: 0,CMPLNT_NUM,BORO_NM,OFNS_DESC,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,HOUSING_PSA,LAW_CAT_CD,SUSP_AGE_GROUP,...,VIC_AMERICAN INDIAN/ALASKAN NATIVE,VIC_ASIAN / PACIFIC ISLANDER,VIC_BLACK,VIC_WHITE,VIC_HISPANIC,SUSP_AMERICAN INDIAN/ALASKAN NATIVE,SUSP_ASIAN / PACIFIC ISLANDER,SUSP_BLACK,SUSP_WHITE,SUSP_HISPANIC
33,593134992.0,0.0,2.0,1.374278e+18,1110.0,1.374278e+18,6.672000e+13,49,2.0,2.0,...,0,0,1,0,0,0,0,1,0,0
168,563768013.0,0.0,0.0,1.497485e+18,210.0,1.497485e+18,2.250000e+13,57,0.0,2.0,...,0,0,1,0,0,0,0,1,0,0
178,677977125.0,1.0,0.0,1.480550e+18,181.0,1.480550e+18,1.482000e+13,404,0.0,2.0,...,0,0,0,1,1,0,0,0,1,1
247,380439115.0,0.0,0.0,1.456013e+18,1225.0,1.456013e+18,7.380000e+13,252,0.0,2.0,...,0,0,1,0,0,0,0,1,0,0
266,991203856.0,2.0,5.0,1.404691e+18,1063.0,1.404691e+18,6.480000e+13,310,0.0,2.0,...,0,0,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6982942,368358311.0,0.0,3.0,1.512432e+18,660.0,1.512432e+18,4.050000e+13,208,1.0,3.0,...,0,0,1,0,0,0,0,1,0,0
6983012,622273511.0,2.0,3.0,1.520726e+18,1230.0,1.520726e+18,7.560000e+13,351,1.0,3.0,...,0,0,1,0,1,0,0,0,1,1
6983079,808144426.0,2.0,0.0,1.517011e+18,219.0,1.517011e+18,1.404000e+13,318,0.0,2.0,...,0,0,1,0,1,0,0,0,1,1
6983092,255356017.0,0.0,8.0,1.543450e+18,1355.0,1.543450e+18,8.160000e+13,222,2.0,2.0,...,0,0,0,1,0,0,0,1,0,0


In [None]:
data.dtypes

CMPLNT_NUM                             float64
BORO_NM                                float64
OFNS_DESC                              float64
CMPLNT_FR_DT                           float64
CMPLNT_FR_TM                           float64
CMPLNT_TO_DT                           float64
CMPLNT_TO_TM                           float64
HOUSING_PSA                              int64
LAW_CAT_CD                             float64
SUSP_AGE_GROUP                         float64
SUSP_SEX                                 uint8
VIC_AGE_GROUP                          float64
VIC_SEX                                  uint8
Latitude                               float64
Longitude                              float64
Duration                               float64
VIC_AMERICAN INDIAN/ALASKAN NATIVE       uint8
VIC_ASIAN / PACIFIC ISLANDER             uint8
VIC_BLACK                                uint8
VIC_WHITE                                uint8
VIC_HISPANIC                             uint8
SUSP_AMERICAN

# Train

## Split training set

In [None]:
# Based on https://stackoverflow.com/questions/68279596/split-pandas-dataframe-by-label-with-ratio
# Splitting set into training set and test set
def split_training_set(data, train_frac=0.80, random_state=0, target='OFNS_DESC'):
  """Split a frame into train/test features and targets.

  Args:
    data: DataFrame containing the `target` column. Its index labels should be
      unique, because the test set is built by dropping the sampled labels.
    train_frac: Fraction of rows sampled into the training set.
    random_state: Seed passed to `DataFrame.sample` so the split is repeatable.
    target: Name of the column to predict.

  Returns:
    The tuple ``(train_x, train_y, test_x, test_y)``; the x parts are
    DataFrames without `target`, the y parts are the `target` Series.
  """
  training_set = data.sample(frac=train_frac, random_state=random_state)
  # Every row that was not sampled forms the test set.
  test_set = data.drop(training_set.index)
  train_x, train_y = training_set.drop(columns=[target]), training_set.loc[:, target]
  test_x, test_y = test_set.drop(columns=[target]), test_set.loc[:, target]
  return train_x, train_y, test_x, test_y

train_x, train_y, test_x, test_y = split_training_set(data)


In [None]:
train_x.dtypes

CMPLNT_NUM                             float64
BORO_NM                                float64
CMPLNT_FR_DT                           float64
CMPLNT_FR_TM                           float64
CMPLNT_TO_DT                           float64
CMPLNT_TO_TM                           float64
HOUSING_PSA                              int64
LAW_CAT_CD                             float64
SUSP_AGE_GROUP                         float64
SUSP_SEX                                 uint8
VIC_AGE_GROUP                          float64
VIC_SEX                                  uint8
Latitude                               float64
Longitude                              float64
Duration                               float64
VIC_AMERICAN INDIAN/ALASKAN NATIVE       uint8
VIC_ASIAN / PACIFIC ISLANDER             uint8
VIC_BLACK                                uint8
VIC_WHITE                                uint8
VIC_HISPANIC                             uint8
SUSP_AMERICAN INDIAN/ALASKAN NATIVE      uint8
SUSP_ASIAN / 

## Boost

### Boosting Classifier Class
This class is a base or abstract class for all boosting methods. It contains the fit, predict, and predict_proba methods, which can also be found on the boosting methods from SciKit.

In [None]:
class BoostingClassifier(ABC):
  """Abstract base class shared by the boosting ensembles in this notebook.

  Subclasses implement `fit`, `predict` and `predict_proba`;
  `calculate_accuracy` is provided here on top of `predict`.

  Args:
    n_iterations: Number of boosting rounds (one fitted model per round).
    weak_learner: Unfitted estimator used as the template for every round.
    loss: Callable that the subclass uses to score predictions.
  """
  def __init__(
    self,
    n_iterations,
    weak_learner,
    loss
  ):
    self.n_iterations = n_iterations
    self.weak_learner = weak_learner
    self.loss = loss
    # One empty slot per boosting round.
    self.models = np.full(n_iterations, None)
    self.accuracy = None

  @abstractmethod
  def fit(self, dataset_x: np.ndarray, dataset_y: np.ndarray):
    """Train the ensemble on features `dataset_x` and targets `dataset_y`."""
    pass

  # `dataset_y` is not needed to predict: `calculate_accuracy` below and the
  # subclasses call these methods with `dataset_x` only. The parameter is kept
  # as an optional argument purely for backward compatibility.
  @abstractmethod
  def predict(self, dataset_x: np.ndarray, dataset_y: np.ndarray = None):
    """Return the predicted target for every row of `dataset_x`."""
    pass

  @abstractmethod
  def predict_proba(self, dataset_x: np.ndarray, dataset_y: np.ndarray = None):
    """Return class probabilities for every row of `dataset_x`."""
    pass

  def calculate_accuracy(self, dataset_x: np.ndarray, dataset_y: np.ndarray):
    """Return the percentage of rows whose prediction equals `dataset_y`."""
    return 100*np.mean(self.predict(dataset_x) == dataset_y)


### Adaboost

This class is an implementation of AdaBoost, which is a subclass of BoostingClassifier. The `fit` method and `predict` method are based on the algorithms/pseudocode found on [this lecture](https://www.cs.toronto.edu/~mbrubake/teaching/C11/Handouts/AdaBoost.pdf).

#### Loss function
Uses `np.int64` to convert `True` to `1` and `False` to `0` (note: `np.int8` was tried as a replacement, but that change has not been tested for side effects, so the code below still uses `np.int64`). The conditional that is passed to this loss function ($I$) is `f_iteration.predict(dataset_x[i].reshape(1,-1)) != dataset_y[i]` or $f_m(x_i) \neq y_i$. As seen in the lecture, if $f_m(x_i) \neq y_i$ is true, the output of $I$ is $1$. If not, the output of $I$ is $0$.

In the lecture, $I$ denotes this indicator function.

#### Fit method
Based on the "AdaBoost algorithm" found on the lecture.

This method uses `deepcopy` so that every iteration fits its own copy of the weak learner instead of re-fitting the same object. It is a quick workaround for a bug that would otherwise have taken a long time to track down.

#### Predict method
In scikit-learn, the `predict` method for AdaBoost is the "weighted mean prediction" ([source](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier.predict)). That method is not implemented here, but since BoostingClassifier requires a `predict` method, the binary predict method described in the next section is used as the `predict` method.

##### Binary Predict method
This is the method that is used by MultiBoost classifier. This is based on the `predict` algorithm or rather $g(x)$ of the algorithm on the lecture.
As mentioned, the algorithm from the paper is $g(x) = \text{sign}(\sum_{m=1}^{M}\alpha_mf_m(x))$ .


#### Predict_proba method
As of now, this method does nothing, but it will be a very important method to MultiBoost once completed (check later section).



In [None]:
# resources:
# https://www.cs.toronto.edu/~mbrubake/teaching/C11/Handouts/AdaBoost.pdf
class AdaBoost(BoostingClassifier):
  """Binary AdaBoost over a weighted weak learner.

  Every round fits a fresh copy of `weak_learner` on the current sample
  weights, measures its weighted error, and increases the weight of the
  samples it got wrong. The final decision is the sign of the alpha-weighted
  sum of the weak learners' outputs, so the targets are expected to be
  encoded as -1 / +1.

  Args:
    n_iterations: Number of boosting rounds.
    weak_learner: Unfitted estimator exposing `fit(x, y, sample_weight=...)`
      and `predict(x)`.
  """
  def __init__(self, n_iterations, weak_learner):
    super().__init__(
        n_iterations=n_iterations,
        loss=self.loss,
        weak_learner=weak_learner
    )

  def loss(self, conditional):
    """Indicator loss: turn True/False into 1/0."""
    return np.int64(conditional)

  def fit(self, dataset_x, dataset_y):
    """Run the boosting rounds and store one `(alpha, model)` pair per round.

    Args:
      dataset_x: Feature matrix, one row per sample.
      dataset_y: Targets, one value per sample.

    Returns:
      self, so calls can be chained.
    """
    # Note: N = dataset size, M = num_of_classifiers/iterations
    dataset_x = np.float64(dataset_x)
    dataset_y = np.float64(dataset_y)
    dataset_size = dataset_y.size
    # Start from a uniform weight distribution over the samples.
    weight_distribution_vector = np.ones(dataset_size) * (1/dataset_size)
    self.models = np.full(self.n_iterations, None)
    # Bounds that keep the error strictly inside (0, 1): a perfect (or
    # perfectly wrong) weak learner would otherwise give log(inf) / log(0).
    tiny = np.finfo(np.float64).eps
    for iteration in range(self.n_iterations): # for m = 1 to M do
      f_iteration = deepcopy(self.weak_learner).fit(dataset_x, dataset_y, sample_weight=weight_distribution_vector)
      # One vectorised predict per round instead of one call per sample.
      misclassified = self.loss(f_iteration.predict(dataset_x) != dataset_y)
      epsilon_iteration = np.sum(weight_distribution_vector * misclassified) / weight_distribution_vector.sum()
      epsilon_iteration = np.clip(epsilon_iteration, tiny, 1 - tiny)
      alpha_iteration = np.log((1 - epsilon_iteration) / epsilon_iteration)
      # Only the misclassified samples are re-weighted (exp(0) == 1 elsewhere).
      weight_distribution_vector = weight_distribution_vector * np.exp(alpha_iteration * misclassified)
      self.models[iteration] = (alpha_iteration, f_iteration)
    return self

  def predict_proba(self, dataset_x):
    """Return `predict_proba` of the weak learner fitted in the last round.

    NOTE(review): only the final weak learner is consulted, not the ensemble.
    """
    return self.models[-1][-1].predict_proba(np.float64(dataset_x))

  #  g(x) = np.sign(sum(m=1, M, alpha_m * f_m(x)))
  def predict(self, d_x):
    """Return the ensemble decision; identical to `binary_predict`."""
    return self.binary_predict(d_x)

  # g(x) = np.sign(sum(m=1, M, alpha_m * f_m(x)))
  def binary_predict(self, d_x):
    """Return the sign of the alpha-weighted vote for every row of `d_x`."""
    dataset_x = np.float64(d_x)
    pred_y = np.zeros(dataset_x.shape[0])

    for alpha, model in self.models:
      pred_y += alpha * model.predict(dataset_x)

    return np.sign(pred_y)

In [None]:
# One-vs-rest targets: one column per class, +1 for that class and -1 otherwise.
dataset_ys = pd.get_dummies(train_y).astype('int8').replace(0, -1)

# The model must be scored against the same class column it was trained on.
# The previous version trained on class 3.0 but compared with class 5.0.
# NOTE(review): 3.0 (the class that was fitted) is assumed to be the intended one.
TARGET_CLASS = 3.0

a = AdaBoost(50, DecisionTreeClassifier(max_depth=2))
a.fit(train_x, dataset_ys[TARGET_CLASS])

# Training accuracy (percent) of the single binary booster.
sum(a.binary_predict(train_x) == dataset_ys[TARGET_CLASS])*100/train_y.size


61.29843872851848

### MultiBoost Classifier

I'm not sure how original this is haha since I came up with this on the spot, but anyway, this is a "meta" classifier that is made up of other classifiers, which in this case is a boosting classifier, but it can be used with other meta-classifiers as well like bagging I think?

(Note: Target dataset = dataset_y, not sure what's the correct term to use)

This classifier uses **One-Hot Encoding** to turn a multi-class target dataset (`dataset_y`) into a target dataset made up of several binary-class features. This is useful for boosting algorithms that require a binary-class target dataset like `LPBoost`, or in this case `AdaBoost` (note: AdaBoost can accept multi-class target datasets, but I have no idea how to implement the `predict` method for multi-class target datasets, so...). 

I want to use LPBoost, but I didn't have time. Just mention it on the *future things to do*/*Future Improvements* part that I planned on using **LPBoost** as it might be a better fit than **AdaBoost** for this.

Also, I already mentioned this, but this classifier works for other classifiers, not just AdaBoost.

#### Fit method
This method splits `dataset_y` into multiple binary `y datasets` and creates a AdaBoost model and fits it for every binary `y dataset`.

This method uses `deepcopy` so that the booster stored for each class is an independent object. It is a quick workaround for a bug that would otherwise have taken a long time to track down.

#### Merge_y method
This method is important, but as of now, for every $x$th row, it removes all the columns that contain `NaN` for the $x$ row, and chooses the maximum. It shouldn't use the maximum, but since I don't have time, that's what I used. What this method should actually do is to choose the class value (note: class value = y/target) that has the highest probability of being correct using the `predict_proba` method. I haven't implemented it to work that way though, but that's how the code should work just in case the row has more than one non-`NaN` value. You can add this part to **Future Improvements** as well.

#### Predict method
This method predicts the value by going through every AdaBoost model that would predict `dataset_x`. The output of every model is either $1$ or $-1$. This method converts that to the class value and `NaN` respectively (i.e. when the output is `1`, it turns into the class value. When the output is `-1` it turns into `NaN`). It then uses `merge_y` method to merge all the predicted $y$ values into one $y$ dataset.
It then returns the merged $y$ dataset.

In [None]:
class MultiBoostClassifier():
  """One-vs-rest wrapper that trains one binary booster per class.

  Args:
    boost_model: Class (or factory) of the binary booster. Instances must
      offer `fit(x, y)` returning the fitted model and `binary_predict(x)`
      returning +1 / -1 per row.
    boost_params: Positional arguments passed to `boost_model` for every class.
  """
  def __init__(self, boost_model, boost_params):
    self.models = {}
    self.boost_model = boost_model
    self.boost_params = boost_params

  def fit(self, dataset_x, dataset_y):
    """Fit one booster per distinct value of `dataset_y`.

    Returns:
      self, matching the convention of the wrapped boosters' `fit`.
    """
    # One column per class: +1 where the row belongs to it, -1 elsewhere.
    dataset_ys = pd.get_dummies(dataset_y).astype('int8').replace(0, -1)
    for class_val in dataset_ys:
      self.models[class_val] = deepcopy(self.boost_model(*self.boost_params)).fit(dataset_x, dataset_ys[class_val])
    return self

  def predict(self,dataset_x):
    """Merge the per-class binary votes into a single class label per row.

    A row claimed by several boosters receives the largest claiming class
    value (TODO, but no time: use the highest probability instead). A row
    claimed by no booster receives the smallest class value; previously such
    rows were NaN and were then cast to int8, whose result is undefined.

    Returns:
      1-D `np.int8` array with one class value per row of `dataset_x`.

    Raises:
      ValueError: if called before `fit`.
    """
    if not self.models:
      raise ValueError('MultiBoostClassifier.predict called before fit')

    result = {}
    for class_val, model in self.models.items():
      votes = np.asarray(model.binary_predict(dataset_x))
      # Keep the class value where this booster voted +1, NaN elsewhere.
      result[class_val] = np.where(votes == 1, class_val, np.nan)

    df_ys = pd.DataFrame(result)
    # Row-wise maximum ignores NaN; rows that are entirely NaN stay NaN.
    merged = df_ys.max(axis=1).fillna(min(self.models))
    return merged.to_numpy().astype(np.int8)

  def calculate_accuracy(self, actual_y: np.ndarray, predicted_y: np.ndarray):
    """Return the percentage of positions where the two label arrays agree."""
    return 100*sum(actual_y == predicted_y)/predicted_y.size
    # return 100*sum(self.predict(dataset_x) == dataset_y)/dataset_y.size


In [None]:
# Build the one-vs-rest ensemble: every class gets its own AdaBoost model
# created with 150 rounds and a depth-4 decision tree as the weak learner.
multiboost = MultiBoostClassifier(
    AdaBoost,
    [150, DecisionTreeClassifier(max_depth=4)]
)
multiboost.fit(train_x, train_y)
# Training-set accuracy in percent (last expression, so it is displayed).
sum(multiboost.predict(train_x) == train_y)*100/train_y.size

In [None]:
# multiboost = MultiBoostClassifier(
#     AdaBoost,
#     [50, DecisionTreeClassifier(max_depth=4)]
# )
# multiboost.fit(train_x, train_y)
# sum(multiboost.predict(train_x) == train_y)*100/train_y.size

In [None]:
sum(multiboost.predict(train_x) == train_y)*100/train_y.size

# Analysis

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, roc_curve

PALETTE = sns.color_palette("crest", as_cmap=True)
plt.style.use('dark_background')

def print_cm(actual_y, pred_y, title):
  """Draw a confusion-matrix heatmap with summary metrics printed below it.

  Cell colours show the row-normalised confusion matrix; the number written
  in each cell is the raw count.

  Args:
    actual_y: True class labels.
    pred_y: Predicted class labels, same length as `actual_y`.
    title: Title placed above the heatmap.
  """
  heatmap_data = pd.crosstab(actual_y, pred_y, normalize="index").rename_axis(columns='Predicted Values', index='Actual Values')
  annot = pd.crosstab(actual_y, pred_y)
  _, ax = plt.subplots(figsize=(15,15))
  ax = sns.heatmap(
      heatmap_data,
      annot=annot,
      square=True,
      fmt='d', # the annotations are integer counts, not fractions
      cmap=PALETTE,
      cbar_kws=
        dict(
          use_gridspec=False,
          orientation='horizontal',
          shrink=0.7,
          pad=0.05
        ), # based on https://stackoverflow.com/a/47916308
      ax=ax
      )

  # One-hot "scores" for the AUC metrics, built once. They come from hard
  # predictions, not probabilities. Columns are aligned to the classes that
  # occur in actual_y so a class that was never predicted still gets an
  # (all-zero) column instead of making roc_auc_score fail on the shape.
  class_labels = np.sort(pd.unique(actual_y))
  pred_one_hot = pd.get_dummies(pred_y).reindex(columns=class_labels, fill_value=0)

  acc_text = {
    'Accuracy': 100*np.mean(actual_y == pred_y),
    'F1-score (macro)': f1_score(actual_y, pred_y, average='macro'),
    'F1-score (micro)': f1_score(actual_y, pred_y, average='micro'),
    'F1-score (weighted)': f1_score(actual_y, pred_y, average='weighted'),
    'Precision (macro)': precision_score(actual_y, pred_y, average='macro'),
    'Precision (micro)': precision_score(actual_y, pred_y, average='micro'),
    'Precision (weighted)': precision_score(actual_y, pred_y, average='weighted'),
    'Recall (macro)': recall_score(actual_y, pred_y, average='macro'),
    'Recall (micro)': recall_score(actual_y, pred_y, average='micro'),
    'Recall (weighted)': recall_score(actual_y, pred_y, average='weighted'),
    'AUC Score (OvR, macro)': roc_auc_score(actual_y, pred_one_hot, average='macro', multi_class='ovr'),
    'AUC Score (OvO, macro)': roc_auc_score(actual_y, pred_one_hot, average='macro', multi_class='ovo'),
    'AUC Score (OvR, weighted)': roc_auc_score(actual_y, pred_one_hot, average='weighted', multi_class='ovr'),
    'AUC Score (OvO, weighted)': roc_auc_score(actual_y, pred_one_hot, average='weighted', multi_class='ovo')
  }

  text_arr = [f'{k}: {v}' for k, v in acc_text.items()]
  half = len(text_arr)//2

  # Metrics are split into two columns below the plot.
  # based on https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.text.html
  for x_position, lines in ((1/4, text_arr[:half]), (3/4, text_arr[half:])):
    ax.text(
        x_position,
        -0.25,
        '\n'.join(lines),
        horizontalalignment='center',
        verticalalignment='center',
        transform=ax.transAxes
    )

  ax.set_title(title)

#  ax.set_xlabel('123')

In [None]:
print_cm(train_y, multiboost.predict(train_x), 'Training Set')

In [None]:
print_cm(test_y, multiboost.predict(test_x), 'Test Set')

In [None]:
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score

# based on https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#plot-roc-curves-for-the-multiclass-problem
def get_roc_curve(actual_y, pred_y):
  """Plot one-vs-rest ROC curves per class plus micro and macro averages.

  Args:
    actual_y: True labels, or an already one-hot encoded DataFrame.
    pred_y: Predicted labels (one-hot encoded here), or a DataFrame of
      per-class scores with the class values as column names.
  """
  true_y = actual_y if isinstance(actual_y, pd.DataFrame) else pd.get_dummies(actual_y)
  test_y = pred_y if isinstance(pred_y, pd.DataFrame) else pd.get_dummies(pred_y)
  # Give the scores exactly the columns of the truth table. A class that was
  # never predicted would otherwise be missing, which raised a KeyError in the
  # per-class loop and a length mismatch in the micro-average.
  test_y = test_y.reindex(columns=true_y.columns, fill_value=0).astype('float64')
  classes = np.sort(true_y.columns.unique())

  fpr = dict()
  tpr = dict()
  roc_auc = dict()

  for class_val in classes:
    fpr[class_val], tpr[class_val], _ = roc_curve(true_y.loc[:, class_val], test_y.loc[:, class_val])
    roc_auc[class_val] = auc(fpr[class_val], tpr[class_val])

  # Micro-average: pool every (sample, class) decision into one binary problem.
  fpr['Micro-average'], tpr['Micro-average'], _ = roc_curve(true_y.values.ravel(), test_y.values.ravel())
  roc_auc['Micro-average'] = auc(fpr['Micro-average'], tpr['Micro-average'])

  # Macro-average: interpolate every class curve onto the union of all
  # false-positive rates, then average the true-positive rates.
  all_fpr = np.unique(
      np.concatenate(
        [fpr[class_val] for class_val in classes]
      )
  )

  mean_tpr = np.zeros_like(all_fpr)

  for class_val in classes:
    mean_tpr += np.interp(all_fpr, fpr[class_val], tpr[class_val])

  mean_tpr /= classes.size

  fpr['Macro-average'] = all_fpr
  tpr['Macro-average'] = mean_tpr
  roc_auc['Macro-average'] = auc(fpr['Macro-average'], tpr['Macro-average'])

  # Long-format frame: one row per curve point, labelled with the legend entry.
  df = pd.concat(
    [
      pd.DataFrame({
        'False positive rate': fpr[class_val],
        'True positive rate': tpr[class_val],
        'type': f'{class_val if isinstance(class_val, str) else CLASSES[int(class_val)].title()} (area: {np.format_float_positional(roc_auc[class_val], precision=3, min_digits=3)})'
      }) for class_val in fpr
    ]
  ).reset_index(drop=True)

  sns.relplot(
      data=df,
      x='False positive rate',
      y='True positive rate',
      kind='line',
      hue='type',
      style='type',
      palette=sns.color_palette(
          palette='Pastel2',
          n_colors=df['type'].unique().size
      ),
      aspect=1.2
  )



get_roc_curve(test_y, multiboost.predict(test_x))


In [None]:
# Note: still working on this one, it's the same as above
# But rewritten
from scoping import scoping
from pandas.core.dtypes.missing import array_equals
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score

# based on https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#plot-roc-curves-for-the-multiclass-problem
def get_roc_curve(actual_y, pred_y):
  """Plot one-vs-rest ROC curves per class plus micro and macro averages.

  Args:
    actual_y: True labels, or an already one-hot encoded DataFrame.
    pred_y: Predicted labels (one-hot encoded here), or a DataFrame of
      per-class scores with the class values as column names.
  """
  true_y = actual_y if isinstance(actual_y, pd.DataFrame) else pd.get_dummies(actual_y)
  test_y = pred_y if isinstance(pred_y, pd.DataFrame) else pd.get_dummies(pred_y)
  # Give the scores exactly the columns of the truth table, so a class that
  # was never predicted gets an all-zero column instead of a KeyError.
  test_y = test_y.reindex(columns=true_y.columns, fill_value=0).astype('float64')
  classes = np.sort(true_y.columns.unique())

  def fix_float(val, precision, min_digits):
    return np.format_float_positional(
      val,
      precision=precision,
      min_digits=min_digits
    )

  def to_class_name(name, fpr, tpr):
    return f'{name} (area: {fix_float(auc(fpr, tpr), 3, 3)})'

  def to_curve_frame(name, fpr, tpr):
    # Long-format rows for one curve, labelled with its legend entry.
    return pd.DataFrame({
      'False positive rate': fpr.astype('float64'),
      'True positive rate': tpr.astype('float64'),
      'Class': to_class_name(name, fpr, tpr)
    })

  # Per-class curves are kept separately: the macro-average below has to
  # interpolate each class on its own.
  class_curves = {}
  for class_val in classes:
    fpr, tpr, _ = roc_curve(true_y.loc[:, class_val], test_y.loc[:, class_val])
    class_curves[class_val] = (fpr, tpr)

  frames = [
    to_curve_frame(CLASSES[int(class_val)].title(), fpr, tpr)
    for class_val, (fpr, tpr) in class_curves.items()
  ]

  # Micro-average: pool every (sample, class) decision into one binary problem.
  micro_fpr, micro_tpr, _ = roc_curve(true_y.values.ravel(), test_y.values.ravel())
  frames.append(to_curve_frame('Micro-average', micro_fpr, micro_tpr))

  # Macro-average: interpolate each class curve onto the union of the
  # per-class false-positive rates and average. The previous version
  # interpolated over the whole concatenated frame (all classes plus the
  # micro-average mixed together) once per class, which is not a mean of the
  # class curves.
  all_fpr = np.unique(np.concatenate([fpr for fpr, _ in class_curves.values()]))
  mean_tpr = np.zeros_like(all_fpr)
  for fpr, tpr in class_curves.values():
    mean_tpr += np.interp(all_fpr, fpr, tpr)
  mean_tpr /= classes.size
  frames.append(to_curve_frame('Macro-average', all_fpr, mean_tpr))

  df = pd.concat(frames).reset_index(drop=True)

  sns.relplot(
    data=df,
    x='False positive rate',
    y='True positive rate',
    kind='line',
    hue='Class',
    style='Class',
    palette=sns.color_palette(
        palette='Pastel2',
        n_colors=df['Class'].unique().size
    ),
    aspect=1.2
  )



# `p` was never defined in this notebook (NameError on a fresh kernel).
# NOTE(review): assumed to have held the training-set predictions — confirm.
get_roc_curve(train_y, multiboost.predict(train_x))
