In [None]:
# Magic Variables
# Folder that the dataset CSV file names are joined onto.
FILE_DIR = "example_dir/Data/Datasets"
# Number of random train/test splits the baseline scores are averaged over.
NUM_CV = 100

# Dataset selection flags:
#   classification: True -> classification CSV, False -> regression CSV
#   rate:           True -> "rate" target,      False -> "change" target
#   low_thres:      (True -> .1/yr, False --> .2/yr); only consulted when rate is True
low_thres = True
rate = True
classification = True

# Imports

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
from imblearn.metrics import geometric_mean_score
from google.colab import drive

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
drive.mount("/content/drive")

In [None]:
# Pick the CSV file and target column from the three configuration flags.
# `low_thres` only matters when `rate` is set; the "change" datasets have a
# single variant.
if not rate:
  target_variant = "change"
elif not low_thres:
  target_variant = "rate"
else:
  target_variant = "rate_low_thres"

task_kind = "classification" if classification else "regression"

# (task, variant) -> (CSV file name, target column name)
dataset_table = {
  ("classification", "change"): (
    "post_con_change_classification_2022-04-13 23:13:29.csv",
    'Categorical Unidimensional Change',
  ),
  ("classification", "rate"): (
    "post_con_rate_classification_2022-04-13 23:13:29.csv",
    'Categorical Unidimensional Rate',
  ),
  ("classification", "rate_low_thres"): (
    "lower_thres_post_con_rate_classification_2022-04-13 23:14:55.csv",
    'Categorical Unidimensional Rate',
  ),
  ("regression", "change"): (
    "post_con_change_regression_2022-04-13 23:13:29.csv",
    'Unidimensional Change',
  ),
  ("regression", "rate"): (
    "post_con_rate_regression_2022-04-13 23:13:29.csv",
    'Unidimensional Rate',
  ),
  ("regression", "rate_low_thres"): (
    "lower_thres_post_con_rate_regression_2022-04-13 23:14:55.csv",
    'Unidimensional Rate',
  ),
}

csv_name, y_col = dataset_table[(task_kind, target_variant)]
dataset_dir = os.path.join(FILE_DIR, csv_name)
print(y_col)

In [None]:
# Read the selected CSV, then separate the target column (y) from the
# remaining columns (X).
df = pd.read_csv(dataset_dir)
y = df[y_col]
X = df.drop(columns=y_col)

In [None]:
# Number of target values (rows) that were loaded.
print(y.shape[0])

# Classifiers

In [None]:
# Label of the rarest class in y.
# `value_counts()` is indexed by class label, so `idxmin()` returns the label
# itself. `argmin()` would return the integer *position* of the smallest
# count, which only equals the label when labels happen to be 0..k-1 in
# descending-frequency order.
least = y.value_counts().idxmin()

# Baselines that ignore the features entirely:
majority_clf = DummyClassifier(strategy="most_frequent")  # always the most frequent class
minority_clf = DummyClassifier(strategy="constant", constant=least)  # always the rarest class
random_clf = DummyClassifier(strategy="uniform")  # uniformly random class
stratified_clf = DummyClassifier(strategy="stratified")  # random, following the fitted class distribution

In [None]:
# Fit every baseline on the complete dataset.
for baseline in (majority_clf, minority_clf, random_clf, stratified_clf):
  baseline.fit(X, y)

# Show the last fitted estimator as the cell output.
stratified_clf

In [None]:
# Score every baseline over NUM_CV random 80/20 train/test splits and report
# the mean F1, accuracy and geometric-mean score per baseline.
baselines = {
  "majority": majority_clf,
  "minority": minority_clf,
  "random": random_clf,
  "stratified": stratified_clf,
}

# preds[name] collects one (y_true, y_pred) pair per split.
preds = {name: [] for name in baselines}
for i in range(NUM_CV):
  # random_state=i makes split i identical across runs.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

  for name, clf in baselines.items():
    # Refit on the training rows only, so the class frequencies of the
    # held-out rows never reach the baseline being scored.
    clf.fit(X_train, y_train)
    preds[name].append((y_test, clf.predict(X_test)))

# Average each metric over the splits.
gmeans = {}
f1 = {}
accuracy = {}
for k, v in preds.items():
  gmeans[k] = float(np.mean([geometric_mean_score(y_true, y_pred) for y_true, y_pred in v]))
  # zero_division=0: score a split as 0 instead of warning when the F1
  # denominator is zero (e.g. the positive class is never predicted).
  f1[k] = float(np.mean([f1_score(y_true, y_pred, zero_division=0) for y_true, y_pred in v]))
  accuracy[k] = float(np.mean([accuracy_score(y_true, y_pred) for y_true, y_pred in v]))

print("F1: ",f1)
print("Accuracy: ",accuracy)
print("Gmeans: ",gmeans)

F1:  {'majority': 0.0, 'minority': 0.3746078283996168, 'random': 0.29585734194950214, 'stratified': 0.22299680562279825}
Accuracy:  {'majority': 0.7662500000000001, 'minority': 0.23375000000000015, 'random': 0.4860000000000001, 'stratified': 0.62325}
Gmeans:  {'majority': 0.0, 'minority': 0.0, 'random': 0.4746473602025884, 'stratified': 0.3939482664739108}
