# Dependencies

In [None]:
import numpy as np
from numpy import mean
from numpy import std

In [None]:
import pandas as pd

In [None]:
import sklearn as sk
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.inspection import permutation_importance
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

# Make Synthetic Data

In [None]:
# Settings for the synthetic two-feature classification dataset.
synthetic_data_params = {
    'n_samples': 2000,
    'n_features': 2,  # number of x columns
    'n_redundant': 0,
    'n_clusters_per_class': 2,
    'flip_y': 0.6,  # add noise
    'weights': [0.8],  # set dataset imbalance
    'class_sep': 1.1,  # set class separation
    'random_state': 1,  # fixed seed so the dataset is reproducible
}
# X holds the generated feature columns, Y the matching class labels.
X, Y = make_classification(**synthetic_data_params)

In [None]:
# Place the feature matrix and the label vector side by side in one dataframe.
feature_frame = pd.DataFrame(X)
label_series = pd.Series(Y)
df = pd.concat([feature_frame, label_series], axis=1)

In [None]:
# Give the two feature columns and the label column readable names.
column_names = ['x1', 'x2', 'y']
df.columns = column_names
#print(df) # uncomment to check the dataset

## Plot Dataset

In [None]:
def plot(df: pd.DataFrame, x1: str, x2: str, y: str, title: str = '', save: bool = False, figname='figure.png'):
    """Scatter-plot two columns of ``df``, one colour per label value (0 and 1).

    Args:
        df: Dataframe holding the columns named by ``x1``, ``x2`` and ``y``.
        x1: Name of the column drawn on the horizontal axis.
        x2: Name of the column drawn on the vertical axis.
        y: Name of the label column; rows equal to 0 and rows equal to 1
            are drawn as two separate series.
        title: Text shown above the figure.
        save: When true, the figure is written to ``figname`` before it is shown.
        figname: Path handed to ``plt.savefig`` when ``save`` is true.
    """
    plt.figure(figsize=(14, 7))
    # One scatter series per label value, drawn in the order 0 then 1.
    for class_value, legend_label in ((0, 'y = 0'), (1, 'y = 1')):
        members = df[df[y] == class_value]
        plt.scatter(x=members[x1], y=members[x2], label=legend_label)
    plt.title(title, fontsize=20)
    plt.legend()
    if save:
        plt.savefig(figname, dpi=300, bbox_inches='tight', pad_inches=0)
    plt.show()

In [None]:
# Show the synthetic dataset: x1 against x2, coloured by the label y.
plot(df, 'x1', 'x2', 'y')


# Supervised Models

## Group Models

In [None]:
# Two classifier types evaluated on the whole dataset ("group" models).
clf1 = RandomForestClassifier(max_depth=2, random_state=1) # random forest (RF)
clf2 = GradientBoostingClassifier(max_depth=2, random_state=1) # scikit-learn gradient boosting (GB)

# Features are every column except the label column 'y'.
X = df.drop(columns=['y'])
Y = df['y']

# Out-of-fold predictions from 10-fold cross-validation.
ypred1 = cross_val_predict(clf1, X, Y, cv=10)
ypred2 = cross_val_predict(clf2, X, Y, cv=10)
#print(classification_report(Y, ypred1)) # uncomment to print the RF classification report for the dataframe
#print(classification_report(Y, ypred2)) # uncomment to print the GB classification report for the dataframe



## Bespoke Models

In [None]:
# split a dataframe into consecutive, (near-)equal chunks by row position
def split_dataframe_by_position(df, splits):
    """Split ``df`` into ``splits`` consecutive row chunks without losing rows.

    Every chunk holds ``len(df) // splits`` rows. When the length is not
    evenly divisible, the leftover rows are handed out one each to the first
    chunks, so no row is dropped. For evenly divisible lengths all chunks
    have the same size.

    Args:
        df: Dataframe to split; rows are taken in positional order.
        splits: Number of chunks to produce.

    Returns:
        A list of ``splits`` dataframes which, concatenated in order, give
        back ``df``. A negative ``splits`` yields an empty list.

    Raises:
        ZeroDivisionError: If ``splits`` is 0.
    """
    chunk_size, remainder = divmod(len(df), splits)
    dataframes = []
    start = 0
    for position in range(splits):
        # the first `remainder` chunks take one extra row each
        end = start + chunk_size + (1 if position < remainder else 0)
        dataframes.append(df.iloc[start:end, :])
        start = end
    return dataframes
# Partition the dataset into 25 consecutive chunks by row position.
split_dataframes = split_dataframe_by_position(df=df, splits=25)
#print(split_dataframes) # uncomment to check the split dataframes

In [None]:
# Cross-validate both classifier types on every split dataframe separately.
# The loop variable is deliberately not called `df`: reusing that name would
# leave `df` bound to the last split instead of the full dataset once the
# loop has finished.
for split_df in split_dataframes:
    X = split_df.drop(['y'],axis=1)
    Y = split_df.y
    clf1 = RandomForestClassifier(max_depth=2, random_state=1) # random forest (RF)
    clf2 = GradientBoostingClassifier(max_depth=2, random_state=1) # scikit-learn gradient boosting (GB)
    # ypred1/ypred2 are overwritten on every pass; after the loop they hold
    # the predictions for the last split only
    ypred1 = cross_val_predict(clf1, X, Y, cv=10)
    ypred2 = cross_val_predict(clf2, X, Y, cv=10)
    #print(classification_report(Y, ypred1)) # uncomment to print the RF classification report for each split dataframe
    #print(classification_report(Y, ypred2)) # uncomment to print the GB classification report for each split dataframe


# Unsupervised Clustering

In [None]:
# Calculate permutation feature importances (PFIs) for a random forest fitted
# on each split dataframe, then join them column-wise: rows are features,
# columns are split dataframes.
rf_importances = []
# The loop variable is not called `df`, so the full dataset bound to `df`
# is not replaced by the last split.
for split_df in split_dataframes:
    X = split_df.drop(['y'],axis=1)
    Y = split_df.y
    clf1 = RandomForestClassifier(max_depth=2, random_state=1)
    clf1.fit(X, Y)
    # NOTE: importances are measured on the same rows the model was fitted on
    result1 = permutation_importance(clf1, X, Y, n_repeats=10, random_state=1, n_jobs=2)
    # mean importance over the 10 repeats, labelled with the feature names
    feature_importances1 = pd.Series(result1.importances_mean, index=X.columns)
    rf_importances.append(feature_importances1)
coef1 = pd.concat(rf_importances,axis=1)

In [None]:
# Calculate permutation feature importances (PFIs) for a scikit-learn
# gradient boosting model fitted on each split dataframe, then join them
# column-wise: rows are features, columns are split dataframes.
gb_importances = []
# The loop variable is not called `df`, so the full dataset bound to `df`
# is not replaced by the last split.
for split_df in split_dataframes:
    X = split_df.drop(['y'],axis=1)
    Y = split_df.y
    clf2 = GradientBoostingClassifier(max_depth=2, random_state=1)
    clf2.fit(X, Y)
    # NOTE: importances are measured on the same rows the model was fitted on
    result2 = permutation_importance(clf2, X, Y, n_repeats=10, random_state=1, n_jobs=2)
    # mean importance over the 10 repeats, labelled with the feature names
    feature_importances2 = pd.Series(result2.importances_mean, index=X.columns)
    gb_importances.append(feature_importances2)
coef2 = pd.concat(gb_importances,axis=1)

In [None]:
# Stack the random forest PFIs (coef1) on top of the gradient boosting PFIs
# (coef2); the columns still correspond to the split dataframes.
coef = pd.concat([coef1, coef2], axis=0)
print(coef)

In [None]:
# Plot a clustermap of the pairwise correlations between the PFI columns
# (one column per split dataframe). The correlation matrix is the data that
# gets clustered, using Ward linkage; no metric is passed, so seaborn's
# default distance metric applies.
# The font scale is set BEFORE drawing: plot settings only affect figures
# created afterwards, so setting it after the clustermap call left the first
# run of this cell unscaled while re-runs picked it up. Note that sns.set
# also applies seaborn's default theme.
sns.set(font_scale=1.2)
sns.clustermap(coef.corr(),yticklabels=True,xticklabels=True,method='ward', figsize=(7,7), cmap="coolwarm")