<a href="https://colab.research.google.com/github/Hubert26/suicides_IPPAN/blob/main/ml_analize.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Liblaries and settings

In [284]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import graphviz
import os

!pip install -U dtreeviz
import dtreeviz

from google.colab import files

#Ustawienie braku maksymalnej ilości wyświetlanych kolumn
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# avoid "Arial font not found warnings"
import logging
logging.getLogger('matplotlib.font_manager').setLevel(level=logging.CRITICAL)

!pip install eli5


np.random.seed(42)



#Function definitions

##thresholds settings

In [285]:
# Ustawienie progów dla wysokiego ryzyka
risk_thresholds = [0.5, 0.9, 0.95, 0.99]

##plot_confusion_matrix

In [286]:
def plot_confusion_matrix(cm, model):
    # klasyfikacja binarna
    cm = cm[::-1]
    cm = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_1', 'true_0'])

    fig = ff.create_annotated_heatmap(z=cm.values, x=list(cm.columns), y=list(cm.index),
                                      colorscale='ice', showscale=True, reversescale=True)
    fig.update_layout(
        width=500,
        height=500,
        title=model+' Confusion Matrix',
        font_size=16,
        template='plotly_dark'
        )
    fig.show()
    return fig

##plot_roc_curve

**pogrubiony tekst**
 Receiver Operating Characteristic
1. True Positive Rate (TPR) = FP / (FP+TN)
2. False Positive Rate (FPR) = TP / (TP+FN)

In [287]:
def plot_roc_curve(y_test, y_pred, model):
    # Binary classification
    from sklearn.metrics import roc_curve
    fpr, tpr, tresh = roc_curve(y_test, y_pred, pos_label=1)

    fig = go.Figure(
        data=[
            go.Scatter(x=fpr, y=tpr,
                    line_color='red',
                    name='ROC Curve'),
            go.Scatter(x=[0, 1], y=[0, 1],
                    mode='lines',
                    line_dash='dash',
                    line_color='#F012BE')
        ],
        layout=go.Layout(xaxis_title='False Positive Rate',
                         yaxis_title='True Positive Rate',
                         title = model + ' ROC Curve',
                         showlegend=False,
                         template='plotly_dark',
                         width=700))
    fig.show()
    return fig


##model_validation


Precision (precyzja) = TP / (TP+FP): Ile obserwacji przewidzianych jako pozytywne jest w rzeczywistości pozytywne.

Recall (czułość) = TP / (TP+FN): Ile obserwacji z wszystkich pozywtywnych sklasyfikowaliśmy jako pozytywne.

F1-score: ważona średnia harmonicza między precyzją a czułością;

Support (wsparcie): liczba próbek, które należą do każdej z klas;

Accuracy (dokładność) = (TP+TN) /(TP+TN+FP+FN))

FPR (False Positive Rate) = FP / (FP+TN) [type I error]

FNR (False Negative Rate) = FN / (FN+TP) [type II error]

In [288]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, precision_recall_fscore_support

def model_validation(model, X_test, y_test, risk_thresholds = [0.5, 0.9, 0.95, 0.99]):
    # Sprawdzenie, czy y_test i X_test nie są puste
    if y_test is None or X_test is None or len(y_test) == 0 or len(X_test) == 0:
        raise ValueError("y_test and X_test must not be empty.")

    # Sprawdzenie rozmiarów X_test i y_test
    if len(X_test) != len(y_test):
        raise ValueError(f"Inconsistent number of samples: X_test has {len(X_test)} samples, y_test has {len(y_test)} samples.")

    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]

    # Obliczenie miar jakości modelu
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

    # Utworzenie ramki danych z wynikami
    model_results = pd.DataFrame({
        'recall': [recall],
        'accuracy': [accuracy],
        'precision': [precision],
        'f1': [f1],
    })

    # Obliczenie miar jakości modelu dla poszczególnych klas
    precision, recall, f1, support = precision_recall_fscore_support(y_test, y_pred, average=None, zero_division=0)

     # Tworzenie DataFrame z wynikami dla poszczególnych klas
    class_results = pd.DataFrame({
        'Class': np.unique(y_test),
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Support': support
    })

    # Dodanie wyników dla każdej klasy jako oddzielne kolumny
    for idx, class_ in enumerate(np.unique(y_test)):
        model_results[f'precision_{class_}'] = precision[idx]
        model_results[f'recall_{class_}'] = recall[idx]
        model_results[f'f1_{class_}'] = f1[idx]
        model_results[f'Support_{class_}'] = support[idx]

    for threshold in risk_thresholds:
        risk_metrics = risk_group_metrics(y_test, y_pred_prob, threshold)
        for key, value in risk_metrics.items():
            model_results[f'{key}_{threshold}'] = value

    return model_results

##bootstrap_auc

In [289]:
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample

def bootstrap_auc(y_true, y_pred_prob, n_bootstraps=1000, alpha=0.95):
    # Upewnienie się, że y_true i y_pred_prob są jednowymiarowymi tablicami
    y_true = np.asarray(y_true).ravel()
    y_pred_prob = np.asarray(y_pred_prob).ravel()

    rng = np.random.RandomState(seed=42)
    bootstrapped_aucs = []

    for i in range(n_bootstraps):
        # Przykladowanie z powtórzeniami
        indices = rng.randint(0, len(y_pred_prob), len(y_pred_prob))
        if len(np.unique(y_true[indices])) < 2:
            # Przeskakujemy przypadki, gdzie brakuje jednej z klas w próbie bootstrapowej
            continue

        score = roc_auc_score(y_true[indices], y_pred_prob[indices])
        bootstrapped_aucs.append(score)

    if len(bootstrapped_aucs) == 0:
        raise ValueError("Wszystkie bootstrapowe próbki były nieprawidłowe.")

    sorted_scores = np.array(bootstrapped_aucs)
    sorted_scores.sort()

    # Obliczanie przedziału ufności
    lower_bound = np.percentile(sorted_scores, (1 - alpha) / 2 * 100)
    upper_bound = np.percentile(sorted_scores, (1 + alpha) / 2 * 100)

    return lower_bound, upper_bound

##risk_group_metrics

In [290]:
def risk_group_metrics(y_test, y_pred_prob, threshold):
    high_risk = y_pred_prob >= threshold
    y_pred_high_risk = np.zeros_like(y_pred_prob)
    y_pred_high_risk[high_risk] = 1

    y_test_high_risk = y_test[high_risk]
    y_pred_prob_high_risk = y_pred_prob[high_risk]

    # Sprawdzamy, czy mamy odpowiednią liczbę próbek do utworzenia macierzy pomyłek
    if len(np.unique(y_test_high_risk)) < 2:
        # Jeśli mamy tylko jedną klasę w danych wysokiego ryzyka, to zwracamy wartości domyślne
        tn = fp = fn = tp = np.nan
        risk_ratio = low_risk_positive_rate = high_risk_positive_rate = specificity = roc_score = lower = upper = fpr = fnr = np.nan
    else:
        cm = confusion_matrix(y_test, y_pred_high_risk)
        tn, fp, fn, tp = cm.ravel()

        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        high_risk_positive_rate = tp / (tp + fp) if (tp + fp) > 0 else 0
        low_risk_positive_rate = fn / (fn + tn) if (fn + tn) > 0 else 0
        risk_ratio = high_risk_positive_rate / low_risk_positive_rate if (low_risk_positive_rate) > 0 else 0

        roc_score = roc_auc_score(y_test_high_risk, y_pred_prob_high_risk)
        lower, upper = bootstrap_auc(y_test_high_risk, y_pred_prob_high_risk, n_bootstraps=1000, alpha=0.95)

    return {
        'class_0_pred': (y_pred_high_risk == 0).sum(),
        'class_1_pred': (y_pred_high_risk == 1).sum(),
        'AUROC': roc_score,
        'AUROClow': lower,
        'AUROCup': upper,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'tp': tp,
        'fpr': fpr,
        'fnr': fnr,
        'specificity': specificity,
        'high_risk_positive_rate': high_risk_positive_rate,
        'low_risk_positive_rate': low_risk_positive_rate,
        'risk_ratio': risk_ratio
    }

##plot_learning_curve

In [291]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

#Data exploration

In [292]:
df_raw = pd.read_csv('https://raw.githubusercontent.com/Hubert26/suicides_IPPAN/main/data/out_exploration_suicides.csv', delimiter=',', low_memory=False, index_col=False, dtype={'DateY': str, 'DateM': str,})


In [293]:
df_raw = df_raw.dropna(subset=['DateY'])

In [294]:
df_raw['DateY'].unique()

array(['2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
       '2021', '2022', '2023'], dtype=object)

In [295]:
df_raw['DateM'].unique()

array(['05', '06', '10', '11', '12', '03', '04', '07', '08', '09', '01',
       '02'], dtype=object)

In [296]:
df_raw.shape

(127034, 27)

In [297]:
df_raw.head(5)

Unnamed: 0,Income,Age1,Fatal,Method,DateM,Gender,Education,AbuseInfo,DateY,ID_samobójcy,WorkInfo,Substance,Age2,Date,Place,CountContext,Marital,Context_Finances,Context_CloseDeath,Context_FamilyConflict,Context_Disability,Context_HeartBreak,Context_Crime,Context_SchoolWork,Context_MentalHealth,Context_Other,Context_HealthLoss
0,Dependent,07-12,0.0,Self-harm,5,F,Primary,Not,2013,11477120400,Student,Alco,00-18,5.2013,Forest,1.0,Single,0,0,0,0,0,0,0,0,1,0
1,Dependent,07-12,0.0,Other,6,F,Pre-primary,Not,2013,11481530100,Student,Sober,00-18,6.2013,Forest,1.0,Single,0,0,1,0,0,0,0,0,0,0
2,Dependent,07-12,1.0,Hanging,10,F,Pre-primary,Not,2013,11493674400,Student,Sober,00-18,10.2013,House,1.0,Single,0,0,0,0,0,0,0,0,0,0
3,Dependent,07-12,0.0,Other,10,M,Primary,Not,2013,11494487000,Student,Sober,00-18,10.2013,Other,1.0,Single,0,0,1,0,0,0,0,0,0,0
4,Dependent,07-12,1.0,Hanging,10,F,Pre-primary,Not,2013,11494823000,Student,Sober,00-18,10.2013,House,1.0,Single,0,0,0,0,0,0,0,0,0,0


##Filters and file name

['07-12',
 '13-18',
 '19-24',
 '25-29',
 '30-34',
 '35-39',
 '40-44',
 '45-49',
 '50-54',
 '55-59',
 '60-64',
 '65-69',
 '70-74',
 '75-79',
 '80-84',
 '85+']

 sorted(list(set(df_raw['Age1'])))

['00-18', '19-34', '35-64', '65+']

 sorted(list(set(df_raw['Age2'])))

['F', 'M']

[2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

sorted(list(set(df_raw['DateY'])))

In [298]:
df_raw['Age'] = df_raw['Age2']

In [299]:
age_group = ['00-18']
gender = ['F']
year = sorted(list(set(df_raw['DateY'])))

In [300]:
age_group

['00-18']

In [301]:
age_group_file_title = "age_group_[" + ''.join(filter(str.isdigit, age_group[0][:3])) + "-" + ''.join(filter(str.isdigit, age_group[-1][-3:])) + "]"
year_group_file_title = "year_[" + str(year[0]) + '-' + str(year[-1]) + "]"

In [302]:
file_title = age_group_file_title + '_' + "gender_" + str(gender) + "_" + year_group_file_title
file_title

"age_group_[00-18]_gender_['F']_year_[2013-2023]"

##Data and NaN exploration

In [303]:
df_data = df_raw[df_raw['Age2'].isin(age_group) & df_raw['Gender'].isin(gender) & df_raw['DateY'].isin(year)]

In [304]:
df_data.shape

(7361, 28)

In [305]:
df_data['Fatal'].value_counts()

Fatal
0.0    6907
1.0     454
Name: count, dtype: int64

In [306]:
##Brakujące dane
total = df_data.isnull().sum().sort_values(ascending=False)
percent = 100*(df_data.isnull().sum()/df_data.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [307]:
missing_data.head(5)

Unnamed: 0,Total,Percent
Income,0,0.0
Age1,0,0.0
Context_HealthLoss,0,0.0
Context_Other,0,0.0
Context_MentalHealth,0,0.0


##Dropping columns

In [308]:
df_data.columns

Index(['Income', 'Age1', 'Fatal', 'Method', 'DateM', 'Gender', 'Education',
       'AbuseInfo', 'DateY', 'ID_samobójcy', 'WorkInfo', 'Substance', 'Age2',
       'Date', 'Place', 'CountContext', 'Marital', 'Context_Finances',
       'Context_CloseDeath', 'Context_FamilyConflict', 'Context_Disability',
       'Context_HeartBreak', 'Context_Crime', 'Context_SchoolWork',
       'Context_MentalHealth', 'Context_Other', 'Context_HealthLoss', 'Age'],
      dtype='object')

In [309]:
df_data.drop(['ID_samobójcy', 'Date', 'CountContext'], inplace=True,	 axis=1, errors='ignore')

In [310]:
df_data.drop(['DateM', 'DateY'], inplace=True,	 axis=1, errors='ignore')

In [311]:
df_data.drop(['Age1', 'Age2'], inplace=True,	 axis=1, errors='ignore')

In [312]:
df_data.drop(['Age', 'Gender'], inplace=True,	 axis=1, errors='ignore')

In [313]:
#df_data.drop(['Method'], inplace=True,	 axis=1, errors='ignore')

In [314]:
df_data.columns

Index(['Income', 'Fatal', 'Method', 'Education', 'AbuseInfo', 'WorkInfo',
       'Substance', 'Place', 'Marital', 'Context_Finances',
       'Context_CloseDeath', 'Context_FamilyConflict', 'Context_Disability',
       'Context_HeartBreak', 'Context_Crime', 'Context_SchoolWork',
       'Context_MentalHealth', 'Context_Other', 'Context_HealthLoss'],
      dtype='object')

##Dummies

In [315]:
data_dummies = pd.get_dummies(df_data)

In [316]:
data_dummies.shape

(7361, 56)

In [317]:
data_dummies = data_dummies.astype(np.uint8)

In [318]:
data_dummies.head()

Unnamed: 0,Fatal,Context_Finances,Context_CloseDeath,Context_FamilyConflict,Context_Disability,Context_HeartBreak,Context_Crime,Context_SchoolWork,Context_MentalHealth,Context_Other,Context_HealthLoss,Income_Benefits,Income_Dependent,Income_NoSteady,Income_Steady,Method_Drowning,Method_Drugs,Method_Gas,Method_Hanging,Method_Jumping,Method_Other,Method_Poisoning,Method_Schooting,Method_Self-harm,Method_Vehicle,Education_Pre-primary,Education_Primary,Education_Secondary,Education_Vocational,AbuseInfo_Alco,AbuseInfo_Alco&OtherSub,AbuseInfo_Not,AbuseInfo_OtherSub,WorkInfo_Employed,WorkInfo_Student,WorkInfo_Unemployed,Substance_Alco,Substance_Alco&OtherSub,Substance_OtherSub,Substance_Sober,Place_Forest,Place_House,Place_Institution,Place_Isolation,Place_Other,Place_PoliceArmy,Place_Railway,Place_Road,Place_School,Place_UtilitySpaces,Place_WaterRes,Place_Work,Marital_Cohabitant,Marital_Married,Marital_Single,Marital_Single.1
0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
7,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0


#DTC

In [319]:
Y = data_dummies['Fatal']
X = data_dummies.drop('Fatal', axis=1, errors='ignore')

##Cross-validation

In [320]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold

# Obliczenie wag klas
class_weights = compute_class_weight('balanced', classes=np.unique(Y), y=Y)
# Utworzenie słownika wag klas
class_weight_dict = dict(zip(np.unique(Y), class_weights))

dtc = DecisionTreeClassifier(max_depth=None, min_samples_split=10, min_samples_leaf=10)

# Definicja walidacji krzyżowej
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

all_results = pd.DataFrame()

# Przeprowadzenie walidacji krzyżowej
for fold, (train_index, test_index) in enumerate(skf.split(X, Y), 1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Trenowanie modelu na danych treningowych z wagami
    sample_weights = np.array([class_weight_dict[label] for label in y_train])
    dtc.fit(X_train, y_train, sample_weight=sample_weights)

    # Walidacja modelu na danych testowych
    model_results = model_validation(dtc, X_test, y_test, risk_thresholds)

    model_results['features'] = X_train.shape[1]
    model_results['train_size'] = len(y_train)
    model_results['class_0_train_size'] = (y_train == 0).sum()
    model_results['class_1_train_size'] = (y_train == 1).sum()
    model_results['test_size'] = len(y_test)
    model_results['class_weight_0'] = class_weight_dict[0]
    model_results['class_weight_1'] = class_weight_dict[1]
    model_results['fold'] = str(fold)

    # Dodanie wyników do ramki danych
    all_results = pd.concat([all_results, model_results], ignore_index=True)


In [321]:
all_results

Unnamed: 0,recall,accuracy,precision,f1,precision_0,recall_0,f1_0,Support_0,precision_1,recall_1,f1_1,Support_1,class_0_pred_0.5,class_1_pred_0.5,AUROC_0.5,AUROClow_0.5,AUROCup_0.5,tn_0.5,fp_0.5,fn_0.5,tp_0.5,fpr_0.5,fnr_0.5,specificity_0.5,high_risk_positive_rate_0.5,low_risk_positive_rate_0.5,risk_ratio_0.5,class_0_pred_0.9,class_1_pred_0.9,AUROC_0.9,AUROClow_0.9,AUROCup_0.9,tn_0.9,fp_0.9,fn_0.9,tp_0.9,fpr_0.9,fnr_0.9,specificity_0.9,high_risk_positive_rate_0.9,low_risk_positive_rate_0.9,risk_ratio_0.9,class_0_pred_0.95,class_1_pred_0.95,AUROC_0.95,AUROClow_0.95,AUROCup_0.95,tn_0.95,fp_0.95,fn_0.95,tp_0.95,fpr_0.95,fnr_0.95,specificity_0.95,high_risk_positive_rate_0.95,low_risk_positive_rate_0.95,risk_ratio_0.95,class_0_pred_0.99,class_1_pred_0.99,AUROC_0.99,AUROClow_0.99,AUROCup_0.99,tn_0.99,fp_0.99,fn_0.99,tp_0.99,fpr_0.99,fnr_0.99,specificity_0.99,high_risk_positive_rate_0.99,low_risk_positive_rate_0.99,risk_ratio_0.99,features,train_size,class_0_train_size,class_1_train_size,test_size,class_weight_0,class_weight_1,fold
0,0.883311,0.883311,0.945953,0.905408,0.987118,0.88712,0.934451,691,0.327586,0.826087,0.469136,46,621,116,0.837551,0.758157,0.910654,613,78,8,38,0.11288,0.173913,0.88712,0.327586,0.012882,25.428879,690,47,0.607843,0.415602,0.794069,674,17,16,30,0.024602,0.347826,0.975398,0.638298,0.023188,27.526596,708,29,0.443452,0.217938,0.681585,683,8,25,21,0.011577,0.543478,0.988423,0.724138,0.035311,20.507586,729,8,0.25,0.0,0.6,689.0,2.0,40.0,6.0,0.002894,0.869565,0.997106,0.75,0.05487,13.66875,55,6624,6216,408,737,0.532865,8.106828,1
1,0.849185,0.849185,0.947653,0.883115,0.991525,0.846599,0.913349,691,0.273973,0.888889,0.418848,45,590,146,0.875472,0.814131,0.930794,585,106,5,40,0.153401,0.111111,0.846599,0.273973,0.008475,32.328767,677,59,0.6375,0.477626,0.789884,667,24,10,35,0.034732,0.222222,0.965268,0.59322,0.014771,40.161017,695,41,0.587302,0.384589,0.777311,677,14,18,27,0.02026,0.4,0.97974,0.658537,0.025899,25.426829,735,1,,,,,,,,,,,,,,55,6625,6216,409,736,0.532865,8.106828,2
2,0.902174,0.902174,0.946227,0.918173,0.984351,0.910275,0.945865,691,0.360825,0.777778,0.492958,45,639,97,0.851843,0.76429,0.933246,629,62,10,35,0.089725,0.222222,0.910275,0.360825,0.015649,23.056701,697,39,0.771605,0.591049,0.927424,679,12,18,27,0.017366,0.4,0.982634,0.692308,0.025825,26.807692,712,24,0.752632,0.340341,0.99219,686,5,26,19,0.007236,0.577778,0.992764,0.791667,0.036517,21.679487,734,2,1.0,1.0,1.0,690.0,1.0,44.0,1.0,0.001447,0.977778,0.998553,0.5,0.059946,8.340909,55,6625,6216,409,736,0.532865,8.106828,3
3,0.862772,0.862772,0.940451,0.891001,0.983607,0.868307,0.922367,691,0.277778,0.777778,0.409357,45,610,126,0.711774,0.602688,0.805941,600,91,10,35,0.131693,0.222222,0.868307,0.277778,0.016393,16.944444,689,47,0.524545,0.332426,0.678736,666,25,23,22,0.036179,0.511111,0.963821,0.468085,0.033382,14.022202,712,24,0.688811,0.44783,0.899306,678,13,34,11,0.018813,0.755556,0.981187,0.458333,0.047753,9.598039,734,2,0.5,0.5,0.5,690.0,1.0,44.0,1.0,0.001447,0.977778,0.998553,0.5,0.059946,8.340909,55,6625,6216,409,736,0.532865,8.106828,4
4,0.881793,0.881793,0.94292,0.904005,0.983974,0.888567,0.93384,691,0.3125,0.777778,0.44586,45,624,112,0.824119,0.741228,0.896728,614,77,10,35,0.111433,0.222222,0.888567,0.3125,0.016026,19.5,688,48,0.588022,0.407883,0.747485,672,19,16,29,0.027496,0.355556,0.972504,0.604167,0.023256,25.979167,709,27,0.506173,0.286307,0.745568,682,9,27,18,0.013025,0.6,0.986975,0.666667,0.038082,17.506173,734,2,1.0,1.0,1.0,690.0,1.0,44.0,1.0,0.001447,0.977778,0.998553,0.5,0.059946,8.340909,55,6625,6216,409,736,0.532865,8.106828,5
5,0.899457,0.899457,0.955434,0.918204,0.9936,0.898698,0.943769,691,0.369369,0.911111,0.525641,45,625,111,0.828223,0.732695,0.899853,621,70,4,41,0.101302,0.088889,0.898698,0.369369,0.0064,57.713964,690,46,0.798174,0.65161,0.922923,674,17,16,29,0.024602,0.355556,0.975398,0.630435,0.023188,27.1875,705,31,0.83631,0.606366,0.992351,684,7,21,24,0.01013,0.466667,0.98987,0.774194,0.029787,25.990783,734,2,,,,,,,,,,,,,,55,6625,6216,409,736,0.532865,8.106828,6
6,0.860054,0.860054,0.948731,0.890566,0.991639,0.858177,0.920093,691,0.289855,0.888889,0.437158,45,598,138,0.85523,0.78756,0.914304,593,98,5,40,0.141823,0.111111,0.858177,0.289855,0.008361,34.666667,673,63,0.665314,0.530064,0.792877,662,29,11,34,0.041968,0.244444,0.958032,0.539683,0.016345,33.018759,704,32,0.652273,0.419206,0.865393,681,10,23,22,0.014472,0.511111,0.985528,0.6875,0.03267,21.043478,734,2,0.0,0.0,0.0,690.0,1.0,44.0,1.0,0.001447,0.977778,0.998553,0.5,0.059946,8.340909,55,6625,6216,409,736,0.532865,8.106828,7
7,0.89538,0.89538,0.952653,0.914754,0.991974,0.895652,0.941356,690,0.362832,0.891304,0.515723,46,623,113,0.804201,0.71797,0.882981,618,72,5,41,0.104348,0.108696,0.895652,0.362832,0.008026,45.20885,689,47,0.478431,0.309191,0.648237,673,17,16,30,0.024638,0.347826,0.975362,0.638298,0.023222,27.486702,702,34,0.678947,0.471886,0.849829,675,15,27,19,0.021739,0.586957,0.978261,0.558824,0.038462,14.529412,731,5,0.875,0.625,1.0,689.0,1.0,42.0,4.0,0.001449,0.913043,0.998551,0.8,0.057456,13.92381,55,6625,6217,408,736,0.532865,8.106828,8
8,0.891304,0.891304,0.953678,0.912173,0.993528,0.889855,0.938838,690,0.355932,0.913043,0.512195,46,618,118,0.780075,0.690181,0.868571,614,76,4,42,0.110145,0.086957,0.889855,0.355932,0.006472,54.991525,683,53,0.647101,0.490704,0.789487,667,23,16,30,0.033333,0.347826,0.966667,0.566038,0.023426,24.162736,698,38,0.66087,0.470542,0.845253,675,15,23,23,0.021739,0.5,0.978261,0.605263,0.032951,18.368421,730,6,0.5,0.1,0.9,688.0,2.0,42.0,4.0,0.002899,0.913043,0.997101,0.666667,0.057534,11.587302,55,6625,6217,408,736,0.532865,8.106828,9
9,0.872283,0.872283,0.942713,0.897449,0.985342,0.876812,0.927914,690,0.303279,0.804348,0.440476,46,614,122,0.790938,0.706064,0.869025,605,85,9,37,0.123188,0.195652,0.876812,0.303279,0.014658,20.690346,682,54,0.624143,0.468925,0.768102,663,27,19,27,0.03913,0.413043,0.96087,0.5,0.027859,17.947368,707,29,0.44697,0.231534,0.69481,679,11,28,18,0.015942,0.608696,0.984058,0.62069,0.039604,15.672414,730,6,0.222222,0.0,0.75,687.0,3.0,43.0,3.0,0.004348,0.934783,0.995652,0.5,0.058904,8.488372,55,6625,6217,408,736,0.532865,8.106828,10


In [322]:
# Obliczenie średnich dla każdej kolumny
mean_values = all_results.select_dtypes(include=np.number).mean()
# Dodanie średnich jako nowego wiersza
mean_values['fold'] = 'mean'  # Oznaczenie wiersza ze średnimi

all_results = pd.concat([all_results, pd.DataFrame(mean_values).T], ignore_index=True)

##Decision Tree Classifier

In [323]:
# Ocena modelu DTC na całym zbiorze danych
dtc.fit(X, Y, sample_weight = np.array([class_weight_dict[label] for label in Y]))
dtc_results = model_validation(dtc, X, Y, risk_thresholds)

In [324]:
dtc_results['fold'] = 'final'
dtc_results['features'] = X.shape[1]
dtc_results['train_size'] = len(Y)
dtc_results['class_0_train_size'] = (Y == 0).sum()
dtc_results['class_1_train_size'] = (Y == 1).sum()
dtc_results['test_size'] = len(Y)
dtc_results['class_weight_0'] = class_weight_dict[0]
dtc_results['class_weight_1'] = class_weight_dict[1]

dtc_all_results = pd.concat([all_results, dtc_results], ignore_index=True)

##Feature validation

###Mean Decrease Impurity

In [325]:
importances = dtc.feature_importances_

# Sortowanie cech według ważności
sorted_indices = importances.argsort()[::-1]
data = {'feature': [X.columns[index] for index in sorted_indices],
        'Mean_Decrease_Impurity': [importances[index] for index in sorted_indices]}

importances_df = pd.DataFrame(data)

###Permutation Importance


In [326]:
import eli5
from eli5.sklearn import PermutationImportance

# Obliczenie Permutation Importance
perm = PermutationImportance(dtc, random_state=42).fit(X, Y)

# Pobranie wyników jako DataFrame
perm_df = eli5.explain_weights_df(perm, feature_names=list(X.columns))
# Zmiana nazw kolumn
perm_df = perm_df.rename(columns={'weight': 'Permutation_Importance_weight', 'std': 'Permutation_Importance_std'})

perm_df.drop(['Permutation_Importance_std'], inplace=True,	 axis=1, errors='ignore')

###Mean Decrease Accuracy

In [327]:
# Obliczenie Mean Decrease Accuracy za pomocą cross_val_score
def mean_decrease_accuracy(model, X, y):
    baseline_accuracy = cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()

    feature_importances = {}
    for feature in X.columns:
        X_permuted = X.copy()
        X_permuted[feature] = np.random.permutation(X[feature].values)
        permuted_accuracy = cross_val_score(model, X_permuted, y, cv=5, scoring='accuracy').mean()
        feature_importances[feature] = baseline_accuracy - permuted_accuracy

    return feature_importances

# Obliczenie Mean Decrease Accuracy dla cech
mda_results = mean_decrease_accuracy(dtc, X, Y)


In [328]:
mda_df = pd.DataFrame(list(mda_results.items()), columns=['feature', 'Mean_Decrease_Accuracy'])

In [329]:
merged_df = importances_df.merge(mda_df, on='feature', how='left')
dtc_feature_validation_df = merged_df.merge(perm_df, on='feature', how='left')

##Nodes info

In [361]:
# Pobranie atrybutów drzewa
tree = dtc.tree_

# Pobranie ścieżki decyzji dla każdej próbki
node_indicator = dtc.decision_path(X_train)
leaf_indices = dtc.apply(X_train)

# Funkcja do obliczania głębokości węzłów
def compute_node_depths(tree):
    def get_depth(node_id, current_depth):
        depths[node_id] = current_depth
        if tree.children_left[node_id] != -1:
            get_depth(tree.children_left[node_id], current_depth + 1)
        if tree.children_right[node_id] != -1:
            get_depth(tree.children_right[node_id], current_depth + 1)

    depths = np.zeros(tree.node_count, dtype=np.int32)
    get_depth(0, 0)
    return depths

from sklearn.tree import _tree
def get_feature_name(tree, node_id, feature_names):
    """
    Zwraca nazwę cechy użytej do podziału w węźle.
    Jeśli węzeł jest liściem, zwraca 'Leaf'.
    """
    if tree.feature[node_id] != _tree.TREE_UNDEFINED:
        feature_index = tree.feature[node_id]
        return feature_names[feature_index]
    else:
        return 'Leaf'

# Obliczenie głębokości węzłów
node_depths = compute_node_depths(tree)

# Lista do przechowywania informacji o węzłach
nodes_info = []


# Iterowanie przez węzły
for node_id in range(tree.node_count):
    # Pobranie wskaźnika Gini
    gini = tree.impurity[node_id]

    # Pobranie liczby próbek w węźle
    samples = tree.n_node_samples[node_id]

    # Pobranie wartości dla każdej klasy
    values = tree.value[node_id][0]

    # Pobranie głębokości węzła
    depth = node_depths[node_id]

    # Sprawdzenie, czy węzeł jest liściem
    is_leaf = (tree.children_left[node_id] == -1) and (tree.children_right[node_id] == -1)

    # Pobranie cechy użytej do podziału (jeśli węzeł nie jest liściem)
    feature = get_feature_name(tree, node_id, X.columns)

    # Pobranie progu podziału (jeśli węzeł nie jest liściem)
    threshold = tree.threshold[node_id] if not is_leaf else -1

    # Utworzenie słownika z informacjami o węźle
    node_info = {
        'Node ID': node_id,
        'Gini': gini,
        'Samples': samples,
        'Values (Weighted)': values,
        'Sum of Values': sum(values),
        'Class Counts (Unweighted)': np.round(values / class_weights),
        'Depth': depth,
        'Is Leaf': is_leaf,
        'Feature': feature,
        'Threshold': threshold,
        'Children Left': tree.children_left[node_id],
        'Children Right': tree.children_right[node_id]
    }

    # Dodanie słownika do listy
    nodes_info.append(node_info)

# Utworzenie DataFrame z listy słowników
dtc_nodes_info_df = pd.DataFrame(nodes_info)

In [362]:
dtc_nodes_info_df.head(5)

Unnamed: 0,Node ID,Gini,Samples,Values (Weighted),Sum of Values,Class Counts (Unweighted),Depth,Is Leaf,Feature,Threshold,Children Left,Children Right
0,0,0.5,7361,"[3680.499999999782, 3680.5000000000455]",7361.0,"[6907.0, 454.0]",0,False,Method_Hanging,0.5,1,258
1,1,0.406172,6828,"[3546.217967279363, 1402.4812775330397]",4948.699245,"[6655.0, 173.0]",1,False,Place_Railway,0.5,2,239
2,2,0.334278,6670,"[3492.398581149354, 940.3920704845815]",4432.790652,"[6554.0, 116.0]",2,False,Method_Jumping,0.5,3,194
3,3,0.245553,6252,"[3295.2384537423895, 551.2643171806168]",3846.502771,"[6184.0, 68.0]",3,False,Method_Poisoning,0.5,4,189
4,4,0.225323,6201,"[3272.325249746445, 486.40969162995594]",3758.734941,"[6141.0, 60.0]",4,False,Method_Other,0.5,5,160


##Path to Leaf info

In [332]:
# Funkcja do uzyskania ścieżek do każdego liścia
def get_leaf_paths(tree, feature_names=None):
    tree_ = tree.tree_
    feature = tree_.feature
    threshold = tree_.threshold
    children_left = tree_.children_left
    children_right = tree_.children_right
    impurity = tree_.impurity
    n_node_samples = tree_.n_node_samples
    value = tree_.value

    leaf_info = []

    def recurse(node, path_conditions):
        if children_left[node] == children_right[node]:  # Jeśli jest to liść
            class_counts = value[node][0]
            leaf_class = np.argmax(class_counts)
            leaf_details = {
                'node_id': node,
                'gini': impurity[node],
                'samples': n_node_samples[node],
                '(1-gini)*samples': (1-impurity[node]) * n_node_samples[node],
                'leaf_class': leaf_class
            }
            leaf_details.update(path_conditions)
            leaf_info.append(leaf_details)
        else:
            if feature_names is not None:
                feature_name = feature_names[feature[node]]
            else:
                feature_name = feature[node]

            left_path_conditions = path_conditions.copy()
            left_path_conditions[feature_name] = 0
            recurse(children_left[node], left_path_conditions)

            right_path_conditions = path_conditions.copy()
            right_path_conditions[feature_name] = 1
            recurse(children_right[node], right_path_conditions)

    initial_conditions = {feature: np.nan for feature in feature_names}
    recurse(0, initial_conditions)

    return pd.DataFrame(leaf_info)

In [333]:
# Uzyskanie nazw cech
feature_names = X_train.columns

# Uzyskanie ścieżek do każdego liścia
leaf_info_df = get_leaf_paths(dtc, feature_names)

In [334]:
leaf_info_df.head(5)

Unnamed: 0,node_id,gini,samples,(1-gini)*samples,leaf_class,Context_Finances,Context_CloseDeath,Context_FamilyConflict,Context_Disability,Context_HeartBreak,Context_Crime,Context_SchoolWork,Context_MentalHealth,Context_Other,Context_HealthLoss,Income_Benefits,Income_Dependent,Income_NoSteady,Income_Steady,Method_Drowning,Method_Drugs,Method_Gas,Method_Hanging,Method_Jumping,Method_Other,Method_Poisoning,Method_Schooting,Method_Self-harm,Method_Vehicle,Education_Pre-primary,Education_Primary,Education_Secondary,Education_Vocational,AbuseInfo_Alco,AbuseInfo_Alco&OtherSub,AbuseInfo_Not,AbuseInfo_OtherSub,WorkInfo_Employed,WorkInfo_Student,WorkInfo_Unemployed,Substance_Alco,Substance_Alco&OtherSub,Substance_OtherSub,Substance_Sober,Place_Forest,Place_House,Place_Institution,Place_Isolation,Place_Other,Place_PoliceArmy,Place_Railway,Place_Road,Place_School,Place_UtilitySpaces,Place_WaterRes,Place_Work,Marital_Cohabitant,Marital_Married,Marital_Single,Marital_Single.1
0,9,0.3796867,31,19.229713,1,,,,,,,,,,,,,,,0.0,0.0,,0,0.0,0.0,0.0,,0.0,,0.0,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,
1,10,-1.088019e-14,14,14.0,0,,,,,,,,,,,,,,,0.0,0.0,,0,0.0,0.0,0.0,,0.0,,1.0,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,
2,22,0.0,87,87.0,0,,,0.0,,0.0,,0.0,,,,,,0.0,,0.0,1.0,,0,0.0,0.0,0.0,,0.0,,0.0,,,,,,,,,,0.0,,,0.0,,0.0,0.0,,,,,0.0,0.0,,,,,,,,0.0
3,27,0.3904033,43,26.212658,0,,,0.0,,0.0,,0.0,0.0,0.0,,,,0.0,,0.0,1.0,,0,0.0,0.0,0.0,,0.0,,0.0,0.0,,,,,,,,,0.0,0.0,,0.0,,0.0,1.0,,,,,0.0,0.0,,,,,,,,0.0
4,28,0.4416565,96,53.600976,0,,,0.0,,0.0,,0.0,0.0,0.0,,,,0.0,,0.0,1.0,,0,0.0,0.0,0.0,,0.0,,0.0,1.0,,,,,,,,,0.0,0.0,,0.0,,0.0,1.0,,,,,0.0,0.0,,,,,,,,0.0


## Decission Tree graph

###graphviz

In [359]:
# Eksportowanie drzewa do pliku DOT
export_graphviz(dtc, out_file='TREE_graphviz.dot', feature_names=X.columns, filled=True, rounded=True, class_names={0:'NotFatal',1:'Fatal'}, special_characters=True, impurity=True, node_ids=True, rotate=True)

# Konwersja pliku DOT na obraz
with open('TREE_graphviz.dot') as f:
    dot_graph = f.read()

#RFC

In [336]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight

# Definicja siatki parametrów do RandomForestClassifier (opcjonalnie)
param_grid = {
    'n_estimators': 100,
    'max_features': 'sqrt',
    'max_depth': None,
    'min_samples_split': 10,
    'min_samples_leaf': 10
}

##Cross-validation

In [337]:
# Stratified k-fold cross-validation
skf = StratifiedKFold(n_splits=2)
all_models = []
class_weights_all = []
all_results = pd.DataFrame()

for fold, (train_index, test_index) in enumerate(skf.split(X, Y), 1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Oblicz wagi klas
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}
    class_weights_all.append(class_weights_dict)

    # Inicjalizacja modelu RandomForestClassifier
    rf = RandomForestClassifier(random_state=42, **param_grid, class_weight=class_weights_dict)

    # Trenowanie modelu na zestawie treningowym
    rf.fit(X_train, y_train)

    # Dodanie do listy wytrenowanych modeli
    all_models.append(rf)

    # Walidacja modelu na danych testowych
    model_results = model_validation(rf, X_test, y_test, risk_thresholds)

    model_results['features'] = X_train.shape[1]
    model_results['train_size'] = len(y_train)
    model_results['class_0_train_size'] = (y_train == 0).sum()
    model_results['class_1_train_size'] = (y_train == 1).sum()
    model_results['test_size'] = len(y_test)
    model_results['class_weight_0'] = class_weight_dict[0]
    model_results['class_weight_1'] = class_weight_dict[1]
    model_results['fold'] = str(fold)

    # Dodanie wyników do ramki danych
    all_results = pd.concat([all_results, model_results], ignore_index=True)

In [338]:
# Obliczenie średnich dla każdej kolumny
mean_values = all_results.select_dtypes(include=np.number).mean()
# Dodanie średnich jako nowego wiersza
mean_values['fold'] = 'mean'  # Oznaczenie wiersza ze średnimi

all_results = pd.concat([all_results, pd.DataFrame(mean_values).T], ignore_index=True)

In [339]:
all_results

Unnamed: 0,recall,accuracy,precision,f1,precision_0,recall_0,f1_0,Support_0,precision_1,recall_1,f1_1,Support_1,class_0_pred_0.5,class_1_pred_0.5,AUROC_0.5,AUROClow_0.5,AUROCup_0.5,tn_0.5,fp_0.5,fn_0.5,tp_0.5,fpr_0.5,fnr_0.5,specificity_0.5,high_risk_positive_rate_0.5,low_risk_positive_rate_0.5,risk_ratio_0.5,class_0_pred_0.9,class_1_pred_0.9,AUROC_0.9,AUROClow_0.9,AUROCup_0.9,tn_0.9,fp_0.9,fn_0.9,tp_0.9,fpr_0.9,fnr_0.9,specificity_0.9,high_risk_positive_rate_0.9,low_risk_positive_rate_0.9,risk_ratio_0.9,class_0_pred_0.95,class_1_pred_0.95,AUROC_0.95,AUROClow_0.95,AUROCup_0.95,tn_0.95,fp_0.95,fn_0.95,tp_0.95,fpr_0.95,fnr_0.95,specificity_0.95,high_risk_positive_rate_0.95,low_risk_positive_rate_0.95,risk_ratio_0.95,class_0_pred_0.99,class_1_pred_0.99,AUROC_0.99,AUROClow_0.99,AUROCup_0.99,tn_0.99,fp_0.99,fn_0.99,tp_0.99,fpr_0.99,fnr_0.99,specificity_0.99,high_risk_positive_rate_0.99,low_risk_positive_rate_0.99,risk_ratio_0.99,features,train_size,class_0_train_size,class_1_train_size,test_size,class_weight_0,class_weight_1,fold
0,0.873404,0.873404,0.949984,0.899615,0.991771,0.872322,0.928219,3454.0,0.314152,0.889868,0.464368,227.0,3038.0,643.0,0.787314,0.75011,0.82419,3013.0,441.0,25.0,202.0,0.127678,0.110132,0.872322,0.314152,0.008229,38.175801,3617.0,64.0,0.456821,0.312214,0.620323,3437.0,17.0,180.0,47.0,0.004922,0.792952,0.995078,0.734375,0.049765,14.756858,3681.0,0.0,,,,,,,,,,,,,,3681.0,0.0,,,,,,,,,,,,,,55.0,3680.0,3453.0,227.0,3681.0,0.532865,8.106828,1
1,0.934511,0.934511,0.953538,0.941514,0.984611,0.944975,0.964386,3453.0,0.480874,0.77533,0.593592,227.0,3314.0,366.0,0.762336,0.713241,0.811501,3263.0,190.0,51.0,176.0,0.055025,0.22467,0.944975,0.480874,0.015389,31.247402,3634.0,46.0,0.465465,0.278181,0.643951,3444.0,9.0,190.0,37.0,0.002606,0.837004,0.997394,0.804348,0.052284,15.384211,3680.0,0.0,,,,,,,,,,,,,,3680.0,0.0,,,,,,,,,,,,,,55.0,3681.0,3454.0,227.0,3680.0,0.532865,8.106828,2
2,0.903957,0.903957,0.951761,0.920564,0.988191,0.908649,0.946303,3453.5,0.397513,0.832599,0.52898,227.0,3176.0,504.5,0.774825,0.731675,0.817846,3138.0,315.5,38.0,189.0,0.091351,0.167401,0.908649,0.397513,0.011809,34.711601,3625.5,55.0,0.461143,0.295198,0.632137,3440.5,13.0,185.0,42.0,0.003764,0.814978,0.996236,0.769361,0.051024,15.070534,3680.5,0.0,,,,,,,,,,,,,,3680.5,0.0,,,,,,,,,,,,,,55.0,3680.5,3453.5,227.0,3680.5,0.532865,8.106828,mean


##Random Forest Classifier

In [340]:
# Uśrednienie wag klas
avg_class_weights = {}
for class_id in range(len(np.unique(Y))):
    avg_weight = np.mean([weights[class_id] for weights in class_weights_all])
    avg_class_weights[class_id] = avg_weight

# Połączenie wszystkich modeli w jeden model
final_rf = RandomForestClassifier(random_state=42, **param_grid, class_weight=avg_class_weights)

# Łączenie drzew z poszczególnych modeli
combined_estimators = []
for model in all_models:
    for tree in model.estimators_:
        combined_estimators.append(tree)

final_rf.estimators_ = combined_estimators

# Ostateczne dopasowanie modelu do całego zestawu danych
final_rf.fit(X, Y)

rf_results = model_validation(rf, X, Y, risk_thresholds)

In [341]:
rf_results['fold'] = 'final'
rf_results['features'] = X.shape[1]
rf_results['train_size'] = len(Y)
rf_results['class_0_train_size'] = (Y == 0).sum()
rf_results['class_1_train_size'] = (Y == 1).sum()
rf_results['test_size'] = len(Y)
rf_results['class_0_weight'] = avg_class_weights[0]
rf_results['class_1_weight'] = avg_class_weights[1]

rf_all_results = pd.concat([all_results, rf_results], ignore_index=True)

##Feature validation

###Mean Decrease Impurity

In [342]:
importances = rf.feature_importances_

# Sortowanie cech według ważności
sorted_indices = importances.argsort()[::-1]
data = {'feature': [X.columns[index] for index in sorted_indices],
        'Mean_Decrease_Impurity': [importances[index] for index in sorted_indices]}

importances_df = pd.DataFrame(data)

###Permutation Importance


In [343]:
import eli5
from eli5.sklearn import PermutationImportance

# Obliczenie Permutation Importance
perm = PermutationImportance(dtc, random_state=42).fit(X, Y)

# Pobranie wyników jako DataFrame
perm_df = eli5.explain_weights_df(perm, feature_names=list(X.columns))
# Zmiana nazw kolumn
perm_df = perm_df.rename(columns={'weight': 'Permutation_Importance_weight', 'std': 'Permutation_Importance_std'})

perm_df.drop(['Permutation_Importance_std'], inplace=True,	 axis=1, errors='ignore')

###Mean Decrease Accuracy

In [344]:
# Obliczenie Mean Decrease Accuracy dla cech
mda_results = mean_decrease_accuracy(rf, X, Y)


In [345]:
mda_df = pd.DataFrame(list(mda_results.items()), columns=['feature', 'Mean_Decrease_Accuracy'])

In [346]:
merged_df = importances_df.merge(mda_df, on='feature', how='left')
rf_feature_validation_df = merged_df.merge(perm_df, on='feature', how='left')

In [347]:
X.columns

Index(['Context_Finances', 'Context_CloseDeath', 'Context_FamilyConflict',
       'Context_Disability', 'Context_HeartBreak', 'Context_Crime',
       'Context_SchoolWork', 'Context_MentalHealth', 'Context_Other',
       'Context_HealthLoss', 'Income_Benefits', 'Income_Dependent',
       'Income_NoSteady', 'Income_Steady', 'Method_Drowning', 'Method_Drugs',
       'Method_Gas', 'Method_Hanging', 'Method_Jumping', 'Method_Other',
       'Method_Poisoning', 'Method_Schooting', 'Method_Self-harm',
       'Method_Vehicle', 'Education_Pre-primary', 'Education_Primary',
       'Education_Secondary', 'Education_Vocational', 'AbuseInfo_Alco',
       'AbuseInfo_Alco&OtherSub', 'AbuseInfo_Not', 'AbuseInfo_OtherSub',
       'WorkInfo_Employed', 'WorkInfo_Student', 'WorkInfo_Unemployed',
       'Substance_Alco', 'Substance_Alco&OtherSub', 'Substance_OtherSub',
       'Substance_Sober', 'Place_Forest', 'Place_House', 'Place_Institution',
       'Place_Isolation', 'Place_Other', 'Place_PoliceArmy', '

#Feature Count in risk groups

In [348]:
# Utworzenie pustego DataFrame do przechowywania wyników
feature_count_df = pd.DataFrame()

# Przewidywanie prawdopodobieństw
pred_prob = rf.predict_proba(X)[:, 1]

for threshold in risk_thresholds:
    # Wybranie przykładów o wysokim ryzyku
    high_risk_idx = pred_prob >= threshold
    high_risk_X = X[high_risk_idx]

    if not high_risk_X.empty:
        # Obliczenie wystąpienia każdej cechy w grupach wysokiego ryzyka
        high_risk_features_sum = high_risk_X.sum()

        # Dodanie kolumny threshold do wyników
        high_risk_features_sum['threshold'] = threshold

        high_risk_features_sum['samples'] = high_risk_X.shape[0]

        # Dodanie wyników do feature_count_df
        feature_count_df = pd.concat([feature_count_df, high_risk_features_sum.to_frame().T], ignore_index=True)
    else:
        print(f"Brak przykładów o wysokim ryzyku dla progu {threshold}")



Brak przykładów o wysokim ryzyku dla progu 0.95
Brak przykładów o wysokim ryzyku dla progu 0.99


In [349]:
feature_count_df

Unnamed: 0,Context_Finances,Context_CloseDeath,Context_FamilyConflict,Context_Disability,Context_HeartBreak,Context_Crime,Context_SchoolWork,Context_MentalHealth,Context_Other,Context_HealthLoss,Income_Benefits,Income_Dependent,Income_NoSteady,Income_Steady,Method_Drowning,Method_Drugs,Method_Gas,Method_Hanging,Method_Jumping,Method_Other,Method_Poisoning,Method_Schooting,Method_Self-harm,Method_Vehicle,Education_Pre-primary,Education_Primary,Education_Secondary,Education_Vocational,AbuseInfo_Alco,AbuseInfo_Alco&OtherSub,AbuseInfo_Not,AbuseInfo_OtherSub,WorkInfo_Employed,WorkInfo_Student,WorkInfo_Unemployed,Substance_Alco,Substance_Alco&OtherSub,Substance_OtherSub,Substance_Sober,Place_Forest,Place_House,Place_Institution,Place_Isolation,Place_Other,Place_PoliceArmy,Place_Railway,Place_Road,Place_School,Place_UtilitySpaces,Place_WaterRes,Place_Work,Marital_Cohabitant,Marital_Married,Marital_Single,Marital_Single.1,threshold,samples
0,1.0,8.0,71.0,0.0,68.0,0.0,46.0,212.0,74.0,2.0,1.0,830.0,35.0,5.0,8.0,0.0,1.0,533.0,160.0,49.0,7.0,1.0,0.0,112.0,140.0,576.0,150.0,5.0,5.0,1.0,853.0,12.0,4.0,851.0,16.0,131.0,10.0,240.0,490.0,93.0,387.0,44.0,2.0,118.0,0.0,134.0,30.0,9.0,37.0,16.0,1.0,0.0,4.0,713.0,154.0,0.5,871.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,117.0,0.0,2.0,0.0,0.0,0.0,119.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113.0,6.0,0.0,0.0,0.0,118.0,1.0,2.0,117.0,0.0,16.0,0.0,46.0,57.0,24.0,77.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,3.0,102.0,14.0,0.9,119.0


#Leaf

In [350]:
# Wypełnienie brakujących wartości (nan) średnią wartością cechy z X_train
suicidal_profiles = leaf_info_df.copy()
suicidal_profiles = suicidal_profiles[feature_names]

# Iteracja po każdym wierszu suicidal_profiles
for index, profile in suicidal_profiles.iterrows():
    # Wybór kolumn, które nie mają NaN
    valid_columns = profile.dropna().index
    profile_values = profile.dropna().values

    # Filtrowanie X, aby zawierał tylko obserwacje z takimi samymi wartościami w tych kolumnach
    filtered_X = X.copy()
    for col, val in zip(valid_columns, profile_values):
        filtered_X = filtered_X[filtered_X[col] == val]

    if not filtered_X.empty:
        # Obliczenie prawdopodobieństw klas dla liści
        dtc_probabilities = dtc.predict_proba(filtered_X)
        dtc_average_probabilities = np.mean(dtc_probabilities, axis=0)

        rf_probabilities = rf.predict_proba(filtered_X)
        rf_average_probabilities = np.mean(rf_probabilities, axis=0)

        # Dodanie prawdopodobieństw klas do DataFrame
        for class_idx in range(dtc_probabilities.shape[1]):
            leaf_info_df.loc[index, f'dtc_class_{class_idx}_probability'] = dtc_average_probabilities[class_idx]

        for class_idx in range(rf_probabilities.shape[1]):
            leaf_info_df.loc[index, f'rf_class_{class_idx}_probability'] = rf_average_probabilities[class_idx]

In [351]:
# Sortowanie najpierw po 'leaf_class' malejąco, a potem po '(1-gini)*samples' malejąco
leaf_info_df = leaf_info_df.sort_values(by=['leaf_class', '(1-gini)*samples'], ascending=[False, False])

In [352]:
leaf_info_df['(1-gini)*samples/all_samples'] = leaf_info_df['(1-gini)*samples']/X.shape[0]

In [353]:
leaf_info_df.head(5)

Unnamed: 0,node_id,gini,samples,(1-gini)*samples,leaf_class,Context_Finances,Context_CloseDeath,Context_FamilyConflict,Context_Disability,Context_HeartBreak,Context_Crime,Context_SchoolWork,Context_MentalHealth,Context_Other,Context_HealthLoss,Income_Benefits,Income_Dependent,Income_NoSteady,Income_Steady,Method_Drowning,Method_Drugs,Method_Gas,Method_Hanging,Method_Jumping,Method_Other,Method_Poisoning,Method_Schooting,Method_Self-harm,Method_Vehicle,Education_Pre-primary,Education_Primary,Education_Secondary,Education_Vocational,AbuseInfo_Alco,AbuseInfo_Alco&OtherSub,AbuseInfo_Not,AbuseInfo_OtherSub,WorkInfo_Employed,WorkInfo_Student,WorkInfo_Unemployed,Substance_Alco,Substance_Alco&OtherSub,Substance_OtherSub,Substance_Sober,Place_Forest,Place_House,Place_Institution,Place_Isolation,Place_Other,Place_PoliceArmy,Place_Railway,Place_Road,Place_School,Place_UtilitySpaces,Place_WaterRes,Place_Work,Marital_Cohabitant,Marital_Married,Marital_Single,Marital_Single.1,dtc_class_0_probability,dtc_class_1_probability,rf_class_0_probability,rf_class_1_probability,(1-gini)*samples/all_samples
137,282,0.073981,37,34.262693,1,,,0.0,,0.0,,0.0,0.0,0.0,,,,,,,,,1,,,,,,,,1.0,,,,,,,,,,0.0,,0.0,,,1.0,0.0,,,,,,,,,,,,1.0,,0.038471,0.961529,0.081791,0.918209,0.004655
123,249,0.045598,30,28.632056,1,,,,,,,,0.0,,,,,,,,,,0,,,,,,1.0,,,,,,,,,,,,,,,0.0,,,,,,,1.0,,,,,,,,,,0.023344,0.976656,0.163053,0.836947,0.00389
150,305,0.129046,32,27.870517,1,,,,,,,,1.0,,,,,,,,,,1,,,,,,,,1.0,0.0,,,,,,,,,0.0,,0.0,,,1.0,0.0,,,,,,,,,,,,,0.0,0.06933,0.93067,0.171345,0.828655,0.003786
156,317,0.025614,24,23.385257,1,,,,,0.0,,,0.0,,,,,,,,,,1,,,,,,,,1.0,,,,,,,,,,,,1.0,,,1.0,0.0,,0.0,,,,,0.0,,,,,,,0.012976,0.987024,0.077144,0.922856,0.003177
126,254,0.115745,26,22.990634,1,,,,,,,,0.0,,,,,,,,,,0,,,,,,1.0,,1.0,,,,,,,,,,,,,1.0,,,,,,,1.0,,,,,,,,,,0.061676,0.938324,0.14754,0.85246,0.003123


#Saveing data

In [354]:
!ls

'MV_age_group_[00-18]_gender_['\''F'\'']_year_[2013-2023].xlsx'
 sample_data
'TREE_graphviz_age_group_[00-18]_gender_['\''F'\'']_year_[2013-2023]'
'TREE_graphviz_age_group_[00-18]_gender_['\''F'\'']_year_[2013-2023].svg'
 TREE_graphviz.dot


In [355]:
for file in os.listdir():
    if os.path.isfile(file):
        os.remove(file)

####Saveing params

In [356]:
file_name = "MV_" + file_title + ".xlsx"

In [357]:
with pd.ExcelWriter(file_name, engine='openpyxl') as writer:
    # DTC
    dtc_all_results.to_excel(writer, sheet_name='DTC Model Validation', index=False)
    dtc_feature_validation_df.to_excel(writer, sheet_name='DTC Feature Validations', index=False)

    # RF
    rf_all_results.to_excel(writer, sheet_name='RFC Model Validation', index=False)
    rf_feature_validation_df.to_excel(writer, sheet_name='RFC Feature Validation', index=False)

    # nodes_info_df
    dtc_nodes_info_df.to_excel(writer, sheet_name='DTC Nodes Info', index=False)
    # leaf_info_df
    leaf_info_df.to_excel(writer, sheet_name='DTC Leaf Info', index=False)
    # feature_count_df
    feature_count_df.to_excel(writer, sheet_name='RFC Group Risk', index=False)






# Pobierz plik Excel
files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

####Saveing tree graphviz

In [358]:
file_name = "TREE_graphviz_" + file_title
graphviz.Source(dot_graph)
graphviz.Source(dot_graph).render(file_name, format="svg")
files.download(file_name + '.svg')

Error: not well-formed (invalid token) in line 1 
... <br/>Substance_Alco&OtherSub &le; 0.5 ...
in label of node 120


CalledProcessError: Command '[PosixPath('dot'), '-Kdot', '-Tsvg', '-O', "TREE_graphviz_age_group_[00-18]_gender_['F']_year_[2013-2023]"]' returned non-zero exit status 1. [stderr: b'Error: not well-formed (invalid token) in line 1 \n... <br/>Substance_Alco&OtherSub &le; 0.5 ...\nin label of node 120\n']