In [66]:
import pandas as pd
import numpy as np
from instructions import load_df_from_pkl, save_df_as_pkl
from handling_outliers import removing_iqr, removing_percentiles, zscore_outlier, modified_z_score_outlier, count_outliers, mask_outliers, replace_missing_values
import os


# Locate the raw-data folder relative to the parent of the current working
# directory. The path components are passed separately so os.path.join picks
# the right separator on every OS (a literal 'data\\raw' only works on Windows).
path = os.path.split(os.getcwd())
data_directory = os.path.join(path[0], 'data', 'raw')
data_directory

'c:\\Users\\Marta\\Desktop\\Studia\\CDV\\IV semestr 2022L\\Wykorzystanie Pythona w uczeniu maszynowym\\ml_project\\project\\ML_PROJECT_2022\\data\\raw'

In [67]:
# Read the target ('y') first and then the feature matrix ('X') from the
# raw-data folder, using the project's pickle loader for both.
y, X = (load_df_from_pkl(data_directory, name) for name in ('y', 'X'))

In [68]:
# Display label -> outlier-detection routine from handling_outliers.
# Insertion order is the order in which the methods are compared below.
outliers_methods_dict = dict([
    ("Removing 0.1 & 0.9", removing_percentiles),
    ("IQR", removing_iqr),
    ("Z-score", zscore_outlier),
    ("Modified Z-score", modified_z_score_outlier),
])

In [69]:
from sklearn.preprocessing import StandardScaler

# Standardize a copy of the features so the raw frame X is never touched.
X_copy_3 = X.copy()
std = StandardScaler()
# fit(...).transform(...) on the same data is what fit_transform does.
X_std = std.fit(X_copy_3).transform(X_copy_3)

In [36]:
# Run every candidate detector on the standardized features and report how
# much of the dataset each one flags. Masking and imputation are done only
# for the method chosen afterwards, so they are not part of this comparison.
for method_name, method in outliers_methods_dict.items():
    X_rem = method(pd.DataFrame(X_std))
    counted = count_outliers(X_rem)
    # Total number of cells (rows x columns) in the detector's result.
    n_cells = X_rem.shape[0] * X_rem.shape[1]
    # presumably counted[0] holds the outlier counts being summed here --
    # TODO confirm against count_outliers.
    contamination = counted[0].sum() / n_cells * 100
    print(
        method_name,
        counted[1],
        f'Dataset contamination: {round(contamination, 2)} %',
        '_' * 10,
        sep='\n',
    )

Removing 0.1 & 0.9
(0       2028
1       2013
2       1942
3       1951
4       1997
        ... 
3745    1899
3746    1969
3747    1977
3748    1899
3749    1975
Length: 3750, dtype: int64,            0
1806  0.2131
2422  0.2130
1692  0.2128
1905  0.2126
2410  0.2124
...      ...
2906  0.1873
207   0.1873
1441  0.1867
338   0.1856
662   0.1851

[3750 rows x 1 columns])
Dataset contamination: 20.0 %
__________
IQR
(0       82
1       77
2       51
3       62
4       80
        ..
3745    68
3746    65
3747    79
3748    78
3749    68
Length: 3750, dtype: int64,            0
3667  0.0105
2844  0.0099
2524  0.0098
2820  0.0097
3458  0.0096
...      ...
3204  0.0048
7     0.0048
909   0.0047
3594  0.0046
345   0.0046

[3750 rows x 1 columns])
Dataset contamination: 0.71 %
__________
Z-score
(0       30
1       30
2       16
3       28
4       36
        ..
3745    27
3746    30
3747    24
3748    32
3749    23
Length: 3750, dtype: int64,            0
3185  0.0047
479   0.0046
3667  0.0045

In [70]:
# Apply the IQR detector to the standardized features, then mask what it
# flagged and fill the resulting gaps.
X_rem = removing_iqr(pd.DataFrame(X_std))
counted = count_outliers(X_rem)
# Total number of cells (rows x columns) in the detector's result.
n_cells = X_rem.shape[0] * X_rem.shape[1]
contamination = counted[0].sum() / n_cells * 100
masked = mask_outliers(pd.DataFrame(X_std), X_rem)
# NOTE(review): the meaning of the literal 5 is defined by
# replace_missing_values and is not visible from this notebook -- confirm.
replaced = replace_missing_values(masked, 5)
print(
    counted,
    f'Dataset contamination: {round(contamination, 2)} %',
    '_' * 10,
    sep='\n',
)

(0       82
1       77
2       51
3       62
4       80
        ..
3745    68
3746    65
3747    79
3748    78
3749    68
Length: 3750, dtype: int64,            0
3667  0.0105
2844  0.0099
2524  0.0098
2820  0.0097
3458  0.0096
...      ...
3204  0.0048
7     0.0048
909   0.0047
3594  0.0046
345   0.0046

[3750 rows x 1 columns])
Dataset contamination: 0.71 %
__________


In [38]:
# Persist the outlier-masked, imputed features (`replaced`) into the raw-data
# folder under the name 'X_preprocessed', using the project's pickle helper.
save_df_as_pkl(replaced, data_directory, 'X_preprocessed')

### Baseline

In [71]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import balanced_accuracy_score, f1_score
from sklearn.model_selection import cross_val_score, cross_val_predict


# Baseline: a decision tree with a fixed seed, evaluated on the untouched
# features X. Despite its name, dummy_clf is a real DecisionTreeClassifier.
dummy_clf = DecisionTreeClassifier(random_state=42)

# Out-of-fold predictions from 5-fold cross-validation.
y_pred = cross_val_predict(estimator=dummy_clf, X=X, y=y, cv=5)

In [44]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=y, y_pred=y_pred)
cm

array([[ 264,  111],
       [  91, 3284]], dtype=int64)

In [72]:
# Import every metric this cell calls. roc_auc_score was previously imported
# only in a later cell and f1_score only in an earlier one, so this cell
# raised NameError or depended on hidden kernel state on a fresh run.
from sklearn.metrics import (
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Scores for the baseline's out-of-fold predictions.
# precision and recall use sklearn's default positive class; f1 is the
# support-weighted average over both classes.
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
# `acc` is the balanced accuracy (mean of per-class recall), not plain accuracy.
acc = balanced_accuracy_score(y, y_pred)
f1 = f1_score(y, y_pred, average='weighted')
# NOTE(review): roc_auc_score is fed hard class labels here, not probabilities.
# For binary hard labels this equals balanced accuracy, so it adds no new
# information. Consider scoring predict_proba output instead.
roc_score = roc_auc_score(y, y_pred, average='weighted')

precision, recall, acc, f1, roc_score

(0.9673048600883652,
 0.973037037037037,
 0.8385185185185184,
 0.9454750005058578,
 0.8385185185185184)

In [73]:
# Same tree configuration as the baseline, but trained on the features after
# outlier masking and imputation (`replaced`), for a like-for-like comparison.
clf = DecisionTreeClassifier(random_state=42)
y_pred = cross_val_predict(estimator=clf, X=replaced, y=y, cv=5)
cm = confusion_matrix(y_true=y, y_pred=y_pred)
cm

array([[ 273,  102],
       [ 122, 3253]], dtype=int64)

In [74]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    balanced_accuracy_score,
    f1_score,
    roc_auc_score,
)

# Same metrics as the baseline cell, now for the model trained on the
# preprocessed features.
precision = precision_score(y_true=y, y_pred=y_pred)
recall = recall_score(y_true=y, y_pred=y_pred)
# `acc` is the balanced accuracy (mean of per-class recall).
acc = balanced_accuracy_score(y_true=y, y_pred=y_pred)
f1 = f1_score(y_true=y, y_pred=y_pred, average='weighted')
# roc_score is computed but is not part of the tuple displayed below.
roc_score = roc_auc_score(y, y_pred, average='weighted')

precision, recall, acc, f1

(0.9695976154992548, 0.9638518518518518, 0.845925925925926, 0.9409536674321222)