In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model  import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.under_sampling import RandomUnderSampler

In [None]:
def fpc(model, test_train_data):
    """Fit ``model``, plot its confusion matrix and print test-set metrics.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict``. It is fitted
        in place on the training split.
    test_train_data : tuple
        ``(X_train, X_test, y_train, y_test)``, in that order.

    Returns
    -------
    None
        The confusion matrix is shown as a figure; accuracy, precision,
        recall and F1 on the test split are printed.
    """
    X_train, X_test, y_train, y_test = test_train_data
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Take the label set from the fitted model so the matrix always has one
    # row/column per class and the tick labels match it, even when the test
    # split or the predictions happen to contain a single class. Fall back to
    # the binary 0/1 labels for estimators that do not expose `classes_`.
    labels = list(getattr(model, 'classes_', [0, 1]))
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.show()

    # The scorers are called with their scikit-learn defaults.
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Precision: ', precision_score(y_test, y_pred))
    print('Recall: ', recall_score(y_test, y_pred))
    print('f1_score: ', f1_score(y_test, y_pred))

In [None]:
# Location of the input CSV, relative to the notebook's working directory.
DATA_PATH = 'data/data_cleaned.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Box plot of the `err` column. The vector is passed by keyword because
# seaborn's interpretation of a positional vector differs between releases;
# `x=` pins a horizontal box with `err` on the x-axis.
sns.boxplot(x=data['err'], width=0.3)
plt.title('Distribution of err')
plt.show()

In [None]:
# Count of rows per value of the `present` column, drawn on the current axes.
target_ax = plt.gca()
sns.countplot(data=data, x='present', palette='hls', ax=target_ax)
plt.show()

In [None]:
# Features are every column except `present`; `present` is the label.
X = data.drop(columns='present')
Y = data['present']

# Seeded, stratified 70/30 split: the same rows land in each split on every
# run, and both splits keep the label proportions of the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42, stratify=Y
)
test_train_data = (X_train, X_test, y_train, y_test)

# 'balanced' weights, computed from the training labels only.
# `compute_class_weight` returns one weight per entry of `classes`, in the
# same order, so pair them explicitly instead of assuming which label
# `unique()` happens to list first.
classes = y_train.unique()
weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight = dict(zip(classes.tolist(), weights.tolist()))

In [None]:
# Class-weighted logistic regression, fitted and scored on the shared split.
log_reg = LogisticRegression(class_weight=class_weight)
fpc(log_reg, test_train_data)

In [None]:
# Class-weighted random forest, fitted and scored on the shared split.
rf_params = dict(
    n_estimators=100,
    max_depth=5,
    random_state=42,
    class_weight=class_weight,
)
fpc(RandomForestClassifier(**rf_params), test_train_data)

In [None]:
# Class-weighted linear model trained with SGD (modified Huber loss).
# SGD shuffles the training data, so it is seeded like the forest and
# gradient-boosting models to make the reported metrics repeatable.
sgd = SGDClassifier(loss='modified_huber', class_weight=class_weight, random_state=42)
fpc(sgd, test_train_data)

In [None]:
# Balance the classes by randomly dropping majority-class rows, but only in
# the training split. The model is then scored on the untouched X_test/y_test,
# so its metrics reflect the real class distribution and are directly
# comparable with the class-weighted models above.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
gb = GradientBoostingClassifier(learning_rate=0.1, max_depth=10, random_state=42)
fpc(gb, (X_resampled, X_test, y_resampled, y_test))