In [None]:
import numpy as np
import pandas as pd

In [None]:
# Relative path of the student-performance CSV that the next cell loads.
dataset_path = '../../datasets/student_performance.csv'

In [None]:
# Read the comma-separated dataset into a frame and preview its first rows.
df = pd.read_csv(dataset_path, sep=',')
df.head()

In [None]:
def two_str_value_to_int(df:pd.DataFrame, columns:list):
    """Encode yes/no string columns as integers (``'no'`` -> 0, ``'yes'`` -> 1).

    A column is converted only when it is purely yes/no: every value must read
    ``'yes'`` or ``'no'`` once lower-cased, and at least one ``'yes'`` must be
    present. Every other column (multi-category strings, numeric data, columns
    holding missing or unexpected values) is left exactly as it was.

    Args:
        df: Source frame. It is not modified; the encoding is applied to a copy.
        columns: Names of the candidate columns to inspect.

    Returns:
        A new DataFrame in which the qualifying columns hold integer 0/1 values.

    Raises:
        KeyError: If a name in ``columns`` is not a column of ``df``.
    """
    encoding = {'no': 0, 'yes': 1}
    # Work on a copy so the caller's frame is never altered as a side effect.
    encoded = df.copy()
    for column in columns:
        # Normalise once so that detection and mapping agree on letter case.
        normalized = encoded[column].map(lambda value: str(value).lower())
        # Require *all* values to be yes/no; otherwise a stray value would have
        # no integer code to map to.
        is_yes_no = normalized.isin(list(encoding)).all() and normalized.eq('yes').any()
        if is_yes_no:
            encoded[column] = normalized.map(encoding).astype(int)
    return encoded


In [None]:
# Candidate columns handed to two_str_value_to_int; the same list is reused
# below to preview the result.
str_columns = [
    "school",
    "sex",
    "address",
    "famsize",
    "Pstatus",
    "Mjob",
    "Fjob",
    "reason",
    "guardian",
    "schoolsup",
    "famsup",
    "paid",
    "activities",
    "nursery",
    "higher",
    "internet",
    "romantic",
]

# Rebind df to the encoder's return value, then show the first rows of the
# candidate columns.
df = two_str_value_to_int(df, str_columns)
df.loc[:, str_columns].head()

In [None]:
# Print the distinct values of every candidate column, one line per column.
for column in str_columns:
    uniques = list(df[column].unique())
    print(f'column = {column} || unique values = {uniques}')


In [None]:
# Dummy-encode the frame. No `columns` argument is passed, so pandas decides
# which columns to expand; later cells rely on the generated names
# (e.g. 'sex_F', 'sex_M', and the 'Fjob'-prefixed columns).
df = pd.get_dummies(df)

In [None]:
# Record the row count on either side of de-duplication and report both.
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]

print(
    f'Count before drop duplicates: {before}',
    f'Count after drop duplicates: {after}',
    sep='\n',
)

| Nota (0 a 10) | Nota GPA (Escala de 4.0) | Nota Letra |
|---------------------|--------------------------|------------|
| 9.0 a 10.0          | 4.0                      | A          |
| 8.0 a 8.9           | 3.0 - 3.9                | B          |
| 7.0 a 7.9           | 2.0 - 2.9                | C          |
| 6.0 a 6.9           | 1.0 - 1.9                | D          |
| 0 a 5.9             | 0.0                      | F          |


In [None]:
# Inspect the distinct G2 values; the PASS/FAIL target is derived from this column.
df['G2'].unique()

In [None]:
# Derive the target label from G2: the grade is truncated to an int, divided
# by 5, and labelled PASS when the quotient reaches 2 (i.e. int(G2) >= 10).
# NOTE(review): presumably G2 is on a 0-20 scale mapped to a 0-4 GPA — confirm.
df['GPA_result'] = df['G2'].map(
    lambda grade: 'FAIL' if int(grade) / 5 < 2 else 'PASS'
)

In [None]:
# df = df.drop(columns=['G1', 'G2', 'G3'])

In [None]:
# Summary statistics of the frame at this point (after encoding and target creation).
df.describe()

In [None]:
# Per-column dtypes, to check what the encoding steps produced.
df.dtypes

In [None]:
# Collect every column whose name starts with 'Fjob' and preview those columns.
columns = [name for name in df.columns if name.startswith('Fjob')]

df[columns].head()

In [None]:
# Feature columns selected for X. The order is kept unchanged because it
# determines the column order of X.
columns_to_X = [
    'age', 'sex_F', 'sex_M', 'paid', 'health',
    'goout', 'studytime', 'freetime', 'activities', 'internet',
    'romantic', 'famrel', 'absences', 'failures',
    # Candidates currently left out:
    #   'guardian_father', 'guardian_mother', 'guardian_other',
    #   'school_GP', 'school_MS',
    'schoolsup',
    # Candidates currently left out:
    #   'reason_course', 'reason_home', 'reason_other', 'reason_reputation',
]



In [None]:
# columns_to_X = df.columns.to_list()
# columns_to_X.remove('GPA_result')

In [None]:
# Feature matrix: the selected columns, in the order listed in columns_to_X.
X = df.loc[:, columns_to_X]

# Target vector: PASS/FAIL labels mapped to 1/0.
y = df['GPA_result'].map({'PASS': 1, 'FAIL': 0})

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
from sklearn.neural_network import MLPClassifier

# Neural-network classifier with three hidden layers, trained for at most
# 5000 iterations (`max_iter`) with a fixed seed for reproducibility.
# `learning_rate` and `shuffle` are deliberately not passed: scikit-learn
# documents `learning_rate` as used only when solver='sgd' and `shuffle` only
# when solver is 'sgd' or 'adam', so neither affects training with 'lbfgs'.
model = MLPClassifier(
    random_state=1,
    max_iter=5000,
    hidden_layer_sizes=(150, 80, 50),
    solver='lbfgs',
)

model.fit(X_train, y_train)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

# Predict the held-out split.
y_pred = model.predict(X_test)

# Confusion matrix. One label sequence (the classes the model was fitted on)
# drives both the matrix layout and the tick labels. Taking the tick labels
# from y_test.unique() instead would follow order of first appearance, which
# can differ from the matrix's row/column order and swap the displayed classes.
print('Matriz de Confusão')
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()

# Accuracy
acc = accuracy_score(y_test, y_pred)
print("Acurácia: {:.2f}".format(acc))

# Weighted-average precision
prem = precision_score(y_true = y_test, y_pred = y_pred, average = "weighted")
print("Precisão média ponderada: {:.2f}".format(prem))

# Weighted-average recall
recm = recall_score(y_true = y_test, y_pred = y_pred, average = "weighted")
print("Recall médio ponderado: {:.2f}".format(recm))

# F1 with class 0 (FAIL) as the positive label
f1 = f1_score(y_true = y_test, y_pred = y_pred, pos_label = 0)
print("F1 (FAIL): {:.2f}".format(f1))
# F1 with class 1 (PASS) as the positive label
f1 = f1_score(y_true = y_test, y_pred = y_pred, pos_label = 1)
print("F1 (PASS): {:.2f}".format(f1))

# Weighted-average F1
f1m = f1_score(y_true = y_test, y_pred = y_pred, average = "weighted")
print("F1 médio ponderado: {:.2f}".format(f1m))


##### Melhor resultado
F1 médio ponderado: 0.78

| Classe real | Predito 0 | Predito 1 |
|--|--|--|
| 0 | 12 | 17 |
| 1 | 11 | 90 |


In [None]:
# import joblib

In [None]:
# joblib.dump(model, 'student_performance_mlp_classifier.pkl')
