In [175]:
import sys
import statistic_lib as st
import numpy as np
import pandas as pd
import csv

In [176]:
def scale(df):
    """Standardize every column of ``df`` (z-score) and return the frame.

    Each column is replaced by ``(column - mean) / std``, where the mean and the
    standard deviation come from ``statistic_lib`` (``st.mean`` / ``st.std``).
    NOTE(review): whether ``st.std`` is the population or the sample standard
    deviation is not visible from here - confirm in ``statistic_lib``.

    A constant column (standard deviation equal to 0) cannot be divided by its
    spread; it is only centred, which turns it into a column of zeros. The
    previous implementation reused the scaled values of the preceding column in
    that case (or raised if the constant column came first).

    Args:
        df: DataFrame whose columns are all numeric.

    Returns:
        The same DataFrame object; its columns are overwritten in place.
    """
    for col in df.columns:
        arr = df[col]
        X_std = st.std(arr)
        mean_X = st.mean(arr)
        centered = arr - mean_X
        # Only divide when the column actually varies; otherwise keep it centred.
        df[col] = centered / X_std if X_std != 0 else centered
    return df

In [214]:
# Load the training set and discard every row that has a missing value in any column.
df = pd.read_csv('../datasets/dataset_train.csv').dropna()

# Keep the numeric columns only, minus 'Index' and three course columns
# that are not used as features.
df_numeric = (
    df.select_dtypes(include=['number'])
      .drop(columns=['Index', 'Care of Magical Creatures', 'Arithmancy', 'Astronomy'])
)

In [215]:
# Problem dimensions: rows of df_numeric are training examples, columns are features.
nb_exemples, nb_features = df_numeric.shape
nb_classes = 4  # number of target classes (the notebook lists four houses)

In [216]:
# One-hot encode the target: one row per example, one column per house.
houses = ['Gryffindor', 'Slytherin', 'Ravenclaw', 'Hufflepuff']
house_to_index = dict(zip(houses, range(len(houses))))

Y = np.zeros((nb_exemples, nb_classes))
for row, name in enumerate(df['Hogwarts House']):
    # Put a 1 in the column that corresponds to this example's house.
    Y[row, house_to_index[name]] = 1
Y

array([[0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       ...,
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [217]:
# Standardize the features, then convert the result to a NumPy array.
# Note: scale() writes the scaled columns back into df_numeric before returning it.
X = np.array(scale(df_numeric))

In [218]:
# Check the shape of the feature matrix before the column of ones is added.
X.shape

(1251, 10)

In [219]:
# Prepend a column of ones so that the first row of the weight matrix acts as
# the intercept.
# The guard keeps this cell idempotent: without it, every re-run would stack one
# more column of ones onto X (the cell reads and overwrites the same variable).
if X.shape[1] == nb_features:
    ones = np.ones(X.shape[0])
    X = np.column_stack((ones, X))

print(X.shape)
print(X)

(1251, 11)
[[ 1.          0.86912091  1.02616421 ... -0.69991363  1.19856638
  -0.51167711]
 [ 1.         -1.37460944  1.1494017  ...  0.4150871  -1.0081526
  -1.39751008]
 [ 1.          1.25177128  0.79224618 ...  0.89218017  1.81896781
   0.07481882]
 ...
 [ 1.         -0.84134946 -0.95545673 ... -1.37069927 -0.94584842
   1.79459968]
 [ 1.          0.43159401 -0.78196693 ... -0.66324604 -0.32605358
  -1.02254566]
 [ 1.          0.81048621 -1.23370243 ... -1.26030525 -0.07908455
  -0.79699392]]


### Fonction Softmax

In [220]:
def softmax(Z):
    """Turn each row of ``Z`` into a probability distribution.

    The row maximum is subtracted before exponentiating; this leaves the result
    unchanged but prevents overflow in ``exp`` for large scores.

    Args:
        Z: 2-D array of scores, one row per example.

    Returns:
        Array with the same shape as ``Z`` whose rows sum to 1.
    """
    row_max = np.max(Z, axis=1, keepdims=True)
    exp_z = np.exp(Z - row_max)
    row_total = np.sum(exp_z, axis=1, keepdims=True)
    return exp_z / row_total

On soustrait le maximum de chaque ligne de Z avant d'appliquer l'exponentielle.
Cela aide à stabiliser les calculs numériques et à éviter les débordements.
<b>On divise chaque élément exponentié de Z par la somme des exponentielles de sa ligne.</b> Cela transforme chaque ligne de Z en une distribution de probabilité (les éléments de chaque ligne s'additionnent à 1).

### Fonction Coût : entropie croisée

In [221]:
def cross_entropy(Y, Y_hat):
    """Return the cross-entropy loss of each example.

    Args:
        Y: target matrix, one row per example (one-hot rows in this notebook).
        Y_hat: predicted probabilities, same shape as ``Y``.

    Returns:
        For 2-D inputs, a 1-D array holding one loss value per row:
        ``-sum_k Y[i, k] * log(Y_hat[i, k] + 1e-9)``.
    """
    # The small constant keeps log() finite when a predicted probability is exactly 0.
    return -np.sum(Y * np.log(Y_hat + 1e-9), axis=1)

def cost_function(Y, Y_hat):
    """Return the mean cross-entropy over the examples in ``Y``.

    The average is taken over the number of rows actually passed in, not over a
    notebook-level constant, so the value is correct for any batch (training
    set, validation split, mini-batch).

    Args:
        Y: target matrix, one row per example.
        Y_hat: predicted probabilities, same shape as ``Y``.

    Returns:
        Sum of the per-example cross-entropies divided by the number of rows of ``Y``.
    """
    n_samples = np.shape(Y)[0]
    total_cost = np.sum(cross_entropy(Y, Y_hat))
    return total_cost / n_samples

### Descente de gradient

In [222]:
def compute_gradients(X, Y, Y_hat):
    """Return the gradients of the mean cross-entropy for a softmax model.

    The averages use the number of rows of ``X`` rather than a notebook-level
    constant, so the function gives the right scale for any batch size.

    Args:
        X: design matrix, one row per example.
        Y: target matrix, one row per example.
        Y_hat: predicted probabilities, same shape as ``Y``.

    Returns:
        Tuple ``(dW, db)``:
        - ``dW``: ``X.T @ (Y_hat - Y) / n``, one row per column of ``X``.
        - ``db``: column-wise mean of ``Y_hat - Y`` with shape ``(1, n_classes)``.
          When ``X`` already contains a column of ones, ``db`` equals the
          corresponding row of ``dW``.
    """
    n_samples = np.shape(X)[0]
    error = Y_hat - Y
    dW = np.dot(X.T, error) / n_samples
    db = np.sum(error, axis=0, keepdims=True) / n_samples
    return dW, db

In [236]:
def gradient_descent(X, Y, learning_rate=0.01, num_iterations=1000):
    """Fit a softmax regression with full-batch gradient descent.

    Args:
        X: design matrix of shape (n_samples, n_features + 1); its first column
            is expected to be the column of ones used for the intercept.
        Y: one-hot target matrix of shape (n_samples, n_classes).
        learning_rate: step size applied to the gradient at every iteration.
        num_iterations: number of full-batch updates to perform (0 is allowed).

    Returns:
        Tuple ``(W, Y_hat)``:
        - ``W``: weight matrix of shape (n_features + 1, n_classes), with the
          intercept weights in row 0.
        - ``Y_hat``: probabilities predicted with the returned ``W``.
    """
    # Read the dimensions from the arguments instead of notebook globals, so the
    # function also works on data with a different number of features or classes.
    n_features = X.shape[1] - 1
    n_classes = Y.shape[1]

    # Starting point: intercept row at 0, every feature row equal to
    # [0.1, 0.2, ..., 0.1 * n_classes]. Building the sequence from an integer
    # range gives exactly n_classes values; a float-stepped arange only has the
    # right length by rounding luck.
    biais = np.zeros((1, n_classes))
    sequence = 0.1 + 0.1 * np.arange(n_classes)
    W = np.vstack((biais, np.full((n_features, n_classes), sequence)))

    for _ in range(num_iterations):
        Y_hat = softmax(np.dot(X, W))  # probabilities from the linear scores X @ W

        # The separate bias gradient is not needed: the intercept is row 0 of W
        # and is updated through the column of ones in X.
        dW, _db = compute_gradients(X, Y, Y_hat)

        W -= learning_rate * dW

    # Recompute the probabilities so that they match the returned weights
    # (and so that the function still returns a value when num_iterations is 0).
    return W, softmax(np.dot(X, W))

In [237]:
# Train the classifier on the whole training set, then display the predicted
# class probabilities (one row per example, columns ordered as in `houses`).
W, Y_hat = gradient_descent(X, Y, learning_rate=0.01, num_iterations=10000)
Y_hat

array([[1.18374224e-02, 1.08134425e-02, 9.48607610e-01, 2.87415254e-02],
       [6.16612665e-04, 9.93316076e-01, 3.48245327e-03, 2.58485853e-03],
       [3.65843979e-03, 2.51981371e-03, 9.89742687e-01, 4.07905902e-03],
       ...,
       [9.94077209e-01, 4.06812712e-04, 2.37813621e-03, 3.13784206e-03],
       [1.82633536e-03, 3.34395795e-03, 1.57537834e-03, 9.93254328e-01],
       [1.43446875e-03, 5.51887238e-04, 2.09377882e-03, 9.95919865e-01]])

In [235]:
# Save the trained weights: a header row with the house names, followed by one
# CSV row per row of W.
with open('result_train_weights.csv', 'w', newline='') as weights_file:
    csv.writer(weights_file).writerows([houses, *W])