In [26]:
import sys
import statistic_lib as st
import numpy as np
import pandas as pd
import csv

In [27]:
# Load the training set and discard every row that contains a missing value.
df = pd.read_csv('../datasets/dataset_train.csv').dropna()

# Keep the numeric columns only, minus the columns listed below.
excluded_columns = ['Index', 'Care of Magical Creatures', 'Arithmancy', 'Astronomy']
df_numeric = df.select_dtypes(include=['number']).drop(columns=excluded_columns)

In [32]:
# Number of rows in the numeric feature table.
nb_exemples = df_numeric.shape[0]

# Names of the numeric columns, in DataFrame order.
features = list(df_numeric.columns)
nb_features = len(features)

# Distinct values of the 'Hogwarts House' column and how many there are.
classes = df['Hogwarts House'].unique().tolist()
nb_classes = len(classes)

In [33]:
# Map each class label to the column it occupies in the one-hot matrix.
classe_to_index = {classe: index for index, classe in enumerate(classes)}

# One-hot encode the labels: row r gets a 1 in the column of its class.
Y = np.zeros((nb_exemples, nb_classes))
house_columns = [classe_to_index[house] for house in df['Hogwarts House']]
Y[np.arange(len(house_columns)), house_columns] = 1

In [34]:
# Add the 'Bias' row label exactly once, so that re-running this cell does not
# keep growing `features` (and therefore the number of rows of W).
if 'Bias' not in features:
    features.append('Bias')

# Weight table: one row per feature (plus 'Bias'), one column per class,
# initialised to 0.0 directly as a float matrix.
W = pd.DataFrame(0.0, index=features, columns=classes)
W

Unnamed: 0,Ravenclaw,Slytherin,Gryffindor,Hufflepuff
Herbology,0.0,0.0,0.0,0.0
Defense Against the Dark Arts,0.0,0.0,0.0,0.0
Divination,0.0,0.0,0.0,0.0
Muggle Studies,0.0,0.0,0.0,0.0
Ancient Runes,0.0,0.0,0.0,0.0
History of Magic,0.0,0.0,0.0,0.0
Transfiguration,0.0,0.0,0.0,0.0
Potions,0.0,0.0,0.0,0.0
Charms,0.0,0.0,0.0,0.0
Flying,0.0,0.0,0.0,0.0


In [6]:
# Standardise each column: subtract its mean and divide by its standard
# deviation (pandas' Series.std), then convert the result to a NumPy array.
standardized = df_numeric.apply(lambda column: (column - column.mean()) / column.std())
X = np.array(standardized)

In [7]:
# Append a constant column of ones to X as its last column.
ones = np.ones(len(X))
X = np.concatenate((X, ones[:, np.newaxis]), axis=1)

### Fonction Softmax

In [8]:
def softmax(Z):
    """Apply the softmax function to every row of a 2-D score matrix.

    Args:
        Z: 2-D array-like of scores; the normalisation runs along axis 1.

    Returns:
        Array of the same shape as ``Z`` in which every row is non-negative
        and sums to 1.
    """
    # Subtracting the row maximum leaves the result unchanged but keeps the
    # exponentials from overflowing.
    shifted = Z - np.max(Z, axis=1, keepdims=True)
    exp_z = np.exp(shifted)
    row_totals = np.sum(exp_z, axis=1, keepdims=True)
    return exp_z / row_totals

On soustrait le maximum de chaque ligne de Z avant d'appliquer l'exponentielle.
Cela aide à stabiliser les calculs numériques et à éviter les débordements.
<b>On divise chaque élément exponentié de Z par la somme des exponentielles de sa ligne.</b> Cela transforme chaque ligne de Z en une distribution de probabilité (les éléments de chaque ligne s'additionnent à 1).

### Fonction Coût : entropie croisée

In [9]:
def cross_entropy(Y, Y_hat):
    """Return the cross-entropy of each example (one value per row).

    Args:
        Y: 2-D array of target values, one row per example.
        Y_hat: 2-D array of predicted probabilities, same shape as ``Y``.

    Returns:
        1-D array holding, for every row, ``-sum(Y * log(Y_hat + 1e-9))``.
    """
    # The small constant keeps log() finite when a predicted probability is 0.
    return -np.sum(Y * np.log(Y_hat + 1e-9), axis=1)


def cost_function(Y, Y_hat):
    """Return the mean cross-entropy over the examples passed in.

    The average is taken over the rows of ``Y`` itself rather than over a
    notebook-level example count, so the function gives the right value for
    any dataset size.

    Args:
        Y: 2-D array of target values, one row per example.
        Y_hat: 2-D array of predicted probabilities, same shape as ``Y``.

    Returns:
        The average of the per-example cross-entropies.
    """
    return np.mean(cross_entropy(Y, Y_hat))

### Descente de gradient

In [10]:
def compute_gradients(X, Y, Y_hat):
    """Compute the gradients of the mean cross-entropy for a softmax model.

    The averaging uses the number of rows of ``X`` itself, so the function
    does not depend on any notebook-level example count.

    Args:
        X: 2-D array of inputs, one row per example.
        Y: 2-D array of target values, one row per example.
        Y_hat: 2-D array of predicted probabilities, same shape as ``Y``.

    Returns:
        A tuple ``(dW, db)``:
        - ``dW``: ``X.T @ (Y_hat - Y)`` divided by the number of examples.
        - ``db``: column sums of ``Y_hat - Y`` divided by the number of
          examples, kept 2-D with shape ``(1, n_classes)``. When ``X`` already
          contains a constant column of ones, ``db`` equals the matching row
          of ``dW``.
    """
    n_examples = X.shape[0]
    error = Y_hat - Y
    dW = np.dot(X.T, error) / n_examples
    db = np.sum(error, axis=0, keepdims=True) / n_examples
    return dW, db

In [11]:
def gradient_descent(W, X, Y, learning_rate=0.01, num_iterations=50000):
    """Fit softmax-regression weights with batch gradient descent.

    Args:
        W: Initial weights (NumPy array or pandas DataFrame) with one row per
            column of ``X`` and one column per class. It is not modified; a
            float copy is trained and returned.
        X: 2-D array of inputs, one row per example.
        Y: 2-D array of target values, one row per example.
        learning_rate: Step size applied to the gradient at each iteration.
        num_iterations: Number of gradient steps to perform.

    Returns:
        A tuple ``(W, Y_hat, cost)`` where ``W`` holds the trained weights and
        ``Y_hat`` / ``cost`` are the predictions and the cost evaluated with
        those final weights.
    """
    # Train a float copy so the caller's object is never altered in place.
    W = W.astype(float)

    for i in range(num_iterations):
        Y_hat = softmax(np.dot(X, W))

        # Only the weight gradient is needed here; the second return value is
        # ignored.
        dW, _ = compute_gradients(X, Y, Y_hat)

        W -= learning_rate * dW

        if i % 1000 == 0:
            print(i)

    # Evaluate once after the loop so the returned predictions and cost match
    # the returned weights, and are defined even when num_iterations is 0.
    Y_hat = softmax(np.dot(X, W))
    cost = cost_function(Y, Y_hat)

    print('descente de gradient finit')
    return W, Y_hat, cost

In [12]:
# Run the training loop and keep the weights, predictions and cost it returns.
W, Y_hat, cost = gradient_descent(
    W,
    X,
    Y,
    learning_rate=0.01,
    num_iterations=10000,
)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
descente de gradient finit


In [13]:
# Write the weight table W to a CSV file in the current working directory.
with open('result_train_weights.csv', 'w', newline='') as weights_csv:
    W.to_csv(weights_csv)

In [14]:
# from sklearn.metrics import accuracy_score

# Y_hat_rounded = np.round(Y_hat)
# accuracy_score(Y, Y_hat_rounded)

ModuleNotFoundError: No module named 'sklearn'