In [193]:
import sys
import statistic_lib as st
import numpy as np
import pandas as pd
import csv

In [194]:
# Load the training set and discard every row that has at least one missing value.
df = pd.read_csv('../datasets/dataset_train.csv').dropna()

# Keep the numeric columns only, minus the row identifier and three course
# columns that are excluded from the model.
df_numeric = (
    df.select_dtypes(include=['number'])
      .drop(columns=['Index', 'Care of Magical Creatures', 'Arithmancy', 'Astronomy'])
)

In [211]:
# Feature names are the remaining numeric columns; nb_exemples is the row count.
features = list(df_numeric.columns)
nb_features = len(features)
nb_exemples = len(df_numeric)

# Distinct houses in order of first appearance in the data.
classes = df['Hogwarts House'].unique().tolist()
nb_classes = len(classes)
# Move 'Slytherin' to the last position (raises ValueError if it is absent).
classes.append(classes.pop(classes.index('Slytherin')))
classes

['Ravenclaw', 'Gryffindor', 'Hufflepuff', 'Slytherin']

In [212]:
# Column position of each house in the one-hot target matrix.
classe_to_index = {name: position for position, name in enumerate(classes)}

# One-hot encode the house of every example: a single 1 per row, placed in the
# column of that example's house. An unknown house still raises KeyError.
Y = np.zeros((nb_exemples, nb_classes))
Y[np.arange(len(df)), [classe_to_index[house] for house in df['Hogwarts House']]] = 1
Y

array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       ...,
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]])

In [213]:
# 'Bias' labels the weight row that pairs with the constant column of ones
# appended to X. The guard keeps this cell idempotent: re-running it must not
# add a second 'Bias' row, which would break the shape match with X.
if 'Bias' not in features:
    features.append('Bias')

# One zero-initialised float weight per (feature, class) pair.
W = pd.DataFrame(0.0, index=features, columns=classes)
W.shape

(11, 4)

In [214]:
# Standardise every feature column (subtract its mean, divide by its standard
# deviation) and convert the result to a plain NumPy array.
X = df_numeric.apply(lambda column: (column - column.mean()) / column.std()).to_numpy()

In [215]:
# Sanity check: (number of examples, number of standardised features).
X.shape

(1251, 10)

In [216]:
# Append a constant column of ones so the bias is learned as one more weight row.
# Slicing X back to its nb_features standardised columns first keeps this cell
# idempotent: re-running it does not stack a second column of ones.
ones = np.ones(X.shape[0])
X = np.column_stack((X[:, :nb_features], ones))

print(X.shape)
print(X)

(1251, 11)
[[ 0.86877347  1.02575399  0.3668627  ...  1.19808724 -0.51147256
   1.        ]
 [-1.37405992  1.14894221 -2.14688607 ... -1.00774958 -1.39695141
   1.        ]
 [ 1.25127087  0.79192947  0.71179165 ...  1.81824066  0.07478891
   1.        ]
 ...
 [-0.84101312 -0.95507478  0.69354787 ... -0.94547031  1.79388227
   1.        ]
 [ 0.43142147 -0.78165433  0.85725534 ... -0.32592323 -1.02213689
   1.        ]
 [ 0.81016221 -1.23320925  0.82198404 ... -0.07905294 -0.79667531
   1.        ]]


### Fonction Softmax

In [217]:
def softmax(Z):
    """Row-wise softmax of a 2-D score matrix.

    The maximum of each row is subtracted before exponentiating so that the
    largest exponent is 0, which avoids overflow. Every row of the result
    sums to 1.
    """
    row_max = np.max(Z, axis=1, keepdims=True)
    exp_z = np.exp(Z - row_max)
    row_total = np.sum(exp_z, axis=1, keepdims=True)
    return exp_z / row_total

On soustrait le maximum de chaque ligne de Z avant d'appliquer l'exponentielle.
Cela stabilise les calculs numériques et évite les débordements, sans changer le résultat : le facteur commun $e^{-\max(Z)}$ se simplifie entre le numérateur et le dénominateur.
<b>On divise ensuite chaque exponentielle par la somme des exponentielles de sa ligne.</b> Cela transforme chaque ligne de Z en une distribution de probabilité (les éléments de chaque ligne ont pour somme 1).

### Fonction Coût : entropie croisée

In [218]:
def cross_entropy(Y, Y_hat):
    """Per-example cross-entropy between targets Y and predictions Y_hat.

    Returns a 1-D vector with one loss value per row. A small constant (1e-9)
    is added inside the logarithm so that a predicted probability of exactly
    0 does not produce -inf.
    """
    log_probabilities = np.log(Y_hat + 1e-9)
    weighted = Y * log_probabilities
    return -np.sum(weighted, axis=1)

def cost_function(Y, Y_hat):
    """Mean cross-entropy over all examples in the batch.

    Parameters:
        Y: target matrix, one row per example.
        Y_hat: predicted probabilities, same shape as Y.

    Returns:
        The sum of the per-example cross-entropies divided by the number of
        examples.
    """
    per_example = cross_entropy(Y, Y_hat)
    # Average over the batch actually passed in rather than the notebook-level
    # nb_exemples global, so the result stays correct for any subset of the data.
    return np.sum(per_example) / len(per_example)

### Descente de gradient

In [219]:
def compute_gradients(X, Y, Y_hat):
    """Gradients of the mean cross-entropy for softmax regression.

    Parameters:
        X: input matrix, one row per example.
        Y: target matrix, one row per example.
        Y_hat: predicted probabilities, same shape as Y.

    Returns:
        (dW, db) where dW = X.T @ (Y_hat - Y) / n and db is the column-wise
        mean of (Y_hat - Y), kept as a single-row 2-D array.
    """
    # Normalise by the number of rows received, not by the notebook-level
    # nb_exemples global, so the function also works on other batches.
    n_samples = X.shape[0]
    error = Y_hat - Y
    dW = np.dot(X.T, error) / n_samples
    db = np.sum(error, axis=0, keepdims=True) / n_samples
    return dW, db

In [225]:
def gradient_descent(W, X, Y, learning_rate=0.01, num_iterations=50000):
    """Fit softmax-regression weights with batch gradient descent.

    Parameters:
        W: initial weights; must be conformable with np.dot(X, W) and support
           .astype (NumPy array or pandas DataFrame).
        X: input matrix, one row per example.
        Y: target matrix, one row per example.
        learning_rate: step size applied to the gradient at each iteration.
        num_iterations: number of full-batch update steps (0 is allowed).

    Returns:
        (W, Y_hat, cost): the trained weights, plus the predictions and the
        cost evaluated with those final weights.
    """
    # Train on a float copy so the caller's object is never modified in place.
    W = W.astype(float)

    for _ in range(num_iterations):
        Y_hat = softmax(np.dot(X, W))  # linear scores -> class probabilities

        # The separate bias gradient is not used here: in this notebook the
        # bias is carried by the constant column of X and is part of dW.
        dW, _ = compute_gradients(X, Y, Y_hat)

        W -= learning_rate * dW

    # Evaluate once with the final weights so the returned predictions and cost
    # match the returned W and are defined even when num_iterations is 0.
    Y_hat = softmax(np.dot(X, W))
    cost = cost_function(Y, Y_hat)
    return W, Y_hat, cost

In [229]:
# Run 1000 full-batch gradient-descent steps with a step size of 0.01.
# NOTE: W is both the input and the output, so re-running this cell resumes
# from the previously trained weights instead of restarting from zero.
W, Y_hat, cost = gradient_descent(W, X, Y, learning_rate=0.01, num_iterations=1000)

In [230]:
# Display the learned weight table (rows: features, columns: houses).
W

Unnamed: 0,Ravenclaw,Gryffindor,Hufflepuff,Slytherin
Herbology,0.24625,-0.341056,0.520088,-0.425281
Defense Against the Dark Arts,0.471622,-0.157734,-0.693129,0.379241
Divination,0.161221,0.125863,0.351832,-0.638915
Muggle Studies,0.640323,-0.079428,-0.416166,-0.144729
Ancient Runes,0.476189,0.327511,-0.610816,-0.192884
History of Magic,0.1033,-0.454357,0.308344,0.042713
Transfiguration,0.111047,-0.465991,0.272237,0.082707
Potions,0.089222,-0.189253,-0.230198,0.330229
Charms,0.571213,-0.228979,-0.105708,-0.236525
Flying,-0.053669,0.436766,-0.163297,-0.2198


In [231]:
# Persist the trained weights, including row and column labels, as CSV.
with open('result_train_weights.csv', mode='w', newline='') as weights_file:
    W.to_csv(weights_file)