In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [13]:

# 1️⃣ Charger les données (remplace "dataset.csv" par ton fichier)
df = pd.read_csv('./datasets/dataset_train.csv')
print(df.shape)

(1600, 19)


In [14]:
df.dropna(inplace=True,axis=0)
df.drop(columns=['Index'], inplace=True)
print(df.shape)


(1251, 18)


In [15]:
# Sélectionner une ligne aléatoire
random_row = df.sample(n=1)

# Supprimer cette ligne du dataframe original
df.drop(random_row.index, inplace=True)

# Afficher la ligne aléatoire sélectionnée
random_row

Unnamed: 0,Hogwarts House,First Name,Last Name,Birthday,Best Hand,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
374,Hufflepuff,Francisca,Chester,1999-11-14,Right,46440.0,432.249906,3.173215,-4.322499,6.177,-485.573158,391.510524,5.950994,1054.707139,5.394657,0.054452,-243.45066,-39.76


In [16]:
print(df.shape)

(1250, 18)


In [17]:
class LogisticRegressionOVR:
    def __init__(self, learning_rate=0.01, max_iter=1000, class_encoder=None):
        """
        Initialise le modèle avec un taux d'apprentissage et un nombre d'itérations pour la descente de gradient.
        """
        self.claas_encoder = class_encoder or {}
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.thetas = []  # Liste pour stocker les poids de chaque modèle
        self.classes_ = None  # Stocker les noms des classes

    def sigmoid(self, z):
        """
        Fonction sigmoïde : transforme n'importe quelle valeur en un score entre 0 et 1.
        """
        return 1 / (1 + np.exp(-z))

    def cost_function(self, X, y, theta):
        """
        Fonction de coût : mesure l'erreur entre les prédictions et les vraies valeurs.
        """
        m = len(y)
        h = self.sigmoid(X @ theta)
        return (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))

    def gradient_descent(self, X, y, theta, class_name):
        """
        Applique la descente de gradient pour trouver les poids optimaux.
        """
        m = len(y)
        for _ in range(self.max_iter):
            print(f"Loss({class_name}): {self.cost_function(X, y, theta)}")
            h = self.sigmoid(X @ theta)
            gradient = (1 / m) * X.T @ (h - y)  # Gradient de la fonction de coût
            theta -= self.learning_rate * gradient  # Mise à jour des poids
        return theta

    def fit(self, X, y):
        """
        Entraîne le modèle en utilisant la méthode One-vs-All.
        """
        m, n = X.shape
        self.classes_ = np.unique(y)  # Liste des classes uniques
        self.thetas = np.zeros((len(self.classes_), n + 1))  # Initialisation des poids
        X_bias = np.c_[np.ones((m, 1)), X]  # Ajout d'un biais (colonne de 1)

        for i, c in enumerate(self.classes_):
            y_binary = (y == c).astype(int)  # Convertir en problème binaire (1 si c'est la classe, sinon 0)
            theta = np.zeros(n + 1)  # Initialiser les poids
            self.thetas[i] = self.gradient_descent(X_bias, y_binary, theta, c)  # Entraîner le modèle

    def predict_proba(self, X):
        """
        Retourne les probabilités pour chaque classe.
        """
        X_bias = np.c_[np.ones((X.shape[0], 1)), X]  # Ajouter le biais
        return self.sigmoid(X_bias @ self.thetas.T)  # Matrice de probabilités

    def predict(self, X):
        """
        Prédit la classe en prenant celle avec la plus grande probabilité.
        """
        probabilities = self.predict_proba(X)
        return self.classes_[np.argmax(probabilities, axis=1)]  # Retourne la classe avec la proba max



In [18]:

# 2️⃣ Sélectionner les features pertinentes
# features = ["Defense Against the Dark Arts", "Transfiguration", "Herbology", "Potions", "Ancient Runes"]
features = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
X = df[features]
y = df["Hogwarts House"]  # La variable cible
print(X.shape, y.shape)


(1250, 13) (1250,)


In [19]:

# 3️⃣ Normalisation des données
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [20]:

# 4️⃣ Séparation des données en train/test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [21]:

model = LogisticRegressionOVR(learning_rate=0.01, max_iter=10000)
model.fit(X_train, y_train)

Loss(Gryffindor): 0.6931471805599454
Loss(Gryffindor): 0.6858679986561843
Loss(Gryffindor): 0.6787416419575932
Loss(Gryffindor): 0.671764475185444
Loss(Gryffindor): 0.6649329067291967
Loss(Gryffindor): 0.6582433928757735
Loss(Gryffindor): 0.6516924415799078
Loss(Gryffindor): 0.6452766157933124
Loss(Gryffindor): 0.6389925363721751
Loss(Gryffindor): 0.632836884583848
Loss(Gryffindor): 0.6268064042345681
Loss(Gryffindor): 0.6208979034406907
Loss(Gryffindor): 0.6151082560662349
Loss(Gryffindor): 0.6094344028496166
Loss(Gryffindor): 0.6038733522422579
Loss(Gryffindor): 0.5984221809813903
Loss(Gryffindor): 0.593078034418835
Loss(Gryffindor): 0.5878381266268495
Loss(Gryffindor): 0.5826997403013469
Loss(Gryffindor): 0.5776602264819098
Loss(Gryffindor): 0.5727170041070836
Loss(Gryffindor): 0.567867559422441
Loss(Gryffindor): 0.5631094452579012
Loss(Gryffindor): 0.5584402801897596
Loss(Gryffindor): 0.5538577476018637
Loss(Gryffindor): 0.5493595946593559
Loss(Gryffindor): 0.5449436312074147
Loss(

In [65]:

# 6️⃣ Prédictions
y_pred = model.predict(X_test)

with open("y_pred.txt", "w") as file:
    file.write(f"test, pred\n")
    for pred, test in zip(y_pred, y_test):
        file.write(f"{test}, {pred}\n")

# 7️⃣ Évaluation du modèle
accuracy = accuracy_score(y_test, y_pred)
print(f"🎯 Accuracy: {accuracy:.2%}")


🎯 Accuracy: 98.00%


In [66]:
feature_tets = random_row[features].values
print(feature_tets)
feature_tets_scaled = scaler.transform(feature_tets)
print(feature_tets_scaled)


[[ 6.29570000e+04  4.62402227e+02  3.44751961e+00 -4.62402227e+00
   5.94600000e+00 -8.03793394e+02  3.78226194e+02  7.20915724e+00
   1.08056716e+03  7.32585282e+00  3.91128843e-01 -2.44701800e+02
  -7.81600000e+01]]
[[ 0.80939969  0.79924735  0.43269739 -0.79924735  0.66525041 -1.19939293
  -1.10745848  0.95030542  1.15435542  0.44015813  0.46866838 -0.15666988
  -1.03722138]]




In [67]:
test_pred = model.predict(feature_tets_scaled)
print(f"Prediction: {test_pred[0]}")

Prediction: Hufflepuff


In [14]:
model.classes_

array(['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Slytherin'], dtype=object)

In [54]:
tmp = LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=1000)
tmp.coef_ = model.coef_

In [55]:
test_pred = tmp.predict(feature_tets_scaled)
print(f"Prediction: {test_pred[0]}")

AttributeError: 'LogisticRegression' object has no attribute 'intercept_'