In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [44]:

# 1️⃣ Charger les données (remplace "dataset.csv" par ton fichier)
df = pd.read_csv('./datasets/dataset_train.csv')
print(df.shape)

(1600, 19)


In [45]:
df.dropna(inplace=True,axis=0)
df.drop(columns=['Index'], inplace=True)
print(df.shape)


(1251, 18)


In [46]:
# Sélectionner une ligne aléatoire
random_row = df.sample(n=1)

# Supprimer cette ligne du dataframe original
df.drop(random_row.index, inplace=True)

# Afficher la ligne aléatoire sélectionnée
random_row

Unnamed: 0,Hogwarts House,First Name,Last Name,Birthday,Best Hand,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
601,Ravenclaw,Ignacio,Nagy,2000-06-23,Right,47484.0,-739.454637,3.820398,7.394546,2.78,437.137959,579.129076,6.014226,1043.941479,7.46229,-1.40703,-231.38048,-23.2


In [47]:
print(df.shape)

(1250, 18)


In [48]:

# 2️⃣ Sélectionner les features pertinentes
# features = ["Defense Against the Dark Arts", "Transfiguration", "Herbology", "Potions", "Ancient Runes"]
features = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
X = df[features]
y = df["Hogwarts House"]  # La variable cible
print(X.shape, y.shape)


(1250, 13) (1250,)


In [49]:

# 3️⃣ Normalisation des données
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [50]:

# 4️⃣ Séparation des données en train/test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 5️⃣ Entraîner le modèle One-vs-All
model = LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=1000)
model.fit(X_train, y_train)

# 6️⃣ Prédictions
y_pred = model.predict(X_test)

with open("y_pred.txt", "w") as file:
    file.write(f"test, pred\n")
    for pred, test in zip(y_pred, y_test):
        file.write(f"{test}, {pred}\n")

# 7️⃣ Évaluation du modèle
accuracy = accuracy_score(y_test, y_pred)
print(f"🎯 Accuracy: {accuracy:.2%}")


🎯 Accuracy: 98.80%




In [51]:
feature_tets = random_row[features].values
print(feature_tets)
feature_tets_scaled = scaler.transform(feature_tets)
print(feature_tets_scaled)


[[ 4.74840000e+04 -7.39454637e+02  3.82039834e+00  7.39454637e+00
   2.78000000e+00  4.37137959e+02  5.79129076e+02  6.01422631e+00
   1.04394148e+03  7.46229017e+00 -1.40703048e+00 -2.31380480e+02
  -2.32000000e+01]]
[[-0.11799476 -1.51124559  0.50416314  1.51124559 -0.10561431  1.36305006
   0.7774668   0.68177045  0.32036221  0.48417525 -1.3826357   1.36098909
  -0.47414672]]




In [None]:
test_pred = model.predict(feature_tets_scaled)
print(f"Prediction: {test_pred[0]}")

Prediction: Ravenclaw


In [53]:
model.coef_

array([[-0.14958016,  0.30181384, -0.97480441, -0.30181384,  1.09966065,
        -0.26693127,  1.02457927, -0.51091893, -1.20398813, -0.05145651,
        -0.00794398, -0.14308356,  0.74212572],
       [ 0.21713469,  1.28210571,  1.59501731, -1.28210571,  0.38784568,
        -0.84767651, -1.3269499 ,  0.89563959,  0.60154311, -0.41766014,
         0.07959232, -0.05529462, -0.32357844],
       [ 0.09361539, -0.79228793,  0.41486363,  0.79228793,  0.24051721,
         1.47178588,  0.83371469,  0.17895437, -0.08389997, -0.21628123,
        -0.07771055,  0.97864744, -0.0980435 ],
       [-0.07292958, -0.7754538 , -0.35186418,  0.7754538 , -1.74430744,
        -0.87847255, -0.52747585, -0.24166145,  0.78084294,  1.03349095,
        -0.08731514, -0.55696321, -0.13211939]])

In [54]:
tmp = LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=1000)
tmp.coef_ = model.coef_

In [55]:
test_pred = tmp.predict(feature_tets_scaled)
print(f"Prediction: {test_pred[0]}")

AttributeError: 'LogisticRegression' object has no attribute 'intercept_'

In [None]:
import numpy as np

class LogisticRegressionOVR:
    def __init__(self, learning_rate=0.01, max_iter=1000):
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.thetas = None  # Matrice des poids (une ligne par classe)

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def cost_function(self, X, y, theta):
        """ Fonction de coût logistique """
        m = len(y)
        h = self.sigmoid(X @ theta)
        return (-1 / m) * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))

    def gradient_descent(self, X, y, theta):
        """ Descente de gradient """
        m = len(y)
        for _ in range(self.max_iter):
            gradient = (1 / m) * (X.T @ (self.sigmoid(X @ theta) - y))
            theta -= self.learning_rate * gradient
        return theta

    def fit(self, X, y):
        """ Entraînement One-vs-All """
        m, n = X.shape
        classes = np.unique(y)
        self.thetas = np.zeros((len(classes), n))

        for i, c in enumerate(classes):
            y_binary = (y == c).astype(int)  # Convertir en classification binaire
            theta = np.zeros(n)  # Initialiser theta à 0
            self.thetas[i] = self.gradient_descent(X, y_binary, theta)

    def predict(self, X):
        """ Prédiction de la classe la plus probable """
        probs = self.sigmoid(X @ self.thetas.T)  # Prédictions pour chaque classe
        return np.argmax(probs, axis=1)  # Retourne l'indice de la classe avec la proba max