In [67]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras_tuner import RandomSearch
from keras.src.optimizers import Adam
from keras.src.layers import Dense
from keras import Sequential
from keras.src.layers import Lambda
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [27]:
# Load the semicolon-separated source data. `df` is kept as loaded; the
# following cells transform the separate `df_copy` duplicate instead.
df = pd.read_csv(
    "fr-en-reussite-au-baccalaureat-origine-sociale.csv",
    sep=';',
    encoding="utf-8",
)
df_copy = df.copy()
df_copy.head()

Unnamed: 0,num_ligne,Année,Origine sociale,Nombre d'admis au baccalaureat général,Pourcentage d'admis au baccalaureat général,Nombre d'admis au baccalauréat technologique,Pourcentage d'admis au baccalauréat technologique,Nombre d'admis au baccalauréat professionnel,Pourcentage d'admis au baccalauréat professionnel,Nombre d'admis au baccalauréat,Pourcentage d'admis au baccalauréat
0,3.0,1997,Autres personnes sans activité professionnelle,8844,66.4,8679,71.2,5883,73.7,23406,69.9
1,5.0,1997,"Cadres, professions intellectuelles supérieure...",16326,85.7,1994,81.3,499,79.8,18819,85.1
2,8.0,1997,Indéterminé,4326,61.9,4334,67.0,5922,72.5,14582,67.4
3,12.0,1997,Retraités,5650,69.8,4819,72.5,3957,75.2,14426,72.1
4,14.0,1998,"Artisans, commerçants, chefs d'entreprise",26226,77.5,14619,80.9,8054,80.0,48899,78.9


In [28]:
# Derive `df_copy` from the untouched `df` instead of from itself, so this
# cell can be re-run safely: the previous form raised a KeyError on a second
# run because "num_ligne" had already been dropped from `df_copy`.
df_copy = df.drop(columns=["num_ligne"])

# Standardize the year column. The fitted `scaler` stays at notebook level
# because `predict` hands it to `evaluate` to transform the forecast years.
scaler = StandardScaler()
df_copy["Année"] = scaler.fit_transform(df_copy[["Année"]])
df_copy

Unnamed: 0,Année,Origine sociale,Nombre d'admis au baccalaureat général,Pourcentage d'admis au baccalaureat général,Nombre d'admis au baccalauréat technologique,Pourcentage d'admis au baccalauréat technologique,Nombre d'admis au baccalauréat professionnel,Pourcentage d'admis au baccalauréat professionnel,Nombre d'admis au baccalauréat,Pourcentage d'admis au baccalauréat
0,-1.669046,Autres personnes sans activité professionnelle,8844,66.4,8679,71.2,5883,73.7,23406,69.9
1,-1.669046,"Cadres, professions intellectuelles supérieure...",16326,85.7,1994,81.3,499,79.8,18819,85.1
2,-1.669046,Indéterminé,4326,61.9,4334,67.0,5922,72.5,14582,67.4
3,-1.669046,Retraités,5650,69.8,4819,72.5,3957,75.2,14426,72.1
4,-1.540658,"Artisans, commerçants, chefs d'entreprise",26226,77.5,14619,80.9,8054,80.0,48899,78.9
...,...,...,...,...,...,...,...,...,...,...
319,1.540658,Ouvriers,36419,94.1,21368,89.3,29518,81.5,87305,88.3
320,1.669046,Agriculteurs exploitants,4282,98.0,1235,95.7,1405,91.5,6922,96.2
321,1.669046,Indéterminé,11707,88.7,11158,88.6,38350,83.3,61215,85.2
322,1.669046,Ouvriers,35609,93.4,20449,87.5,27438,81.2,83496,87.6


In [68]:
def build_model(hp):
    """Build and compile the tunable regression network handed to the tuner.

    Hyperparameters requested from ``hp`` (in this order):
      * ``num_layers``: number of hidden ReLU layers, from 1 to 5;
      * ``units_<k>``: width of hidden layer ``k``, from 32 to 512 in steps of 32;
      * ``learning_rate``: Adam learning rate between 1e-4 and 1e-2,
        requested with ``sampling='LOG'``.

    The network ends with an 8-unit linear layer followed by a constraint
    layer: outputs 1, 3, 5 and 7 go through ReLU (non-negative values) and
    outputs 2, 4, 6 and 8 through a sigmoid multiplied by 100 (values between
    0 and 100).

    Returns:
        The compiled model (mean squared error loss, MAE metric).
    """
    def _constrain_outputs(raw):
        # Slice each output as a single-column tensor so the pieces can be
        # concatenated back along axis 1 in their original order.
        pieces = []
        for col in range(8):
            column = raw[:, col:col + 1]
            if col % 2 == 0:
                pieces.append(tf.nn.relu(column))        # positive value
            else:
                pieces.append(100 * tf.sigmoid(column))  # between 0 and 100
        return tf.concat(pieces, axis=1)

    net = Sequential()

    # Tune the depth first, then the width of every hidden layer.
    depth = hp.Int('num_layers', min_value=1, max_value=5)
    for layer_idx in range(depth):
        width = hp.Int(f'units_{layer_idx}', min_value=32, max_value=512, step=32)
        net.add(Dense(units=width, activation='relu'))

    net.add(Dense(units=8, activation='linear'))
    net.add(Lambda(_constrain_outputs))

    step_size = hp.Float('learning_rate', 1e-4, 1e-2, sampling='LOG')
    net.compile(optimizer=Adam(learning_rate=step_size),
                loss='mean_squared_error',
                metrics=['mae'])

    return net

def tune(xtrain, ytrain, i):
    """Run a random hyperparameter search and return the best model found.

    Args:
        xtrain: Training inputs passed to the tuner's ``search``.
        ytrain: Training targets passed to the tuner's ``search``.
        i: Index used to name the tuner project (``dataviz_<i>``) inside the
            ``neural_networks`` directory.

    Returns:
        The first model returned by ``get_best_models(num_models=1)``.
    """
    search_settings = dict(
        objective='val_mae',
        max_trials=10,
        directory='neural_networks',
        project_name=f'dataviz_{i}',
    )
    searcher = RandomSearch(build_model, **search_settings)

    # 20% of the training data is held out by Keras to compute val_mae.
    searcher.search(xtrain, ytrain, epochs=100, batch_size=32, validation_split=0.2)

    top_models = searcher.get_best_models(num_models=1)
    return top_models[0]

def evaluate(input_data : pd.DataFrame, pred_model, scaler_) -> pd.DataFrame:
    """Run ``pred_model`` on ``input_data`` and label the predicted columns.

    Args:
        input_data: Frame containing an "Année" column. It is copied first,
            so the caller's frame is not modified.
        pred_model: Object exposing ``predict``; it receives the copy with
            the transformed "Année" column.
        scaler_: Object exposing ``transform``; applied to the "Année" column.

    Returns:
        A DataFrame built from the model output, with the eight
        count/percentage column names listed below, in that order.
    """
    output_columns = [
        "Nombre d\'admis au baccalaureat général",
        "Pourcentage d\'admis au baccalaureat général",
        "Nombre d\'admis au baccalauréat technologique",
        "Pourcentage d\'admis au baccalauréat technologique",
        "Nombre d\'admis au baccalauréat professionnel",
        "Pourcentage d\'admis au baccalauréat professionnel",
        "Nombre d\'admis au baccalauréat",
        "Pourcentage d\'admis au baccalauréat",
    ]

    features = input_data.copy()
    scaled_year = scaler_.transform(features[["Année"]])
    features["Année"] = scaled_year

    raw_predictions = pred_model.predict(features)
    return pd.DataFrame(raw_predictions, columns=output_columns)

In [74]:
# Social origins in a deterministic (sorted) order. The position `i` in this
# list names the tuner project and the per-origin prediction CSV files, so
# the order must be the same in every kernel session. Iterating a `set` of
# strings does not guarantee that, and files saved in one session could be
# paired with a different origin in the next. Missing values are dropped
# because they can never match an equality filter on "Origine sociale".
origins = sorted(df_copy["Origine sociale"].dropna().unique().tolist())

# Accumulators updated by `predict` through `global`: the combined frame
# (original rows plus forecasts) and the next "num_ligne" value to assign.
final_df = df.copy()
last_num_ligne = df.shape[0] + 1

def predict(i):
    """Tune a model for ``origins[i]`` and append its forecasts to ``final_df``.

    Steps:
      1. Select the rows of ``df_copy`` for this origin and tune a model on
         them ("Année" as input, all remaining columns except
         "Origine sociale" as targets).
      2. Read the forecast input CSV, keep the rows of this origin and
         predict on them with the notebook-level ``scaler``.
      3. Write the predictions to a per-origin CSV file.
      4. Append line numbers, input rows and predictions to the global
         ``final_df`` and advance the global ``last_num_ligne``.

    Args:
        i: Position of the social origin in ``origins``; also used in the
            tuner project name and in the output file name.
    """
    global last_num_ligne, final_df

    origin = origins[i]

    # Training data for this origin only; the full subset is used for tuning.
    history = df_copy[df_copy["Origine sociale"] == origin]
    X_train = history["Année"]
    y_train = history.drop(["Année", "Origine sociale"], axis=1)
    best_model = tune(X_train, y_train, i)

    # Forecast inputs restricted to the same origin.
    forecast_rows = pd.read_csv("fr-en-reussite-prediction-2025-to-2030-input.csv", encoding="utf-8", delimiter=';')
    forecast_rows = forecast_rows[forecast_rows["Origine sociale"] == origin]

    # Positional index so these rows line up with the predictions below.
    labelled_rows = forecast_rows.reset_index(drop=True)

    model_input = forecast_rows.drop("Origine sociale", axis=1)
    prediction = evaluate(model_input, best_model, scaler)
    prediction.to_csv(f"fr-en-reussite-au-bac-pred-2024-2030-{i}.csv")

    n_new = prediction.shape[0]
    new_num_ligne = pd.DataFrame(
        list(range(last_num_ligne, last_num_ligne + n_new)),
        columns=["num_ligne"],
    )
    forecast_block = pd.concat([new_num_ligne, labelled_rows, prediction], axis=1)
    final_df = pd.concat([final_df, forecast_block], axis=0)
    last_num_ligne += n_new

In [75]:
# Tune one model per social origin; every `predict` call appends its forecast
# rows to the global `final_df`.
n_origins = len(origins)
for i in range(n_origins):
    predict(i)

# Save the original rows together with the appended forecasts.
final_df.to_csv("fr-en-reussite-bac-plus-prevision-2025-to-2030.csv")

Trial 10 Complete [00h 00m 14s]
val_mae: 7703.966796875

Best val_mae So Far: 5336.9765625
Total elapsed time: 00h 02m 38s
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step


  saveable.load_own_variables(weights_store.get(inner_path))


In [93]:
# Rebuild the combined file from the per-origin prediction CSVs that
# `predict` wrote, without running the tuner again.
input_ = pd.read_csv("fr-en-reussite-prediction-2025-to-2030-input.csv", encoding="utf-8", delimiter=';')
last_num_ligne = df.shape[0] + 1

# Collect the pieces and concatenate once at the end instead of growing
# `final_df` inside the loop.
frames = [df.copy()]
for i in range(len(origins)):
    # Forecast input rows of this origin, re-indexed from 0 so they align
    # with the prediction rows. The chained call returns a new frame, which
    # avoids mutating a filtered slice of `input_` in place.
    input_2 = input_[input_["Origine sociale"] == origins[i]].reset_index(drop=True)

    # The prediction files were written with their index as the first
    # column. Read it back as the index so it does not end up in `final_df`
    # as a spurious "Unnamed: 0" data column.
    pred_i = pd.read_csv(f"fr-en-reussite-au-bac-pred-2024-2030-{i}.csv", encoding="utf-8", delimiter=',', index_col=0)

    new_num_ligne = pd.DataFrame(
        list(range(last_num_ligne, last_num_ligne + pred_i.shape[0])),
        columns=["num_ligne"],
    )
    frames.append(pd.concat([new_num_ligne, input_2, pred_i], axis=1))
    last_num_ligne += pred_i.shape[0]

final_df = pd.concat(frames, axis=0)
final_df.to_csv("fr-en-reussite-bac-plus-prevision-2025-to-2030.csv")

In [95]:
# Reload the combined CSV from disk and display summary statistics as a
# quick check of the saved result.
dddf = pd.read_csv(
    "fr-en-reussite-bac-plus-prevision-2025-to-2030.csv",
    sep=',',
    encoding="utf8",
)
dddf.describe()

Unnamed: 0,num_ligne,Année,Nombre d'admis au baccalaureat général,Pourcentage d'admis au baccalaureat général,Nombre d'admis au baccalauréat technologique,Pourcentage d'admis au baccalauréat technologique,Nombre d'admis au baccalauréat professionnel,Pourcentage d'admis au baccalauréat professionnel,Nombre d'admis au baccalauréat,Pourcentage d'admis au baccalauréat
count,408.0,408.0,408.0,408.0,408.0,408.0,408.0,408.0,408.0,408.0
mean,204.5,2013.5,56016.681743,89.553676,26600.771313,87.371569,22242.742111,83.752941,95693.167127,86.508824
std,117.923704,9.822754,89283.01812,8.91231,49637.123313,9.236464,35000.831405,14.16391,144382.600988,14.36335
min,1.0,1997.0,0.0,61.9,0.0,66.5,0.0,0.0,0.0,0.0
25%,102.75,2005.0,7779.20555,83.5,2496.25,80.0,2949.75,79.1,15738.25,81.175
50%,204.5,2013.5,25230.0,90.9,12583.0,87.25,11351.5,83.0,54828.0,87.7
75%,306.25,2022.0,50525.75,97.75,24475.25,94.925,23871.25,90.025,96361.0,95.025
max,408.0,2030.0,481109.75,100.0,343618.53,100.0,190899.0,100.0,722971.0,100.0
