In [23]:
import import_ipynb
import utils
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split


In [24]:
# préparation des données pour l'entrainement de 1992 - 2020
data_pop = utils.generate_df_from_files("./data_cleaned/population", "_population")
data_fertility = utils.generate_df_from_files("./data_cleaned/fertility", "_fertility")
data_child = utils.generate_df_from_files("./data_cleaned/child_mortality", "_child_mortality")
data_capita = utils.generate_df_from_files("./data_cleaned/gdp_capita", "_gdp_capita")
data_climat = utils.generate_df_from_files("./", "climate_cleaned")

data_climat = np.array(data_climat)
data_climat = pd.DataFrame(data_climat.squeeze())
X_list = []

for annee in range(11, 19):
        
    data_pop_slice = data_pop[annee].iloc[:, :]
    data_fertility_slice = data_fertility[annee].iloc[:, 1:]
    data_child_slice = data_child[annee].iloc[:, 1:]
    data_capita_slice = data_capita[annee].iloc[:, 1:]
    data_climat_slice = data_climat.iloc[:, 1:]
    
    concatenated_data = np.concatenate((
        data_pop_slice, 
        data_fertility_slice, 
        data_child_slice, 
        data_capita_slice,
        data_climat_slice
    ), axis=1)

    X_list.append(concatenated_data)

X = np.vstack(X_list)

In [25]:
#preparation des labels pour classification 1992-->2020 labels à 3 classes
import pandas as pd

label_class_3 = pd.read_csv("labels_class_3_class.csv")
label_class_3.head()

labels_list = []
for i in range(len(label_class_3)):
    label = label_class_3.iloc[i, :-1]
    labels_list.append(label)

y_class_3 = pd.concat(labels_list, axis=0, ignore_index=True)

In [29]:
y_class_3.shape

(1144,)

In [30]:
#preparation des labels pour classification 1992-->2020 labels à 2 classes
import pandas as pd

label_class_2 = pd.read_csv("labels_class_2_class.csv")
label_class_2.head()

labels_list = []
for i in range(len(label_class_2)):
    label = label_class_2.iloc[i, :-1]
    labels_list.append(label)

y_class_2 = pd.concat(labels_list, axis=0, ignore_index=True)

In [None]:
label_class_2.unique()

In [31]:
# création des label pour tous les événements 1992->2020
import pandas as pd
df_delegations = pd.read_csv("./second_part_countries_cleaned_normalized.csv")

labels_list = []
for i in range(len(df_delegations)):
    label = df_delegations.iloc[i, :-1]
    labels_list.append(label)

y = pd.concat(labels_list, axis=0, ignore_index=True)


AttributeError: 'numpy.ndarray' object has no attribute 'unique'

In [38]:

#récupération des pays
country_names = X[:, 0]
X = X[:, 1:]
# convertion des données en float32
X = np.array(X, dtype=np.float32)
y = np.array(y, dtype=np.float32)
y_class = np.array(y_class_3, dtype=np.float32) # choisir 2 ou 3 classes
# séparation des données en train et test
X_train, X_test, y_train, y_test, country_train, country_test, cl_train, cl_test = train_test_split(
    X, y, country_names, y_class,  test_size=0.2, random_state=42)



In [39]:
print(len(cl_train[ cl_train == 0]))
print(len(cl_train[ cl_train == 1]))
print(len(cl_train[ cl_train == 2]))

795
108
12


In [21]:
#show keras version
import keras
print(keras.__version__)
# and tensorflow
import tensorflow as tf 

print(tf.__version__)

3.6.0
2.18.0


In [22]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError, BinaryCrossentropy
from tensorflow.keras.metrics import MeanSquaredError as MSE, Accuracy
from sklearn.utils.class_weight import compute_class_weight

n_folds = 5

kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

regression_loss_per_fold = []
classification_loss_per_fold = []
regression_mse_per_fold = []
classification_accuracy_per_fold = []

class_weights = compute_class_weight('balanced', classes=np.unique(cl_train), y=cl_train)
class_weight_array = np.array(class_weights)



for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    cl_train_fold, cl_val_fold = cl_train[train_index], cl_train[val_index]
    
    sample_weights = np.array([class_weight_array[int(label)] for label in cl_train_fold])

    inputs = Input(shape=(X_train.shape[1],))
    x1 = Dense(10, activation='relu')(inputs)
    x2_reg = Dense(8, activation='relu')(x1)

    output_reg = Dense(1, activation='relu', name='regression_output')(x2_reg)
    output_clf = Dense(1, activation='sigmoid', name='classification_output')(x1)

    model = Model(inputs=inputs, outputs=[output_reg, output_clf])
    model.compile(
        optimizer=Adam(),
        loss={'regression_output': MeanSquaredError(), 'classification_output': BinaryCrossentropy()},
        metrics={'regression_output': 'mse', 'classification_output': Accuracy()}
    )

    history = model.fit(
        X_train_fold, 
        {'regression_output': y_train_fold, 'classification_output': cl_train_fold}, 
        epochs=40,
        batch_size=16, 
        validation_data=(X_val_fold, {'regression_output': y_val_fold, 'classification_output': cl_val_fold}),
            sample_weight=sample_weights,
        verbose=1
    )

    regression_loss_per_fold.append(history.history['regression_output_loss'])
    classification_loss_per_fold.append(history.history['classification_output_loss'])
    regression_mse_per_fold.append(history.history['regression_output_mse'])
    classification_accuracy_per_fold.append(history.history['classification_output_accuracy'])

plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
for loss in regression_loss_per_fold:
    plt.plot(loss, label='Fold Regression Loss')
plt.title('Regression Loss for All Folds')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
for loss in classification_loss_per_fold:
    plt.plot(loss, label='Fold Classification Loss')
plt.title('Classification Loss for All Folds')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 6))
for mse in regression_mse_per_fold:
    plt.plot(mse, label='Fold Regression MSE')
plt.title('Regression MSE (Mean Squared Error) for All Folds')
plt.xlabel('Epochs')
plt.ylabel('MSE')
plt.legend()
plt.show()

plt.figure(figsize=(12, 6))
for acc in classification_accuracy_per_fold:
    plt.plot(acc, label='Fold Classification Accuracy')
plt.title('Classification Accuracy for All Folds')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

avg_val_loss = np.mean([np.min(loss) for loss in regression_loss_per_fold])
avg_val_accuracy = np.mean([np.max(acc) for acc in classification_accuracy_per_fold])

print(f'Average Validation Loss across {n_folds} folds: {avg_val_loss}')
print(f'Average Validation Accuracy across {n_folds} folds: {avg_val_accuracy}')

Epoch 1/40
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - classification_output_accuracy: 0.0000e+00 - classification_output_loss: 0.1020 - loss: 0.7673 - regression_output_loss: 0.6654 - regression_output_mse: 0.1655 - val_classification_output_accuracy: 0.0000e+00 - val_classification_output_loss: 0.1162 - val_loss: 0.7033 - val_regression_output_loss: 0.5843 - val_regression_output_mse: 0.1522
Epoch 2/40
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - classification_output_accuracy: 0.0000e+00 - classification_output_loss: 0.0841 - loss: 0.6686 - regression_output_loss: 0.5846 - regression_output_mse: 0.1496 - val_classification_output_accuracy: 0.0000e+00 - val_classification_output_loss: 0.1079 - val_loss: 0.6513 - val_regression_output_loss: 0.5416 - val_regression_output_mse: 0.1364
Epoch 3/40
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - classification_output_accuracy: 0.0000e+00 - classificati

KeyboardInterrupt: 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError, BinaryCrossentropy
from tensorflow.keras.metrics import MeanSquaredError as MSE, Accuracy
from sklearn.utils.class_weight import compute_class_weight

n_folds = 5

kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

regression_loss_per_fold = []
classification_loss_per_fold = []
regression_mse_per_fold = []
classification_accuracy_per_fold = []

class_weights = compute_class_weight('balanced', classes=np.unique(cl_train), y=cl_train)
class_weight_array = np.array(class_weights)



for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    cl_train_fold, cl_val_fold = cl_train[train_index], cl_train[val_index]

    sample_weights = np.array([class_weight_array[label] for label in cl_train_fold])

    inputs = Input(shape=(X_train.shape[1],))
    x1 = Dense(10, activation='relu')(inputs)
    x2_reg = Dense(8, activation='relu')(x1)

    output_reg = Dense(1, activation='relu', name='regression_output')(x2_reg)
    output_clf = Dense(1, activation='sigmoid', name='classification_output')(x1)

    model = Model(inputs=inputs, outputs=[output_reg, output_clf])
    model.compile(
        optimizer=Adam(),
        loss={'regression_output': MeanSquaredError(), 'classification_output': BinaryCrossentropy()},
        metrics={'regression_output': 'mse', 'classification_output': Accuracy()}
    )

    history = model.fit(
        X_train_fold, 
        {'regression_output': y_train_fold, 'classification_output': cl_train_fold}, 
        epochs=40,
        batch_size=16, 
        validation_data=(X_val_fold, {'regression_output': y_val_fold, 'classification_output': cl_val_fold}),
            sample_weight=sample_weights,
        verbose=1
    )

    regression_loss_per_fold.append(history.history['regression_output_loss'])
    classification_loss_per_fold.append(history.history['classification_output_loss'])
    regression_mse_per_fold.append(history.history['regression_output_mse'])
    classification_accuracy_per_fold.append(history.history['classification_output_accuracy'])

plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
for loss in regression_loss_per_fold:
    plt.plot(loss, label='Fold Regression Loss')
plt.title('Regression Loss for All Folds')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
for loss in classification_loss_per_fold:
    plt.plot(loss, label='Fold Classification Loss')
plt.title('Classification Loss for All Folds')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 6))
for mse in regression_mse_per_fold:
    plt.plot(mse, label='Fold Regression MSE')
plt.title('Regression MSE (Mean Squared Error) for All Folds')
plt.xlabel('Epochs')
plt.ylabel('MSE')
plt.legend()
plt.show()

plt.figure(figsize=(12, 6))
for acc in classification_accuracy_per_fold:
    plt.plot(acc, label='Fold Classification Accuracy')
plt.title('Classification Accuracy for All Folds')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

avg_val_loss = np.mean([np.min(loss) for loss in regression_loss_per_fold])
avg_val_accuracy = np.mean([np.max(acc) for acc in classification_accuracy_per_fold])

print(f'Average Validation Loss across {n_folds} folds: {avg_val_loss}')
print(f'Average Validation Accuracy across {n_folds} folds: {avg_val_accuracy}')

In [None]:

# Code à 2 classes

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError, CategoricalCrossentropy
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.utils import to_categorical


n_folds = 3
N_EP = 40

kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

regression_loss_per_fold = []
classification_loss_per_fold = []
regression_mse_per_fold = []
classification_accuracy_per_fold = []

class_weights = compute_class_weight('balanced', classes=np.unique(cl_train), y=cl_train)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

def weighted_loss(y_true, y_pred):
    print("y_true shape:", y_true.shape)
    print("y_pred shape:", y_pred.shape)
    
    cce_loss = CategoricalCrossentropy()(y_true, y_pred)
    
    return cce_loss

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    cl_train_fold, cl_val_fold = cl_train[train_index], cl_train[val_index]

    cl_train_fold_onehot = to_categorical(cl_train_fold, num_classes=3)
    cl_val_fold_onehot = to_categorical(cl_val_fold, num_classes=3)

    inputs = Input(shape=(X_train.shape[1],))
    first_l = Dense(16, activation='relu')(inputs) 
    second_l = Dense(8, activation='relu')(first_l)

    regression_output = Dense(1, activation='linear', name='regression_output')(second_l)
    classification_output = Dense(3, activation='softmax', name='classification_output')(second_l)
    model = Model(inputs=inputs, outputs=[regression_output, classification_output])

    model.compile(
        optimizer=Adam(),
        loss={
            'regression_output': 'mse',
            'classification_output': 'categorical_crossentropy'
        },
        metrics={
            'regression_output': ['mse'],
            'classification_output': ['accuracy']
        }
    )

    # Generate sample weights
    regression_sample_weight = generate_sample_weights(y_train_fold)
    classification_sample_weight = generate_sample_weights(cl_train_fold)

    # Train the model
    history = model.fit(
        X_train_fold,
        {
            'regression_output': y_train_fold.reshape(-1, 1),
            'classification_output': cl_train_fold_onehot
        },
        sample_weight={
            'regression_output': regression_sample_weight,
            'classification_output': classification_sample_weight
        },
        epochs=N_EP,
        batch_size=16,
        validation_data=(
            X_val_fold,
            {
                'regression_output': y_val_fold.reshape(-1, 1),
                'classification_output': cl_val_fold_onehot
            }
        ),
        verbose=1)

    regression_loss_per_fold.append(history.history['regression_output_loss'])
    classification_loss_per_fold.append(history.history['classification_output_loss'])
    regression_mse_per_fold.append(history.history['regression_output_mse'])
    classification_accuracy_per_fold.append(history.history['classification_output_accuracy'])

plt.figure(figsize=(12, 6))
for loss in regression_loss_per_fold:
    plt.plot(loss, label='Fold Regression Loss')
plt.title('Regression Loss for All Folds')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.figure(figsize=(12, 6))
for loss in classification_loss_per_fold:
    plt.plot(loss, label='Fold Classification Loss')
plt.title('Classification Loss for All Folds')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# ** Regression Accuracy (or MSE) **
plt.figure(figsize=(12, 6))
for mse in regression_mse_per_fold:
    plt.plot(mse, label='Fold Regression MSE')
plt.title('Regression MSE for All Folds')
plt.xlabel('Epochs')
plt.ylabel('MSE')
plt.legend()
plt.show()

# ** Classification Accuracy **
plt.figure(figsize=(12, 6))
for acc in classification_accuracy_per_fold:
    plt.plot(acc, label='Fold Classification Accuracy')
plt.title('Classification Accuracy for All Folds')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

avg_regression_loss = np.mean([np.min(loss) for loss in regression_loss_per_fold])
avg_classification_loss = np.mean([np.min(loss) for loss in classification_loss_per_fold])
avg_regression_mse = np.mean([np.min(mse) for mse in regression_mse_per_fold])
avg_classification_accuracy = np.mean([np.max(acc) for acc in classification_accuracy_per_fold])

print(f'Average Regression Loss: {avg_regression_loss}')
print(f'Average Classification Loss: {avg_classification_loss}')
print(f'Average Regression MSE: {avg_regression_mse}')
print(f'Average Classification Accuracy: {avg_classification_accuracy}')


In [None]:
# Code à 3 classes

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

# Lists to store F1 and accuracy scores for each fold
f1_scores_per_fold = []
accuracy_scores_per_fold = []

# Loop through each fold to evaluate metrics
for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    cl_train_fold, cl_val_fold = cl_train[train_index], cl_train[val_index]

    # Make predictions for the classification model on the validation set
    y_pred_class = classification_model.predict(X_val_fold)
    y_pred_class = np.argmax(y_pred_class, axis=1)  # Convert probabilities to class labels (argmax for multi-class)

    # Calculate F1 score and accuracy for the current fold
    f1 = f1_score(cl_val_fold, y_pred_class, average='weighted')  # Weighted F1 for multi-class
    accuracy = accuracy_score(cl_val_fold, y_pred_class)

    # Append scores to the lists
    f1_scores_per_fold.append(f1)
    accuracy_scores_per_fold.append(accuracy)

# Calculate the mean F1 score and accuracy score across all folds
mean_f1_score = np.mean(f1_scores_per_fold)
mean_accuracy_score = np.mean(accuracy_scores_per_fold)

# Print the mean scores
print(f'Mean F1 Score across all folds: {mean_f1_score:.4f}')
print(f'Mean Accuracy Score across all folds: {mean_accuracy_score:.4f}')

# Compute confusion matrix for the final fold or average over all folds
# Using the last fold for demonstration purposes
y_pred_class = classification_model.predict(X_val_fold)
y_pred_class = np.argmax(y_pred_class, axis=1)  # Convert probabilities to class labels

# Compute the confusion matrix
cm = confusion_matrix(cl_val_fold, y_pred_class)

# Plot the confusion matrix using seaborn heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, xticklabels=['Class 0', 'Class 1', 'Class 2'], yticklabels=['Class 0', 'Class 1', 'Class 2'])
plt.title('Confusion Matrix for 3-Class Classification Task')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
# version simple du NN en format NON séquentiel
"""

inputs = Input(shape=(X_train.shape[1],))

x = Dense(50, activation='relu')(inputs)
#x = Dropout(0.3)(x)

#x = Dense(16, activation='relu')(x)
#x = Dropout(0.3)(x)

x = Dense(12, activation='relu')(x)

outputs = Dense(1)(x)

model = Model(inputs=inputs, outputs=outputs)

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

history = model.fit(X_train, y_train, epochs=80, batch_size=32, validation_split=0.2, verbose=1)
"""