In [90]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Chargement des données

file_path = 'C:/Users/benab/OneDrive/Bureau/ML-JEE/enhanced_investment_data1.csv'
data = pd.read_csv(file_path,encoding='ISO-8859-1')

# Afficher un aperçu des données
print("Overview of the first rows of the dataset :")
print(data.head())

Overview of the first rows of the dataset :
                             Name Gender         City  Age    Income  \
0  Constance Olivier Le LemaÃ®tre  Women        Paris   48  91061.68   
1                    AdÃ¨le Duval    Man  Montpellier   60  95493.18   
2         Simone Masson de Collin  Women         Nice   40  84344.87   
3           Gabrielle Pichon-Paul  Women         Lyon   44  59332.79   
4               Martin Le Chauvin  Women       Nantes   40  75818.07   

  Risk Tolerance            Investment History  Financial Objective  \
0           High          Actions, Obligations             Retraite   
1         Medium              ETF, Fundraising  Épargne de sécurité   
2           High       Obligations, Immobilier             Retraite   
3         Medium          Actions, Obligations             Retraite   
4           High  Fundraising, ETF, Immobilier     Achat immobilier   

  Preferred Sector Investment Frequency Recommended Domain  
0            Santé              Men

In [91]:
# 1. Vérification des informations de base sur le dataset
print("\nGeneral information about the dataset :")
print(data.info())


General information about the dataset :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Name                  1000 non-null   object 
 1   Gender                1000 non-null   object 
 2   City                  1000 non-null   object 
 3   Age                   1000 non-null   int64  
 4   Income                1000 non-null   float64
 5   Risk Tolerance        1000 non-null   object 
 6   Investment History    1000 non-null   object 
 7   Financial Objective   1000 non-null   object 
 8   Preferred Sector      1000 non-null   object 
 9   Investment Frequency  1000 non-null   object 
 10  Recommended Domain    1000 non-null   object 
dtypes: float64(1), int64(1), object(9)
memory usage: 86.1+ KB
None


In [92]:
# 2. Vérification des valeurs manquantes
print("\nMissing values by column :")
print(data.isnull().sum())


Missing values by column :
Name                    0
Gender                  0
City                    0
Age                     0
Income                  0
Risk Tolerance          0
Investment History      0
Financial Objective     0
Preferred Sector        0
Investment Frequency    0
Recommended Domain      0
dtype: int64


In [93]:
# 3. Suppression des doublons
print("\nNumber of duplicates before deletion :", data.duplicated().sum())
data = data.drop_duplicates()
print("Number of duplicates after deletion :", data.duplicated().sum())


Number of duplicates before deletion : 0
Number of duplicates after deletion : 0


In [94]:
# 4. Nettoyage des chaînes de caractères (suppression des espaces inutiles, mise en minuscule)
def clean_text_columns(df, columns):
    for col in columns:
        df[col] = df[col].str.strip().str.lower()
    return df

text_columns = ['Name', 'Gender', 'City', 'Risk Tolerance', 'Investment History',
                 'Financial Objective', 'Preferred Sector', 'Investment Frequency', 'Recommended Domain']
data = clean_text_columns(data, text_columns)

In [95]:
# 5. Harmonisation des genres
data['Gender'] = data['Gender'].replace({'women': 'Women', 'man': 'Man'})

In [96]:
# 6. Affichage des catégories uniques dans certaines colonnes pour vérification
print("\nUnique Categories for Gender :")
print(data['Gender'].unique())


Unique Categories for Gender :
['Women' 'Man']


In [97]:
print("\nUnique Categories for Risk Tolerance :")
print(data['Risk Tolerance'].unique())


Unique Categories for Risk Tolerance :
['high' 'medium' 'low']


In [98]:
print("\nUnique Categories for Investment Frequency :")
print(data['Investment Frequency'].unique())


Unique Categories for Investment Frequency :
['mensuel' 'trimestriel' 'annuel']


In [99]:
print("\nUnique Categories for Recommended Domain :")
print(data['Recommended Domain'].unique())


Unique Categories for Recommended Domain :
['actions' 'obligations' 'immobilier' 'etf' 'cryptomonnaies' 'startups']


In [100]:
# 7. Encodage optionnel (si nécessaire pour le Machine Learning)
from sklearn.preprocessing import LabelEncoder


encoder = LabelEncoder()
encoded_columns = ['Gender', 'Risk Tolerance', 'Financial Objective', 'Preferred Sector', 'Investment Frequency', 'Recommended Domain']
for col in encoded_columns:
    data[col] = encoder.fit_transform(data[col])

In [101]:
# 8. Résumé statistique des colonnes numériques
print("\nStatistical summary of numeric columns :")
print(data.describe())


Statistical summary of numeric columns :
            Gender          Age         Income  Risk Tolerance  \
count  1000.000000  1000.000000    1000.000000     1000.000000   
mean      0.513000    42.039000   61025.692030        1.324000   
std       0.500081    11.038357   28184.942734        0.941228   
min       0.000000    25.000000    5843.110000        0.000000   
25%       0.000000    34.000000   40586.727500        0.000000   
50%       1.000000    42.000000   56614.175000        2.000000   
75%       1.000000    50.000000   76320.660000        2.000000   
max       1.000000    70.000000  164195.870000        2.000000   

       Financial Objective  Preferred Sector  Investment Frequency  \
count          1000.000000        1000.00000           1000.000000   
mean              2.619000           3.38900              1.716000   
std               1.760949           1.80913              0.582824   
min               0.000000           0.00000              0.000000   
25%          

In [102]:
# 1. Vérification des informations de base sur le dataset
print("\nGeneral information about the dataset :")
print(data.info())


General information about the dataset :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Name                  1000 non-null   object 
 1   Gender                1000 non-null   int32  
 2   City                  1000 non-null   object 
 3   Age                   1000 non-null   int64  
 4   Income                1000 non-null   float64
 5   Risk Tolerance        1000 non-null   int32  
 6   Investment History    1000 non-null   object 
 7   Financial Objective   1000 non-null   int32  
 8   Preferred Sector      1000 non-null   int32  
 9   Investment Frequency  1000 non-null   int32  
 10  Recommended Domain    1000 non-null   int32  
dtypes: float64(1), int32(6), int64(1), object(3)
memory usage: 62.6+ KB
None


In [103]:
# 1. Suppression de la colonne "Nom" (non utile pour le ML)
data = data.drop(columns=['Name'])

# 2. Encodage des villes (Label Encoding)
data['City'] = encoder.fit_transform(data['City'])

In [104]:
# Liste des types d'investissements possibles
investment_types = set()
data['Investment History'].apply(lambda x: investment_types.update(x.split(',')))

# Nettoyer et ordonner les types d'investissement
investment_types = sorted([inv_type.strip().lower() for inv_type in investment_types])

# Encoder la colonne "Historique d'Investissement" en une chaîne binaire
def encode_investments(investment_string, investment_types):
    # Convertir en binaire selon les types d'investissement
    investment_string = investment_string.lower()
    return ''.join(['1' if inv_type in investment_string else '0' for inv_type in investment_types])

# Appliquer l'encodage dans la même colonne
data['Investment History'] = data['Investment History'].apply(lambda x: encode_investments(x, investment_types))

print("\nData preview after encoding 'Investment History' :")
print(data.head())



Data preview after encoding 'Investment History' :
   Gender  City  Age    Income  Risk Tolerance  Investment History  \
0       1     7   48  91061.68               0  110000000000001100   
1       0     4   60  95493.18               2  000011110000000000   
2       1     6   40  84344.87               0  000000001100001100   
3       1     2   44  59332.79               2  110000000000001100   
4       1     5   40  75818.07               0  000011111100000000   

   Financial Objective  Preferred Sector  Investment Frequency  \
0                    2                 5                     1   
1                    6                 4                     2   
2                    2                 2                     1   
3                    2                 3                     2   
4                    0                 5                     2   

   Recommended Domain  
0                   0  
1                   4  
2                   0  
3                   3  
4         

In [105]:
# 1. Vérification des informations de base sur le dataset
print("\nGeneral information about the dataset :")
print(data.info())


General information about the dataset :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Gender                1000 non-null   int32  
 1   City                  1000 non-null   int32  
 2   Age                   1000 non-null   int64  
 3   Income                1000 non-null   float64
 4   Risk Tolerance        1000 non-null   int32  
 5   Investment History    1000 non-null   object 
 6   Financial Objective   1000 non-null   int32  
 7   Preferred Sector      1000 non-null   int32  
 8   Investment Frequency  1000 non-null   int32  
 9   Recommended Domain    1000 non-null   int32  
dtypes: float64(1), int32(7), int64(1), object(1)
memory usage: 50.9+ KB
None


In [106]:
# Encodage de la colonne cible avec LabelEncoder
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
data['Recommended Domain'] = encoder.fit_transform(data['Recommended Domain'])

# Affichage des correspondances (catégorie -> valeur numérique)
label_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print("Mapping encoded values :", label_mapping)


Mapping encoded values : {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}


In [107]:
# 9. Sauvegarde du dataset nettoyé
#cleaned_file_path = "newdata1.csv"
#data.to_csv(cleaned_file_path, index=False)
#print(f"\nDataset nettoyé sauvegardé sous : {cleaned_file_path}")

In [108]:
# Trouver les colonnes de type 'object'
object_columns = data.select_dtypes(include=['object']).columns

# Initialiser un dictionnaire pour sauvegarder les mappings
encoders = {}
encoded_data = data.copy()

# Encoder chaque colonne de type 'object'
for col in object_columns:
    encoder = LabelEncoder()
    encoded_data[col] = encoder.fit_transform(encoded_data[col])
    encoders[col] = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))  # Sauvegarder le mapping pour chaque colonne

# Affichage des encodages réalisés
print("\nEncoded Column Mappings :")
for col, mapping in encoders.items():
    print(f"\n{col} : {mapping}")
# Aperçu des données encodées
print("\nPreview of the first lines of the encoded data :")
print(encoded_data.head())
# Sauvegarder le dataset encodé si besoin
#encoded_file_path = "encoded_user_investment_data.csv"
#encoded_data.to_csv(encoded_file_path, index=False)
#print(f"\nDataset encodé sauvegardé sous : {encoded_file_path}")



Encoded Column Mappings :

Investment History : {'000000000000000011': 0, '000000000000001100': 1, '000000000000001111': 2, '000000000000011111': 3, '000000000000100000': 4, '000000000000100011': 5, '000000000000101100': 6, '000000000011001100': 7, '000000000011010000': 8, '000000000011100000': 9, '000000000011101100': 10, '000000001100000000': 11, '000000001100000011': 12, '000000001100001100': 13, '000000001100001111': 14, '000000001100011100': 15, '000000001100100000': 16, '000000001111000000': 17, '000000001111001100': 18, '000000110000000000': 19, '000000110000000011': 20, '000000110000001100': 21, '000000110000001111': 22, '000000110000100000': 23, '000000110011000000': 24, '000000111100000000': 25, '000000111100001100': 26, '000011000000000000': 27, '000011000000000011': 28, '000011000000001100': 29, '000011000000001111': 30, '000011000000011111': 31, '000011000000100000': 32, '000011000000101100': 33, '000011000011000000': 34, '000011000011000011': 35, '000011000011001100': 36

In [109]:
# 1. Vérification des informations de base sur le dataset
print("\nGeneral information about the dataset :")
print(data.info())


General information about the dataset :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Gender                1000 non-null   int32  
 1   City                  1000 non-null   int32  
 2   Age                   1000 non-null   int64  
 3   Income                1000 non-null   float64
 4   Risk Tolerance        1000 non-null   int32  
 5   Investment History    1000 non-null   object 
 6   Financial Objective   1000 non-null   int32  
 7   Preferred Sector      1000 non-null   int32  
 8   Investment Frequency  1000 non-null   int32  
 9   Recommended Domain    1000 non-null   int64  
dtypes: float64(1), int32(6), int64(2), object(1)
memory usage: 54.8+ KB
None


In [110]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import warnings
from imblearn.over_sampling import SMOTE


# Ignorer les warnings
warnings.filterwarnings("ignore")

# Séparation des variables explicatives (X) et de la cible (y)
X = data.drop(columns=['Recommended Domain'])  # Features
y = data['Recommended Domain']  # Target

# Appliquer SMOTE pour équilibrer les classes
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

print(f"Number of lines after balancing : {len(y_balanced)}")
print(f"Class distribution after SMOTE : {y_balanced.value_counts()}")

# Encoder la colonne "Historique d'Investissement"
encoder = LabelEncoder()
X['Investment History'] = encoder.fit_transform(X['Investment History'])

# Division en ensemble d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",  # Pondération automatique des classes
    random_state=42
)

rf_model.fit(X_train, y_train)

# Prédictions
y_pred = rf_model.predict(X_test)

# Évaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest - Precision : {accuracy * 100:.2f}%")


Number of lines after balancing : 1824
Class distribution after SMOTE : Recommended Domain
0    304
4    304
3    304
2    304
1    304
5    304
Name: count, dtype: int64


Random Forest - Precision : 84.00%


In [111]:
import pickle

# Enregistrer le modèle entraîné
with open("projetML.pkl", "wb") as f:
    pickle.dump(rf_model, f)

print("Model saved in 'projetML.pkl'")

Model saved in 'projetML.pkl'


In [112]:
print(y_train.value_counts())


Recommended Domain
2    241
3    225
4    167
0    150
5      9
1      8
Name: count, dtype: int64
