In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

In [2]:
# Charger le fichier CSV
pos_cash_balance = pd.read_csv(r'C:\Users\jason\Desktop\Cours MS2D\Bejaoui\IA\Credit scoring project\POS_CASH_balance.csv')

In [3]:
# Afficher les premières lignes du DataFrame
print(pos_cash_balance.head())

   SK_ID_PREV  SK_ID_CURR  MONTHS_BALANCE  CNT_INSTALMENT  \
0     1803195      182943             -31            48.0   
1     1715348      367990             -33            36.0   
2     1784872      397406             -32            12.0   
3     1903291      269225             -35            48.0   
4     2341044      334279             -35            36.0   

   CNT_INSTALMENT_FUTURE NAME_CONTRACT_STATUS  SK_DPD  SK_DPD_DEF  
0                   45.0               Active       0           0  
1                   35.0               Active       0           0  
2                    9.0               Active       0           0  
3                   42.0               Active       0           0  
4                   35.0               Active       0           0  


In [4]:
# Sélectionner les colonnes pertinentes
selected_columns_pos_cash_balance = [
    'SK_ID_PREV', 'SK_ID_CURR', 'MONTHS_BALANCE', 'CNT_INSTALMENT',
    'CNT_INSTALMENT_FUTURE', 'NAME_CONTRACT_STATUS', 'SK_DPD', 'SK_DPD_DEF'
]

pos_cash_balance_selected = pos_cash_balance[selected_columns_pos_cash_balance].copy()

# Nombre de lignes avant le filtrage des valeurs aberrantes
print(f"Nombre de lignes avant le filtrage des valeurs aberrantes : {len(pos_cash_balance_selected)}")

# Afficher le nombre de valeurs manquantes par colonne
print(pos_cash_balance_selected.isnull().sum())

Nombre de lignes avant le filtrage des valeurs aberrantes : 10001358
SK_ID_PREV                   0
SK_ID_CURR                   0
MONTHS_BALANCE               0
CNT_INSTALMENT           26071
CNT_INSTALMENT_FUTURE    26087
NAME_CONTRACT_STATUS         0
SK_DPD                       0
SK_DPD_DEF                   0
dtype: int64


In [5]:
# Traiter les valeurs manquantes en utilisant .loc
pos_cash_balance_selected.loc[:, 'CNT_INSTALMENT'] = pos_cash_balance_selected['CNT_INSTALMENT'].fillna(pos_cash_balance_selected['CNT_INSTALMENT'].median())
pos_cash_balance_selected.loc[:, 'CNT_INSTALMENT_FUTURE'] = pos_cash_balance_selected['CNT_INSTALMENT_FUTURE'].fillna(pos_cash_balance_selected['CNT_INSTALMENT_FUTURE'].median())

In [6]:
# Sélectionner les colonnes de type 'object' et obtenir le nombre de valeurs uniques
object_columns_unique_counts = pos_cash_balance_selected.select_dtypes('object').apply(pd.Series.nunique, axis=0)
print(object_columns_unique_counts)

# Afficher les valeurs uniques pour chaque colonne de type 'object'
for column in pos_cash_balance_selected.select_dtypes('object').columns:
    print(f"Valeurs uniques pour {column}:")
    print(pos_cash_balance_selected[column].unique())
    print()

NAME_CONTRACT_STATUS    9
dtype: int64
Valeurs uniques pour NAME_CONTRACT_STATUS:
['Active' 'Completed' 'Signed' 'Approved' 'Returned to the store' 'Demand'
 'Canceled' 'XNA' 'Amortized debt']



In [7]:
# 4. Corriger les valeurs de NAME_CONTRACT_STATUS
# Remplacer 'XNA' par la valeur la plus fréquente
most_frequent_status = pos_cash_balance_selected['NAME_CONTRACT_STATUS'].mode()[0]
pos_cash_balance_selected['NAME_CONTRACT_STATUS'] = pos_cash_balance_selected['NAME_CONTRACT_STATUS'].replace('XNA', most_frequent_status)

# Vérification des valeurs uniques après remplacement
object_columns_unique_counts_corrected = pos_cash_balance_selected.select_dtypes('object').apply(pd.Series.nunique, axis=0)
print(object_columns_unique_counts_corrected)

NAME_CONTRACT_STATUS    8
dtype: int64


In [8]:
# 5. Transformation logarithmique pour les colonnes numériques pertinentes
columns_to_log_transform = ['CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE', 'SK_DPD', 'SK_DPD_DEF']

for col in columns_to_log_transform:
    pos_cash_balance_selected[col] = pos_cash_balance_selected[col].apply(lambda x: np.log1p(x) if x > 0 else 0)


In [9]:
# 6. Imputation des valeurs aberrantes
numerical_cols_pos_cash_balance = ['CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE', 'SK_DPD', 'SK_DPD_DEF']

for col in numerical_cols_pos_cash_balance:
    q_low = pos_cash_balance_selected[col].quantile(0.01)
    q_high = pos_cash_balance_selected[col].quantile(0.99)
    
    median_value = pos_cash_balance_selected[col].median()
    pos_cash_balance_selected[col] = pos_cash_balance_selected[col].mask((pos_cash_balance_selected[col] < q_low) | (pos_cash_balance_selected[col] > q_high), median_value)

In [10]:
# 7. Réduction de la taille des données pour l'Isolation Forest
pos_cash_balance_sampled = pos_cash_balance_selected.sample(frac=0.1, random_state=42)

In [11]:
# 8. Application de l'Isolation Forest pour la détection des anomalies
iso = IsolationForest(contamination=0.01, random_state=42)
yhat = iso.fit_predict(pos_cash_balance_sampled[numerical_cols_pos_cash_balance])

pos_cash_balance_sampled = pos_cash_balance_sampled[yhat != -1]

In [12]:
# Vérifier les types de données
print(pos_cash_balance_selected.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001358 entries, 0 to 10001357
Data columns (total 8 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   SK_ID_PREV             int64  
 1   SK_ID_CURR             int64  
 2   MONTHS_BALANCE         int64  
 3   CNT_INSTALMENT         float64
 4   CNT_INSTALMENT_FUTURE  float64
 5   NAME_CONTRACT_STATUS   object 
 6   SK_DPD                 float64
 7   SK_DPD_DEF             float64
dtypes: float64(4), int64(3), object(1)
memory usage: 610.4+ MB
None


In [13]:
# 9. Fusionner les résultats
pos_cash_balance_selected = pd.concat([pos_cash_balance_selected[~pos_cash_balance_selected.index.isin(pos_cash_balance_sampled.index)], pos_cash_balance_sampled])

In [14]:
# Nombre de lignes après le filtrage des valeurs aberrantes
print(f"Nombre de lignes après le filtrage des valeurs aberrantes : {len(pos_cash_balance_selected)}")

Nombre de lignes après le filtrage des valeurs aberrantes : 10001358


In [15]:
# Vérifier que toutes les valeurs manquantes ont été traitées
print(pos_cash_balance_selected.isnull().sum())

SK_ID_PREV               0
SK_ID_CURR               0
MONTHS_BALANCE           0
CNT_INSTALMENT           0
CNT_INSTALMENT_FUTURE    0
NAME_CONTRACT_STATUS     0
SK_DPD                   0
SK_DPD_DEF               0
dtype: int64


In [16]:
# Sauvegarder les données nettoyées
pos_cash_balance_selected.to_csv(r'C:\Users\jason\Desktop\Cours MS2D\Bejaoui\IA\Credit scoring project\POS_CASH_balance_clean.csv', index=False)