In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

In [2]:
# Charger le fichier CSV
credit_card_balance = pd.read_csv(r'C:\Users\jason\Desktop\Cours MS2D\Bejaoui\IA\Credit scoring project\credit_card_balance.csv')

In [3]:
# Afficher les premières lignes du DataFrame
print(credit_card_balance.head())

   SK_ID_PREV  SK_ID_CURR  MONTHS_BALANCE  AMT_BALANCE  \
0     2562384      378907              -6       56.970   
1     2582071      363914              -1    63975.555   
2     1740877      371185              -7    31815.225   
3     1389973      337855              -4   236572.110   
4     1891521      126868              -1   453919.455   

   AMT_CREDIT_LIMIT_ACTUAL  AMT_DRAWINGS_ATM_CURRENT  AMT_DRAWINGS_CURRENT  \
0                   135000                       0.0                 877.5   
1                    45000                    2250.0                2250.0   
2                   450000                       0.0                   0.0   
3                   225000                    2250.0                2250.0   
4                   450000                       0.0               11547.0   

   AMT_DRAWINGS_OTHER_CURRENT  AMT_DRAWINGS_POS_CURRENT  \
0                         0.0                     877.5   
1                         0.0                       0.0   
2    

In [4]:
# Sélectionner les colonnes pertinentes
selected_columns_credit_card_balance = [
    'SK_ID_PREV', 'SK_ID_CURR', 'MONTHS_BALANCE', 'AMT_BALANCE',
    'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_ATM_CURRENT',
    'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT',
    'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY'
]

credit_card_balance_selected = credit_card_balance[selected_columns_credit_card_balance].copy()

# Vérifier le nombre de lignes après la sélection des colonnes
print(f"Nombre de lignes après la sélection des colonnes : {len(credit_card_balance_selected)}")

# Afficher le nombre de valeurs manquantes par colonne
print(credit_card_balance_selected.isnull().sum())

Nombre de lignes après la sélection des colonnes : 3840312
SK_ID_PREV                         0
SK_ID_CURR                         0
MONTHS_BALANCE                     0
AMT_BALANCE                        0
AMT_CREDIT_LIMIT_ACTUAL            0
AMT_DRAWINGS_ATM_CURRENT      749816
AMT_DRAWINGS_CURRENT               0
AMT_DRAWINGS_OTHER_CURRENT    749816
AMT_DRAWINGS_POS_CURRENT      749816
AMT_INST_MIN_REGULARITY       305236
dtype: int64


In [5]:
# Traiter les valeurs manquantes en utilisant .loc
credit_card_balance_selected.loc[:, 'AMT_DRAWINGS_ATM_CURRENT'] = credit_card_balance_selected['AMT_DRAWINGS_ATM_CURRENT'].fillna(0)
credit_card_balance_selected.loc[:, 'AMT_DRAWINGS_OTHER_CURRENT'] = credit_card_balance_selected['AMT_DRAWINGS_OTHER_CURRENT'].fillna(0)
credit_card_balance_selected.loc[:, 'AMT_DRAWINGS_POS_CURRENT'] = credit_card_balance_selected['AMT_DRAWINGS_POS_CURRENT'].fillna(0)
credit_card_balance_selected.loc[:, 'AMT_INST_MIN_REGULARITY'] = credit_card_balance_selected['AMT_INST_MIN_REGULARITY'].fillna(credit_card_balance_selected['AMT_INST_MIN_REGULARITY'].median())

In [6]:
# 4. Transformation logarithmique pour les colonnes financières
columns_to_log_transform = [
    'AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_ATM_CURRENT', 
    'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_POS_CURRENT',
    'AMT_INST_MIN_REGULARITY'
]

for col in columns_to_log_transform:
    credit_card_balance_selected[col] = credit_card_balance_selected[col].apply(lambda x: np.log1p(x) if x > 0 else 0)


In [7]:
# 5. Imputation des valeurs aberrantes
numerical_cols_credit_card_balance = [
    'AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_ATM_CURRENT', 
    'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_POS_CURRENT',
    'AMT_INST_MIN_REGULARITY'
]

for col in numerical_cols_credit_card_balance:
    q_low = credit_card_balance_selected[col].quantile(0.01)
    q_high = credit_card_balance_selected[col].quantile(0.99)
    
    median_value = credit_card_balance_selected[col].median()
    credit_card_balance_selected[col] = credit_card_balance_selected[col].mask((credit_card_balance_selected[col] < q_low) | (credit_card_balance_selected[col] > q_high), median_value)

In [8]:
# 6. Réduction de la taille des données pour l'Isolation Forest
credit_card_balance_sampled = credit_card_balance_selected.sample(frac=0.1, random_state=42)

In [9]:
# 7. Application de l'Isolation Forest pour la détection des anomalies
iso = IsolationForest(contamination=0.01, random_state=42)
yhat = iso.fit_predict(credit_card_balance_sampled[numerical_cols_credit_card_balance])

credit_card_balance_sampled = credit_card_balance_sampled[yhat != -1]

In [10]:
# 8. Fusionner les résultats
credit_card_balance_selected = pd.concat([credit_card_balance_selected[~credit_card_balance_selected.index.isin(credit_card_balance_sampled.index)], credit_card_balance_sampled])

In [11]:
# Vérifier les types de données
print(credit_card_balance_selected.info())

<class 'pandas.core.frame.DataFrame'>
Index: 3840312 entries, 0 to 915116
Data columns (total 10 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   SK_ID_PREV                  int64  
 1   SK_ID_CURR                  int64  
 2   MONTHS_BALANCE              int64  
 3   AMT_BALANCE                 float64
 4   AMT_CREDIT_LIMIT_ACTUAL     float64
 5   AMT_DRAWINGS_ATM_CURRENT    float64
 6   AMT_DRAWINGS_CURRENT        float64
 7   AMT_DRAWINGS_OTHER_CURRENT  float64
 8   AMT_DRAWINGS_POS_CURRENT    float64
 9   AMT_INST_MIN_REGULARITY     float64
dtypes: float64(7), int64(3)
memory usage: 322.3 MB
None


In [12]:
# 9. Vérification finale et sauvegarde des données
print(credit_card_balance_selected.isnull().sum())

SK_ID_PREV                    0
SK_ID_CURR                    0
MONTHS_BALANCE                0
AMT_BALANCE                   0
AMT_CREDIT_LIMIT_ACTUAL       0
AMT_DRAWINGS_ATM_CURRENT      0
AMT_DRAWINGS_CURRENT          0
AMT_DRAWINGS_OTHER_CURRENT    0
AMT_DRAWINGS_POS_CURRENT      0
AMT_INST_MIN_REGULARITY       0
dtype: int64


In [13]:
print(f"Nombre de lignes après la sélection des colonnes : {len(credit_card_balance_selected)}")

Nombre de lignes après la sélection des colonnes : 3840312


In [14]:
# Sauvegarder les données nettoyées
credit_card_balance_selected.to_csv(r'C:\Users\jason\Desktop\Cours MS2D\Bejaoui\IA\Credit scoring project\credit_card_balance_clean.csv', index=False)