In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

In [2]:
# Charger le fichier CSV
previous_application = pd.read_csv(r'C:\Users\jason\Desktop\Cours MS2D\Bejaoui\IA\Credit scoring project\previous_application.csv')

In [3]:
# Afficher les premières lignes du DataFrame
print(previous_application.head())

   SK_ID_PREV  SK_ID_CURR NAME_CONTRACT_TYPE  AMT_ANNUITY  AMT_APPLICATION  \
0     2030495      271877     Consumer loans     1730.430          17145.0   
1     2802425      108129         Cash loans    25188.615         607500.0   
2     2523466      122040         Cash loans    15060.735         112500.0   
3     2819243      176158         Cash loans    47041.335         450000.0   
4     1784265      202054         Cash loans    31924.395         337500.0   

   AMT_CREDIT  AMT_DOWN_PAYMENT  AMT_GOODS_PRICE WEEKDAY_APPR_PROCESS_START  \
0     17145.0               0.0          17145.0                   SATURDAY   
1    679671.0               NaN         607500.0                   THURSDAY   
2    136444.5               NaN         112500.0                    TUESDAY   
3    470790.0               NaN         450000.0                     MONDAY   
4    404055.0               NaN         337500.0                   THURSDAY   

   HOUR_APPR_PROCESS_START  ... NAME_SELLER_INDUSTRY  CN

In [4]:
# Sélectionner les colonnes pertinentes
selected_columns_previous_application = [
    'SK_ID_PREV', 'SK_ID_CURR', 'NAME_CONTRACT_TYPE', 'AMT_ANNUITY',
    'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE',
    'WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START',
    'FLAG_LAST_APPL_PER_CONTRACT', 'NFLAG_LAST_APPL_IN_DAY',
    'RATE_DOWN_PAYMENT', 'RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED'
]

previous_application_selected = previous_application[selected_columns_previous_application].copy()

# Afficher le nombre de valeurs manquantes par colonne
print(previous_application_selected.isnull().sum())

SK_ID_PREV                           0
SK_ID_CURR                           0
NAME_CONTRACT_TYPE                   0
AMT_ANNUITY                     372235
AMT_APPLICATION                      0
AMT_CREDIT                           1
AMT_DOWN_PAYMENT                895844
AMT_GOODS_PRICE                 385515
WEEKDAY_APPR_PROCESS_START           0
HOUR_APPR_PROCESS_START              0
FLAG_LAST_APPL_PER_CONTRACT          0
NFLAG_LAST_APPL_IN_DAY               0
RATE_DOWN_PAYMENT               895844
RATE_INTEREST_PRIMARY          1664263
RATE_INTEREST_PRIVILEGED       1664263
dtype: int64


In [5]:
# Nombre de lignes avant le filtrage
print(f"Nombre de lignes avant le filtrage des valeurs aberrantes : {len(previous_application_selected)}")

Nombre de lignes avant le filtrage des valeurs aberrantes : 1670214


In [8]:
# Traiter les valeurs manquantes
# Remplacer les valeurs manquantes dans les colonnes numériques par la médiane ou des valeurs spécifiques
previous_application_selected.loc[:, 'AMT_ANNUITY'] = previous_application_selected['AMT_ANNUITY'].fillna(previous_application_selected['AMT_ANNUITY'].median())
previous_application_selected.loc[:, 'AMT_CREDIT'] = previous_application_selected['AMT_CREDIT'].fillna(previous_application_selected['AMT_CREDIT'].median())
previous_application_selected.loc[:, 'AMT_DOWN_PAYMENT'] = previous_application_selected['AMT_DOWN_PAYMENT'].fillna(0)
previous_application_selected.loc[:, 'AMT_GOODS_PRICE'] = previous_application_selected['AMT_GOODS_PRICE'].fillna(previous_application_selected['AMT_GOODS_PRICE'].median())
previous_application_selected.loc[:, 'RATE_DOWN_PAYMENT'] = previous_application_selected['RATE_DOWN_PAYMENT'].fillna(0)
previous_application_selected.loc[:, 'RATE_INTEREST_PRIMARY'] = previous_application_selected['RATE_INTEREST_PRIMARY'].fillna(previous_application_selected['RATE_INTEREST_PRIMARY'].median())
previous_application_selected.loc[:, 'RATE_INTEREST_PRIVILEGED'] = previous_application_selected['RATE_INTEREST_PRIVILEGED'].fillna(previous_application_selected['RATE_INTEREST_PRIVILEGED'].median())

In [9]:
# Analyser les colonnes de type 'object'
# Compter le nombre de valeurs uniques pour chaque colonne de type 'object'
object_columns_unique_counts = previous_application_selected.select_dtypes('object').apply(pd.Series.nunique, axis=0)
print(object_columns_unique_counts)

# Afficher les valeurs uniques pour chaque colonne de type 'object'
for column in previous_application_selected.select_dtypes('object').columns:
    print(f"Valeurs uniques pour {column}:")
    print(previous_application_selected[column].unique())
    print()

NAME_CONTRACT_TYPE             4
WEEKDAY_APPR_PROCESS_START     7
FLAG_LAST_APPL_PER_CONTRACT    2
dtype: int64
Valeurs uniques pour NAME_CONTRACT_TYPE:
['Consumer loans' 'Cash loans' 'Revolving loans' 'XNA']

Valeurs uniques pour WEEKDAY_APPR_PROCESS_START:
['SATURDAY' 'THURSDAY' 'TUESDAY' 'MONDAY' 'FRIDAY' 'SUNDAY' 'WEDNESDAY']

Valeurs uniques pour FLAG_LAST_APPL_PER_CONTRACT:
['Y' 'N']



In [10]:
# 3. Correction de 'NAME_CONTRACT_TYPE'
# Remplacer 'XNA' par la valeur majoritaire
majority_value = previous_application_selected['NAME_CONTRACT_TYPE'].mode()[0]
previous_application_selected['NAME_CONTRACT_TYPE'].replace('XNA', majority_value, inplace=True)

In [11]:
# Vérifier les valeurs uniques après correction
object_columns_unique_counts_corrected = previous_application_selected.select_dtypes('object').apply(pd.Series.nunique, axis=0)
print(object_columns_unique_counts_corrected)

NAME_CONTRACT_TYPE             3
WEEKDAY_APPR_PROCESS_START     7
FLAG_LAST_APPL_PER_CONTRACT    2
dtype: int64


In [12]:
# 5. Transformation logarithmique pour les colonnes financières
columns_to_log_transform = [
    'AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 
    'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE', 'RATE_DOWN_PAYMENT'
]

for col in columns_to_log_transform:
    previous_application_selected[col] = previous_application_selected[col].apply(lambda x: np.log1p(x) if x > 0 else 0)

In [13]:
# 6. Imputation des valeurs aberrantes
numerical_cols_previous_application = [
    'AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 
    'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE', 'RATE_DOWN_PAYMENT',
    'RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED'
]

for col in numerical_cols_previous_application:
    q_low = previous_application_selected[col].quantile(0.01)
    q_high = previous_application_selected[col].quantile(0.99)
    
    median_value = previous_application_selected[col].median()
    previous_application_selected[col] = previous_application_selected[col].mask((previous_application_selected[col] < q_low) | (previous_application_selected[col] > q_high), median_value)

In [14]:
# 7. Réduction de la taille des données pour l'Isolation Forest
previous_application_sampled = previous_application_selected.sample(frac=0.1, random_state=42)

In [15]:
# 8. Application de l'Isolation Forest pour la détection des anomalies
iso = IsolationForest(contamination=0.01, random_state=42)
yhat = iso.fit_predict(previous_application_sampled[numerical_cols_previous_application])

previous_application_sampled = previous_application_sampled[yhat != -1]

In [16]:
# Vérifier les types de données
print(previous_application_selected.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1670214 entries, 0 to 1670213
Data columns (total 15 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   SK_ID_PREV                   1670214 non-null  int64  
 1   SK_ID_CURR                   1670214 non-null  int64  
 2   NAME_CONTRACT_TYPE           1670214 non-null  object 
 3   AMT_ANNUITY                  1670214 non-null  float64
 4   AMT_APPLICATION              1670214 non-null  float64
 5   AMT_CREDIT                   1670214 non-null  float64
 6   AMT_DOWN_PAYMENT             1670214 non-null  float64
 7   AMT_GOODS_PRICE              1670214 non-null  float64
 8   WEEKDAY_APPR_PROCESS_START   1670214 non-null  object 
 9   HOUR_APPR_PROCESS_START      1670214 non-null  int64  
 10  FLAG_LAST_APPL_PER_CONTRACT  1670214 non-null  object 
 11  NFLAG_LAST_APPL_IN_DAY       1670214 non-null  int64  
 12  RATE_DOWN_PAYMENT            1670214 non-n

In [17]:
# 9. Fusionner les résultats
previous_application_selected = pd.concat([previous_application_selected[~previous_application_selected.index.isin(previous_application_sampled.index)], previous_application_sampled])

In [18]:
# 10. Vérification finale et sauvegarde des données
print(previous_application_selected.isnull().sum())

SK_ID_PREV                     0
SK_ID_CURR                     0
NAME_CONTRACT_TYPE             0
AMT_ANNUITY                    0
AMT_APPLICATION                0
AMT_CREDIT                     0
AMT_DOWN_PAYMENT               0
AMT_GOODS_PRICE                0
WEEKDAY_APPR_PROCESS_START     0
HOUR_APPR_PROCESS_START        0
FLAG_LAST_APPL_PER_CONTRACT    0
NFLAG_LAST_APPL_IN_DAY         0
RATE_DOWN_PAYMENT              0
RATE_INTEREST_PRIMARY          0
RATE_INTEREST_PRIVILEGED       0
dtype: int64


In [19]:
# Nombre de lignes après le filtrage
print(f"Nombre de lignes après le filtrage des valeurs aberrantes : {len(previous_application_selected)}")

Nombre de lignes après le filtrage des valeurs aberrantes : 1670214


In [20]:
# Sauvegarder les données nettoyées
previous_application_selected.to_csv(r'C:\Users\jason\Desktop\Cours MS2D\Bejaoui\IA\Credit scoring project\previous_application_clean.csv', index=False)