In [69]:
import json
from io import StringIO

import pandas as pd
import requests
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Base URL prepended to every API route requested in this notebook
# (a server on localhost, port 5000 — it must already be running).
API_URL = "http://127.0.0.1:5000/api"

In [43]:
# Smoke test: call the API's /test route and print the raw reply.
# The timeout (seconds) stops the notebook from hanging forever when the
# server is not reachable; raise_for_status turns an HTTP error into an exception.
response = requests.get(API_URL + "/test", timeout = 10)
response.raise_for_status()
print(response.text)

API fonctionnelle


In [44]:
# Load the preprocessed test set and serialise its first 10 rows as JSON records
test_data = pd.read_csv("csv/preprocessed/app_test.csv")
json_data = test_data.head(10).to_json(orient = "records")

# Send the sample to the /dimensions route.
# The timeout (seconds) avoids blocking the kernel indefinitely, and
# raise_for_status makes a failed call explicit instead of parsing an error body.
response = requests.post(API_URL + "/dimensions", json = {"data" : json_data}, timeout = 10)
response.raise_for_status()
response.json()

{'dimensions': [10, 772]}

In [49]:
# Payload for the /extract route: the JSON sample built above plus an "index" value
request_data = {
    "data": json_data,
    "index": 5
}

# The timeout (seconds) avoids blocking the kernel indefinitely, and
# raise_for_status makes a failed call explicit instead of parsing an error body.
response = requests.post(API_URL + "/extract", json = request_data, timeout = 10)
response.raise_for_status()

# The decoded reply is itself a JSON string; it is wrapped in StringIO because
# passing literal JSON text straight to pd.read_json is deprecated.
pd.read_json(StringIO(response.json()))

Unnamed: 0,SK_ID_CURR,TARGET
0,309296,0
1,316121,0
2,220130,0
3,254432,0
4,345565,0
5,125782,0
6,433605,0
7,268903,0
8,284617,0
9,131463,1


In [70]:
def preprocess_data(df_train, df_test, selected_features) :
    """Build the train/test feature matrices and companion series.

    Every transformation is fitted on the training set only and then applied
    unchanged to the test set: uninformative columns are dropped, missing
    values are replaced by the column median, and features are rescaled with
    ``MinMaxScaler``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Must contain every column listed in ``selected_features`` as well as
        the ``TARGET`` and ``AMT_CREDIT`` columns.
    selected_features : list of str
        Names of the columns used as model inputs.

    Returns
    -------
    dict
        ``X_train`` / ``X_test`` (processed DataFrames), ``y_train`` /
        ``y_test`` (copies of ``TARGET``) and ``amt_credit_train`` /
        ``amt_credit_test`` (copies of ``AMT_CREDIT``). The feature frames
        keep the row index of their source frame so they stay aligned with
        the target and credit-amount series.
    """

    # Extract the features, targets and credit amounts as independent copies
    data = {
        "X_train" : df_train[selected_features].copy(),
        "X_test" : df_test[selected_features].copy(),
        "y_train" : df_train["TARGET"].copy(),
        "y_test" : df_test["TARGET"].copy(),
        "amt_credit_train" : df_train["AMT_CREDIT"].copy(),
        "amt_credit_test" : df_test["AMT_CREDIT"].copy()
    }

    # Drop columns that carry no information in the training set.
    # nunique() ignores NaN, so "<= 1" also catches columns that are entirely
    # missing; by default the median imputer discards such columns, which
    # would make the column re-labelling below fail on a shape mismatch.
    columns_to_drop = data["X_train"].columns[data["X_train"].nunique() <= 1]
    X_train_cleaned = data["X_train"].drop(columns = columns_to_drop)
    X_test_cleaned = data["X_test"].drop(columns = columns_to_drop)

    # Impute missing values with the median learnt on the training set
    imputer = SimpleImputer(strategy = "median")
    X_train_imputed = imputer.fit_transform(X_train_cleaned)
    X_test_imputed = imputer.transform(X_test_cleaned)

    # Rescale the features with the min/max learnt on the training set
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train_imputed)
    X_test_scaled = scaler.transform(X_test_imputed)

    # Restore column names AND the original row index: the transformers return
    # bare arrays, and a fresh 0..n-1 index would no longer match y_* and
    # amt_credit_* whenever the input frames do not use a default index.
    data["X_train"] = pd.DataFrame(
        X_train_scaled,
        columns = X_train_cleaned.columns,
        index = X_train_cleaned.index
    )
    data["X_test"] = pd.DataFrame(
        X_test_scaled,
        columns = X_test_cleaned.columns,
        index = X_test_cleaned.index
    )

    return data

In [71]:
def balance_data(processed_data, strategy = None) :
    """Return a copy of ``processed_data`` with a re-balanced training set.

    Parameters
    ----------
    processed_data : dict
        Mapping of names to pandas objects; must provide ``X_train``,
        ``y_train`` and ``amt_credit_train``. It is never modified.
    strategy : {"over", "under", "smote"} or None
        ``"over"`` uses ``RandomOverSampler``, ``"under"`` uses
        ``RandomUnderSampler`` and ``"smote"`` uses ``SMOTE`` (all seeded
        with ``random_state = 42``). Any other value disables balancing.

    Returns
    -------
    dict
        The copied entries plus ``X_train_balanced`` and ``y_train_balanced``.
        When a sampler is applied, ``amt_credit_train`` is replaced by the
        credit amounts of the resampled rows.
    """

    # Work on copies so that the caller's objects are left untouched
    data = {name : df.copy() for name, df in processed_data.items()}

    # Pick the sampler matching the requested balancing strategy
    if strategy == "over" :
        sampler = RandomOverSampler(random_state = 42)
    elif strategy == "under" :
        sampler = RandomUnderSampler(random_state = 42)
    elif strategy == "smote" :
        sampler = SMOTE(random_state = 42)
    else :
        # No balancing: the "balanced" entries simply reference the originals
        data["X_train_balanced"] = data["X_train"]
        data["y_train_balanced"] = data["y_train"]
        return data

    # Attach the credit amounts to a scratch copy of the features so they go
    # through the same resampling. The values are assigned by position
    # (to_numpy) rather than by index label, so frames whose indexes differ
    # cannot produce NaN or mismatched rows; using a scratch copy means
    # data["X_train"] never carries the helper column, even if the sampler fails.
    # NOTE(review): with "smote" the helper column is part of the matrix handed
    # to the sampler — confirm this is intended.
    X_with_credit = data["X_train"].copy()
    X_with_credit["temp"] = data["amt_credit_train"].to_numpy()

    # Apply the sampler
    X_balanced, y_balanced = sampler.fit_resample(X_with_credit, data["y_train"])

    # Split the resampled credit amounts back out of the feature matrix
    data["amt_credit_train"] = X_balanced.pop("temp")
    data["X_train_balanced"] = X_balanced
    data["y_train_balanced"] = y_balanced

    return data

In [72]:
# Read the preprocessed training set, then display its (rows, columns) shape
train_data = pd.read_csv(filepath_or_buffer = "csv/preprocessed/app_train.csv")
train_data.shape

(246005, 772)

In [73]:
# Read the feature names, one per line, from the text file
with open("selected_features.txt", "r") as file :
    lines = file.readlines()

# Strip surrounding whitespace and skip blank lines: an empty name would
# otherwise end up in the list and fail later when used to select columns.
selected_features = [name for name in (line.strip() for line in lines) if name]

In [74]:
# Preprocess both sets, then oversample the training set
processed_data = preprocess_data(
    df_train = train_data,
    df_test = test_data,
    selected_features = selected_features,
)
balanced_data = balance_data(processed_data = processed_data, strategy = "over")

In [75]:
# LightGBM settings gathered in one dictionary, then unpacked into the classifier
lgbm_params = dict(
    boosting_type = "gbdt",
    objective = "binary",
    num_leaves = 30,
    learning_rate = 0.05,
    feature_fraction = 0.9,
    bagging_fraction = 0.8,
    bagging_freq = 5,
    n_estimators = 100,
    verbose = -1,
    random_state = 42,
)

lgbm_clf = LGBMClassifier(**lgbm_params)

NameError: name 'LGBMClassifier' is not defined

In [None]:
# Train the classifier on the balanced training features and targets
lgbm_clf.fit(
    X = balanced_data["X_train_balanced"],
    y = balanced_data["y_train_balanced"],
)

In [None]:
# Score the test set with the model trained above (`lgbm_clf`); the previous
# name `classifier` is not defined anywhere in this notebook.
# Column 1 of predict_proba is kept as the predicted probability.
proba_preds = lgbm_clf.predict_proba(balanced_data["X_test"])[:, 1]

# Turn probabilities into 0/1 labels with a 0.5 decision threshold
binary_preds = (proba_preds >= 0.5).astype(int)