In [5]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from joblib import dump, load
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier
import numpy as np

df = pd.read_csv('dataset_test1.csv')

#separation target et variables indépendante
X = df.drop('MIS_Status', axis=1)
y = df['MIS_Status']


cat_vars = X.select_dtypes(include=['object']).columns.tolist() + ['NewExist'] + ['UrbanRural'] + ['FranchiseBinary']
num_vars = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
num_vars.remove('NewExist')  
num_vars.remove('UrbanRural') 
num_vars.remove('FranchiseBinary') 


#numerical pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('scaler', StandardScaler())
])
#categorical pipeline
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='if_binary')),
])

#preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_vars),
        ('cat', categorical_transformer, cat_vars),
    ])


cat_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(max_depth=10, random_state=42))
])
#CatBoostClassifier(max_depth=10, random_state=42)
#XGBClassifier(n_estimators = 100, random_state=42, n_jobs=-1,max_depth=16, min_child_weight=14)

# separation des données en ensembles d'entraînement et de test, stratifier sur y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, test_size=0.05, random_state=42, stratify=y)

#fit sur l'ensemble d'entrainement
cat_pipeline.fit(X_train, y_train)

#obtenir le nombre de caractéristiques transformées
feature_names = cat_pipeline.named_steps['preprocessor'].get_feature_names_out()
print(f"Nombre de caractéristiques après le prétraitement : {len(feature_names)}")

#faire les predictions sur x test
y_pred = cat_pipeline.predict(X_test)

#scoring
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(accuracy)
print(report)


Learning rate set to 0.166372
0:	learn: 0.5075791	total: 181ms	remaining: 3m 1s
1:	learn: 0.4212339	total: 288ms	remaining: 2m 23s
2:	learn: 0.3584946	total: 418ms	remaining: 2m 18s
3:	learn: 0.3237095	total: 524ms	remaining: 2m 10s
4:	learn: 0.2991734	total: 636ms	remaining: 2m 6s
5:	learn: 0.2793798	total: 753ms	remaining: 2m 4s
6:	learn: 0.2579336	total: 877ms	remaining: 2m 4s
7:	learn: 0.2496270	total: 982ms	remaining: 2m 1s
8:	learn: 0.2395126	total: 1.1s	remaining: 2m 1s
9:	learn: 0.2281199	total: 1.21s	remaining: 2m
10:	learn: 0.2233658	total: 1.31s	remaining: 1m 58s
11:	learn: 0.2159745	total: 1.43s	remaining: 1m 58s
12:	learn: 0.2120344	total: 1.54s	remaining: 1m 56s
13:	learn: 0.2084827	total: 1.65s	remaining: 1m 56s
14:	learn: 0.2053736	total: 1.76s	remaining: 1m 55s
15:	learn: 0.2030110	total: 1.88s	remaining: 1m 55s
16:	learn: 0.1991830	total: 1.99s	remaining: 1m 55s
17:	learn: 0.1968052	total: 2.1s	remaining: 1m 54s
18:	learn: 0.1943829	total: 2.23s	remaining: 1m 54s
19:	

In [8]:
# Exporter le pipeline entraîné en tant que fichier .pkl
model_path = 'model.pkl'
dump(cat_pipeline, model_path)

['model.pkl']

In [9]:
#charger le modele
model = load('model.pkl')

In [10]:
model.score(X_test, y_test)

0.95393014943855

In [11]:
#la méthode predict est conçue pour faire plusieurs predictions en meme temps si on fait X_test[0] ça marche pas
model.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

In [12]:
X_test[:10] #dix premieres lignes

Unnamed: 0,State,Zip,BankState,NAICS,Term,NoEmp,NewExist,CreateJob,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,GrAppv,SBA_Appv,Industry,FranchiseBinary
299668,CO,80012.0,SD,236115.0,267,15,1,0,0,1,1,N,N,1060000.0,749950.0,Construction,0
695773,CA,94080.0,SD,621210.0,120,3,2,0,0,1,0,,N,300000.0,225000.0,Healthcare/Social_assist,0
611574,PA,16056.0,PA,237990.0,12,3,1,4,3,1,1,Y,N,100000.0,85000.0,Construction,0
408315,IA,50320.0,SD,451220.0,60,7,2,0,0,1,0,N,N,120000.0,108000.0,Retail_trade,0
350630,IA,50021.0,IA,722110.0,84,7,1,0,0,1,1,,Y,100000.0,85000.0,Accom/Food_serv,0
653797,CA,92345.0,CA,811121.0,120,10,1,0,0,1,0,N,N,40000.0,30000.0,Other_no_pub,0
495226,MD,21401.0,MD,454311.0,1,5,1,0,0,1,2,N,N,175000.0,131250.0,Retail_trade,0
547970,MD,21206.0,CA,722110.0,84,2,1,5,2,1,0,,N,6000.0,5100.0,Accom/Food_serv,0
485649,AL,36104.0,AL,541330.0,120,11,1,0,0,1,0,N,N,200000.0,164000.0,Prof/Science/Tech,0
517410,MN,56441.0,MN,624410.0,120,24,1,0,0,1,1,,Y,88000.0,74800.0,Healthcare/Social_assist,0


In [15]:
def load_model(path='model.pkl'):
    model = load(path)
    return model

model = load_model()
model.score(X_test, y_test)

0.95393014943855

In [14]:
def prediction(model, data):
    predictions = model.predict(data)
    return predictions

prediction(model, X_test[0:1])

array([0])