In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm



from scipy import stats


from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV,cross_validate
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer,make_column_selector
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier


from statsmodels.formula.api import ols
from statsmodels.api import OLS
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif



### Importation CSV et création du dataframe

In [36]:
df = pd.read_csv("data.csv")
df


Unnamed: 0,State,Industry,Real_estate,MIS_Status,Portion
0,IN,Retail trade,0,0,0.80
1,IN,Accommodation and food services,0,0,0.80
2,IN,Health care and social assistance,0,0,0.75
3,OK,Manufacturing,0,0,0.80
4,FL,Transportation and warehousing,1,0,1.00
...,...,...,...,...,...
896116,OH,Retail trade,0,0,0.80
896117,OH,Retail trade,0,0,0.50
896118,CA,Manufacturing,0,0,0.75
896119,HI,Manufacturing,0,1,0.80


### Création de la matrice X (variables explicatives) et de la série y (variable cible)

In [53]:
X = df.drop(['MIS_Status','Portion'], axis=1)
y = df['MIS_Status']
X_train, X_test, y_train, y_test = train_test_split(X, y,shuffle=True, train_size=0.8, random_state=42)

# Score du dummy classifier
dummy = DummyClassifier()
dummy.fit(X_train, y_train)
dummy.score(X_test, y_test)



0.8250411493932208

### Création d'un pipeline

In [54]:
transformer = make_column_transformer(
    (OneHotEncoder(),make_column_selector(dtype_exclude=np.number)),
    (StandardScaler(),make_column_selector(dtype_include=np.number))
    )

In [64]:
a=make_pipeline(transformer).fit_transform(X_train)
a
a_columns= transformer.get_feature_names_out()
adf= pd.DataFrame(a)#.set_index(y_train.index)
# adf.columns = a_columns
adf


<716896x71 sparse matrix of type '<class 'numpy.float64'>'
	with 2150688 stored elements in Compressed Sparse Row format>

### Création d'un  RandomForest

In [65]:
forest= make_pipeline(
    transformer,
    RandomForestClassifier(
        max_depth=5,
        n_estimators=100,
    )
)

# Entrainement de notre modèle en utilisant les paramètres par défaut 
rf=RandomForestClassifier()
rf.fit(adf, y_train)

#Utilisation d'un .features_importances_ pour déterminer quelle variable est importante pour la prédiction
rf.features_importances_

# Score de notre modèle après la cross_validation
cross_val=cross_validate(forest, X_train, y_train,scoring='f1',cv=5)
cross_val['test_score'].mean()

ValueError: setting an array element with a sequence.

### Création d'une LogisticRegression

In [39]:
lo_reg = make_pipeline(
    transformer,
    LogisticRegression(
        penalty='l2',
        solver='newton-cholesky',
        class_weight='balanced',
        multi_class='ovr'
        )
    )

# Entrainement de notre modèle en utilisant les paramètres par défaut 
lo_reg.fit(X_train,y_train)

# Score de notre modèle après la cross_validation
cross_val=cross_validate(lo_reg, X_train, y_train,scoring='f1',cv=5)
cross_val['test_score'].mean()

### Utilisation d'un RandomSearch pour obtenir une idée de l'ordre de grandeur des paramètres

In [41]:
#Pour trouver le nom des paramètres
lo_reg.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7f7a7d0e3220>),
                                   ('standardscaler', StandardScaler(),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7f7a7d0e3f70>)])),
  ('logisticregression',
   LogisticRegression(class_weight='balanced', multi_class='ovr',
                      solver='newton-cholesky'))],
 'verbose': False,
 'columntransformer': ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f7a7d0e3220>),
                                 ('standardscaler', StandardScaler(),
                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f7a7d0e3f70>)]),

In [42]:
param_rand = {'logisticregression__C':np.linspace(0,5,100),
              'logisticregression__max_iter':np.arange(0,201,2),
                }
                
random_search = RandomizedSearchCV(lo_reg, param_distributions=param_rand, n_iter=100, cv=5, random_state=42)

random_search.fit(X_train, y_train)
print(random_search.best_params_)

5 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/home/anas/miniconda3/envs/datascience_env/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/anas/miniconda3/envs/datascience_env/lib/python3.9/site-packages/sklearn/pipeline.py", line 406, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/anas/miniconda3/envs/datascience_env/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1160, in fit
    self._validate_params()
  File "/home/anas/miniconda3/envs/datascience_en

{'logisticregression__max_iter': 176, 'logisticregression__C': 0.15151515151515152}


### Utilisation d'un GridSearch pour cibler les meilleurs paramètres

In [43]:
params = {'logisticregression__C':[0.05],
          'logisticregression__max_iter':np.arange(170,180,1),
            }

grid=GridSearchCV(lo_reg, param_grid=params, scoring='f1', cv=5)
grid.fit(X_train, y_train)

In [44]:
grid.best_score_

0.37606272607064106