In [1]:
# A l'issue du NB 3, notre dataset compte 49 variables. Toutes ne peuvent pas être traitées et présentées en DataViz.
# Objectif de ce notebook 4 : extraire les variables les plus significatives que nous présenterons en Dataviz ().
# Méthode suggérée par Alban : faire un remplacement de Nan par la médiane pour toutes les variables numériques 
# et par le mode pour toutes les variables catégorielles.
# Puis après prétraitement (dichotomisation, standardisation...), lancer des modèles de classification 
# et extraire les variables les plus importantes.
# A l'issue de ce notebook, on note que les modèles ne sont par performants.
# On décide de retravailler chaque variable une par une, en même temps que nous faisons la DataViz (voir NB suivant)


In [2]:
import pandas as pd
import numpy as np


In [3]:
df_selec=pd.read_csv("df_selec.csv", index_col="idmutation")

In [4]:
df_selec.head()

Unnamed: 0_level_0,y,idnatmut,libnatmut,vefa,valeurfonc,nblot,nbcomm,l_codinsee,nblocapt,nblocdep,...,georad2020_alea,igntop202103_bat_hauteur,mcumer202007_is_mer,adedpe202006_logtype_s_hab,baie_orientation_sud,baie_orientation_nord,baie_orientation_ouest,baie_orientation_est,baie_orientation_autre,Commerces_bruyants
idmutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7819809,0,1,Vente,f,370000.0,1,1,75101,1,1,...,Faible,20.2,1.0,75.0,1,0,0,0,0,1
7819819,0,1,Vente,f,682000.0,2,1,75111,1,1,...,Faible,35.0,1.0,24.0,0,0,1,0,0,1
7819821,0,1,Vente,f,451000.0,1,1,75117,1,0,...,Faible,18.8,1.0,36.0,0,1,0,0,0,0
7819822,0,1,Vente,f,433000.0,3,1,75115,1,1,...,Faible,18.0,1.0,49.0,0,1,0,0,0,0
7819824,1,1,Vente,f,561076.0,1,1,75118,1,0,...,Faible,18.9,1.0,100.0,0,1,0,0,0,1


In [5]:
df_selec.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114072 entries, 7819809 to 8843184
Data columns (total 48 columns):
 #   Column                                       Non-Null Count   Dtype  
---  ------                                       --------------   -----  
 0   y                                            114072 non-null  int64  
 1   idnatmut                                     114072 non-null  int64  
 2   libnatmut                                    114072 non-null  object 
 3   vefa                                         114072 non-null  object 
 4   valeurfonc                                   114033 non-null  float64
 5   nblot                                        114072 non-null  int64  
 6   nbcomm                                       114072 non-null  int64  
 7   l_codinsee                                   114072 non-null  int64  
 8   nblocapt                                     114072 non-null  int64  
 9   nblocdep                                     114072 

In [6]:
# Création du train test split

In [7]:
from sklearn.model_selection import train_test_split


In [8]:
# Variables explicatives  / variable cible

X=df_selec.drop("y", axis=1)
y=df_selec["y"]

X.shape, y.shape

((114072, 47), (114072,))

In [9]:
numeric_columns=X.select_dtypes(include=np.number).columns.tolist()
numeric_columns

['idnatmut',
 'valeurfonc',
 'nblot',
 'nbcomm',
 'l_codinsee',
 'nblocapt',
 'nblocdep',
 'nbapt2pp',
 'nbapt3pp',
 'sbati',
 'sbatapt',
 'sbatact',
 'sapt2pp',
 'sapt3pp',
 'scarrez',
 'anarnc202012_nb_log',
 'anarnc202012_nb_lot_garpark',
 'anarnc202012_nb_lot_tertiaire',
 'anarnc202012_nb_lot_tot',
 'adedpe202006_logtype_enr',
 'adedpe202006_logtype_perc_surf_vitree_ext',
 'adedpe202006_logtype_presence_balcon',
 'adedpe202006_logtype_presence_climatisation',
 'ancqpv201410_is_qpv',
 'cerffo2020_nb_log',
 'igntop202103_bat_hauteur',
 'mcumer202007_is_mer',
 'adedpe202006_logtype_s_hab',
 'baie_orientation_sud',
 'baie_orientation_nord',
 'baie_orientation_ouest',
 'baie_orientation_est',
 'baie_orientation_autre',
 'Commerces_bruyants']

In [10]:
object_columns=X.select_dtypes(include='object').columns.tolist()
object_columns

['libnatmut',
 'vefa',
 'adedpe202006_logtype_baie_type_vitrage',
 'adedpe202006_logtype_ch_gen_lib_princ',
 'adedpe202006_logtype_classe_conso_ener',
 'adedpe202006_logtype_classe_estim_ges',
 'adedpe202006_logtype_traversant',
 'adedpe202006_logtype_ecs_type_ener',
 'adedpe202006_logtype_ph_pos_isol',
 'adedpe202006_logtype_type_batiment',
 'cerffo2020_annee_construction',
 'cerffo2020_usage_niveau_1_txt',
 'georad2020_alea']

In [11]:
y.value_counts()

0    92899
1    21173
Name: y, dtype: int64

In [12]:
X_train, X_test, y_train, y_test=train_test_split(X,y, test_size=0.25, random_state=10)
X_train.shape, X_test.shape

((85554, 47), (28518, 47))

In [13]:
#Traitement des nan

for col in numeric_columns:
    X_train[col].fillna(X_train[col].median(), inplace=True)
    X_test[col].fillna(X_train[col].median(), inplace=True)

for col in object_columns:
    X_train[col].fillna(X_train[col].mode()[0], inplace=True)
    X_test[col].fillna(X_train[col].mode()[0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [14]:
X_train.isna().sum()

idnatmut                                       0
libnatmut                                      0
vefa                                           0
valeurfonc                                     0
nblot                                          0
nbcomm                                         0
l_codinsee                                     0
nblocapt                                       0
nblocdep                                       0
nbapt2pp                                       0
nbapt3pp                                       0
sbati                                          0
sbatapt                                        0
sbatact                                        0
sapt2pp                                        0
sapt3pp                                        0
scarrez                                        0
anarnc202012_nb_log                            0
anarnc202012_nb_lot_garpark                    0
anarnc202012_nb_lot_tertiaire                  0
anarnc202012_nb_lot_

In [15]:
X_test.isna().sum()

idnatmut                                       0
libnatmut                                      0
vefa                                           0
valeurfonc                                     0
nblot                                          0
nbcomm                                         0
l_codinsee                                     0
nblocapt                                       0
nblocdep                                       0
nbapt2pp                                       0
nbapt3pp                                       0
sbati                                          0
sbatapt                                        0
sbatact                                        0
sapt2pp                                        0
sapt3pp                                        0
scarrez                                        0
anarnc202012_nb_log                            0
anarnc202012_nb_lot_garpark                    0
anarnc202012_nb_lot_tertiaire                  0
anarnc202012_nb_lot_

In [16]:
X_train.shape, X_test.shape

((85554, 47), (28518, 47))

In [17]:
X_train=pd.get_dummies(X_train)


In [18]:
X_train.shape

(85554, 113)

In [19]:
X_test=pd.get_dummies(X_test)

In [20]:
X_test.shape

(28518, 113)

In [21]:
# Différence de nombre de colonnes entre X_train et X_test => Pourquoi ? 
# J'ai gardé ces lignes de codes après retraitement des variables pour expliquer raisonnement

In [22]:
X_train_list=X_train.columns.tolist()

In [23]:
X_test_list=X_test.columns.tolist()

In [24]:
difference_train = set(X_train_list).difference(set(X_test_list))
difference_test = set(X_test_list).difference(set(X_train_list))
list_difference = list(difference_train.union(difference_test))
print(list_difference)



[]


In [25]:
# Normaliser les variables quantitatives

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])


In [26]:
#LOGISTIC REGRESSION

from sklearn.linear_model import LogisticRegression

clf_lr=LogisticRegression()
clf_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [27]:
y_pred_lr=clf_lr.predict(X_test)

In [28]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test, y_pred_lr)
cm

pd.crosstab(y_test, y_pred_lr)

col_0,0,1
y,Unnamed: 1_level_1,Unnamed: 2_level_1
0,23340,7
1,5168,3


In [29]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.82      1.00      0.90     23347
           1       0.30      0.00      0.00      5171

    accuracy                           0.82     28518
   macro avg       0.56      0.50      0.45     28518
weighted avg       0.72      0.82      0.74     28518



In [30]:
# DECISION TREE

from sklearn.tree import DecisionTreeClassifier
clf_dt=DecisionTreeClassifier()


In [31]:

clf_dt.fit(X_train, y_train)
y_pred_dt=clf_dt.predict(X_test)

In [32]:
cm=pd.crosstab(y_test, y_pred_dt, rownames=["Classe réelle"], colnames=["Classe prédite"])
cm

Classe prédite,0,1
Classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,19171,4176
1,3182,1989


In [33]:
# Affichage des meilleurs paramètres
feats = {}
for feature, importance in zip(X.columns, clf_dt.feature_importances_):
    feats[feature] = importance 
    
importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Importance'})
importances.sort_values(by='Importance', ascending=False).head(8)

Unnamed: 0,Importance
sapt2pp,0.144732
libnatmut,0.128238
adedpe202006_logtype_traversant,0.071812
adedpe202006_logtype_enr,0.060295
anarnc202012_nb_lot_garpark,0.055961
adedpe202006_logtype_classe_estim_ges,0.049665
sapt3pp,0.044225
nbapt2pp,0.039423


In [34]:
clf_dt.score(X_test,y_test)

0.741987516656147

In [35]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

           0       0.86      0.82      0.84     23347
           1       0.32      0.38      0.35      5171

    accuracy                           0.74     28518
   macro avg       0.59      0.60      0.59     28518
weighted avg       0.76      0.74      0.75     28518



In [36]:
# Utilisation d'un RandomClassifier = 

from sklearn.ensemble import RandomForestClassifier
clf_rf=RandomForestClassifier()
clf_rf.fit(X_train, y_train)
y_pred_rf=clf_rf.predict(X_test)
pd.crosstab(y_test, y_pred_rf, rownames=["Classe réelle"], colnames=["Classe prédite"])


Classe prédite,0,1
Classe réelle,Unnamed: 1_level_1,Unnamed: 2_level_1
0,22594,753
1,3517,1654


In [37]:
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.87      0.97      0.91     23347
           1       0.69      0.32      0.44      5171

    accuracy                           0.85     28518
   macro avg       0.78      0.64      0.68     28518
weighted avg       0.83      0.85      0.83     28518



In [38]:
feats = {}
for feature, importance in zip(X.columns, clf_rf.feature_importances_):
    feats[feature] = importance 
    
importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Importance'})
importances.sort_values(by='Importance', ascending=False).head(8)

Unnamed: 0,Importance
sapt2pp,0.101234
libnatmut,0.095545
nbapt3pp,0.057122
nbapt2pp,0.05646
adedpe202006_logtype_traversant,0.054829
anarnc202012_nb_lot_garpark,0.051083
adedpe202006_logtype_enr,0.049816
adedpe202006_logtype_classe_estim_ges,0.048651
