In [53]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [2]:
# Load the bank dataset from the local CSV file into a DataFrame,
# then display its first rows as a quick sanity check.
bank = pd.read_csv('bank.csv')
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [3]:
# Dimensions of the dataset as a (rows, columns) tuple.
bank.shape

(11162, 17)

In [4]:
# Print the per-column dtype and non-null count summary of the dataset.
bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  object
 5   balance    11162 non-null  int64 
 6   housing    11162 non-null  object
 7   loan       11162 non-null  object
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  duration   11162 non-null  int64 
 12  campaign   11162 non-null  int64 
 13  pdays      11162 non-null  int64 
 14  previous   11162 non-null  int64 
 15  poutcome   11162 non-null  object
 16  deposit    11162 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.4+ MB


In [54]:
# Summary statistics (count / unique / top / freq) of the categorical
# (object-typed) columns only.
bank.describe(include='object')

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,deposit
count,11162,11162,11162,11162,11162,11162,11162,11162,11162,11162
unique,12,3,4,2,2,2,3,12,4,2
top,management,married,secondary,no,no,no,cellular,may,unknown,no
freq,2566,6351,5476,10994,5881,9702,8042,2824,8326,5873


In [6]:
# Summary statistics (count / mean / std / quartiles) of every column that is
# not object-typed, i.e. the numeric columns.
bank.describe(exclude='object')

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0,11162.0
mean,41.231948,1528.538524,15.658036,371.993818,2.508421,51.330407,0.832557
std,11.913369,3225.413326,8.42074,347.128386,2.722077,108.758282,2.292007
min,18.0,-6847.0,1.0,2.0,1.0,-1.0,0.0
25%,32.0,122.0,8.0,138.0,1.0,-1.0,0.0
50%,39.0,550.0,15.0,255.0,2.0,-1.0,0.0
75%,49.0,1708.0,22.0,496.0,3.0,20.75,1.0
max,95.0,81204.0,31.0,3881.0,63.0,854.0,58.0


In [7]:
# Number of missing values in each column.
bank.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

In [10]:
# Separate the target column ('deposit') from the explanatory variables.
y = bank.loc[:, 'deposit']
X = bank.drop(columns='deposit')

In [18]:
# Encode the categorical variables as integer codes.
#
# Work on an explicit copy: `cat_df` is derived from `X`, and assigning into
# it column by column without `.copy()` can raise SettingWithCopyWarning.
cat_df = X.select_dtypes(include='object').copy()

# Keep one fitted encoder per column. Re-fitting a single shared LabelEncoder
# inside the loop leaves it holding only the classes of the last column, so
# the mappings of all other columns could not be reused (e.g. on new data) or
# inverted afterwards.
encoders = {}
encoder = LabelEncoder()
for col in cat_df.columns:
  encoder = LabelEncoder()
  cat_df[col] = encoder.fit_transform(cat_df[col])
  encoders[col] = encoder
# As before, `encoder` ends up being the encoder fitted on the last
# categorical column; `encoders[col]` gives access to each column's encoder.

cat_df.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome
0,0,1,1,0,1,0,2,8,3
1,0,1,1,0,0,0,2,8,3
2,9,1,1,0,1,0,2,8,3
3,7,1,1,0,1,0,2,8,3
4,0,1,2,0,0,0,2,8,3


In [24]:
# Build the feature matrix: the label-encoded categorical columns followed by
# the numeric columns of X, joined column-wise on their shared index, then
# standardised in a single step.
#
# NOTE(review): the scaler is fitted on the whole dataset here, before the
# train/test split performed later, so the test rows contribute to the
# scaling statistics.
scaler = StandardScaler()
features = scaler.fit_transform(
    pd.concat(
        [cat_df, X.select_dtypes(exclude='object')],
        axis='columns',
        join='inner',
    )
)
features

array([[-1.39160402, -0.31867191, -0.38050056, ..., -0.55416834,
        -0.48118405, -0.36325984],
       [-1.39160402, -0.31867191, -0.38050056, ..., -0.55416834,
        -0.48118405, -0.36325984],
       [ 1.39910458, -0.31867191, -0.38050056, ..., -0.55416834,
        -0.48118405, -0.36325984],
       ...,
       [ 1.39910458,  1.27998692, -0.38050056, ..., -0.18678537,
        -0.48118405, -0.36325984],
       [ 1.39910458, -0.31867191, -0.38050056, ..., -0.18678537,
         1.10957066,  1.81833188],
       [ 1.39910458, -0.31867191, -0.38050056, ..., -0.55416834,
        -0.48118405, -0.36325984]])

In [30]:
# One-hot encode the target variable.
# The encoder expects a 2-D input, so the target values are reshaped into a
# single column first; the sparse result is converted to a dense array.
y_values = np.reshape(y.values, (-1, 1))
targets = OneHotEncoder().fit(y_values).transform(y_values).toarray()
targets

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [31]:
# Split the data into training and test sets (80% / 20%), with a fixed
# random seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=41,
)

### Mise en place du perceptron multicouche

In [48]:
# Model definition: a multilayer perceptron with three hidden ReLU layers
# (dropout after the first two) and a 2-unit softmax output, one unit per
# one-hot target column.
model = Sequential([
    Dense(124, activation='relu', input_shape=(X_train.shape[1], )),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(10, activation='relu'),
    Dense(2, activation='softmax'),
])
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 124)               2108      
                                                                 
 dropout_8 (Dropout)         (None, 124)               0         
                                                                 
 dense_16 (Dense)            (None, 64)                8000      
                                                                 
 dropout_9 (Dropout)         (None, 64)                0         
                                                                 
 dense_17 (Dense)            (None, 10)                650       
                                                                 
 dense_18 (Dense)            (None, 2)                 22        
                                                                 
Total params: 10,780
Trainable params: 10,780
Non-trainable params: 0

In [49]:
# Model compilation.
# The output layer is a 2-unit softmax and the targets are one-hot encoded,
# so the matching loss is categorical cross-entropy (binary cross-entropy is
# meant for independent sigmoid outputs). Using the matching loss also keeps
# the 'accuracy' metric resolving to categorical accuracy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [50]:
# Model training: 40 passes over the training set in mini-batches of 32,
# with per-epoch progress output.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=40,
    verbose=1,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7fe7d7493010>

In [51]:
# Model evaluation on the held-out test set; `evaluate` returns the loss
# followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Perte du modèle : {loss:.2f}")
print(f"Précision du modèle : {accuracy * 100:.2f}%")

Perte du modèle : 0.40
Précision du modèle : 82.31%


In [None]:
# Prediction on new data.
new_data = pd.read_csv('nomfichier.csv')

# Rebuild the training-time feature layout: label-encoded categorical columns
# first, then the numeric columns, in the same order used to fit `scaler`.
cat_cols = X.select_dtypes(include='object').columns
num_cols = X.select_dtypes(exclude='object').columns

new_cat = pd.DataFrame(index=new_data.index)
for col in cat_cols:
  # Fitting on the training column reproduces the integer codes used during
  # training. `transform` raises ValueError if the new data contains a
  # category that was never seen in training.
  new_cat[col] = LabelEncoder().fit(X[col]).transform(new_data[col])

new_features = pd.concat([new_cat, new_data[num_cols]], axis=1)

# Reuse the statistics learned on the training data: `transform`, not
# `fit_transform`, which would re-fit the scaler on the new data and put the
# inputs on a different scale than the one the model was trained on.
X_ = scaler.transform(new_features)
predictions = model.predict(X_)
print(predictions)