# Charger et mettre en forme les données à partir d'un fichier csv

In [None]:
import pandas 

# Column names supplied explicitly to read_csv: 18 attribute columns followed
# by the 'class' label column.
# NOTE(review): assumes the CSV has no header row (with `names=` given, the
# first line of the file is read as data) -- confirm against the file.
noms = [
    'seismic', 'seismoacoustic', 'shift',
    'genergy', 'gpuls', 'gdenergy', 'gdpuls',
    'ghazard',
    'nbumps', 'nbumps2', 'nbumps3', 'nbumps4',
    'nbumps5', 'nbumps6', 'nbumps7', 'nbumps89',
    'energy', 'maxenergy',
    'class',
]
dataframe = pandas.read_csv('seismic-bumps.csv', names=noms)
dataframe

In [None]:
# Raw table as an array, split into the first 18 columns (attributes, `d`)
# and column 18 (the label, `t`).
donnees = dataframe.values
d, t = donnees[:, :18], donnees[:, 18]

# Transformer les attributs catégoriels en un ensemble de variables binaires

In [None]:
import numpy as np

def binarisation(d, indice, valeurs):
    """Encode one categorical column of ``d`` as 0/1 indicator columns.

    Args:
        d: 2-D array indexed as ``d[row, column]``.
        indice: Index of the column of ``d`` to encode.
        valeurs: Sequence of category values; output column ``v``
            corresponds to ``valeurs[v]``.

    Returns:
        A float array of shape ``(len(d), len(valeurs))`` whose entry
        ``[i, v]`` is 1 when ``d[i, indice] == valeurs[v]`` and 0 otherwise.
        A row whose value is not listed in ``valeurs`` is all zeros.
    """
    # Nothing to compare for an empty input; this also avoids indexing a
    # column of an array that has no rows.
    if len(d) == 0:
        return np.zeros((0, len(valeurs)))

    # Compare as Python objects so mixed-type columns (strings next to
    # numbers) are compared element by element with plain ``==``.
    colonne = np.asarray(d[:, indice], dtype=object)
    categories = np.array(valeurs, dtype=object)

    # Broadcasting (n, 1) against (1, k) yields the full (n, k) match table
    # in a single comparison instead of a nested Python loop.
    return (colonne[:, None] == categories[None, :]).astype(float)


# Columns 0, 1 and 7 share the categories 'a'..'d'; column 2 takes 'W' or 'N'.
A0, A1, A7 = (binarisation(d, colonne, ['a','b','c','d']) for colonne in (0, 1, 7))
A2 = binarisation(d, 2, ['W','N'])

# Feature matrix: each encoded block takes the place of its source column,
# with the untouched columns 3-6 and 8-17 kept in between / after.
blocs = [A0, A1, A2, d[:, 3:7], A7, d[:, 8:18]]
X = np.concatenate(blocs, axis=1)

# Labels converted to integers.
Y = t.astype(int)

In [None]:
# Show the encoded feature matrix as a table.
pandas.DataFrame(X)

# Données déséquilibrées

In [None]:
# Number of rows labelled 0 and labelled 1, as a (class 0, class 1) pair.
tuple(len(X[Y == classe]) for classe in (0, 1))

# Exactitude (accuracy) du modèle de référence (baseline) qui prédit toujours la classe 0

In [None]:
# Share of class-0 rows among the rows labelled 0 or 1, i.e. the accuracy
# reached by always answering class 0.
len(X[Y == 0]) / sum(len(X[Y == classe]) for classe in (1, 0))

# Apprentissage du modèle

In [None]:
import random 
from sklearn.model_selection import train_test_split 
# Hold out 30% of the rows for evaluation.
# random_state=None requests a different shuffle on every run. The previous
# `random_state=random.seed()` did exactly that by accident: random.seed()
# returns None (and reseeds Python's global RNG as a side effect), so it never
# fixed the split. Pass an integer here instead if a reproducible split is wanted.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=None
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a 10-tree random forest on the training split (fit() returns the
# estimator itself), then predict the labels of the held-out split.
modele = RandomForestClassifier(n_estimators=10).fit(X_train, Y_train)
Y_predit = modele.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label matches the true label.
accuracy_score(y_true=Y_test, y_pred=Y_predit)

In [None]:
# `accuracy` was referenced without ever being assigned (NameError); bind it
# to the accuracy of the held-out predictions, then display it.
accuracy = accuracy_score(Y_test, Y_predit)
accuracy

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the held-out predictions against the true labels.
confusion_matrix(y_true=Y_test, y_pred=Y_predit)

In [None]:
from sklearn.metrics import f1_score
# F1 score of the held-out predictions against the true labels.
f1_score(y_true=Y_test, y_pred=Y_predit)

# Sous-échantillonnage de la classe majoritaire et sur-pondération

In [None]:
from sklearn.utils import resample

# Size of the minority class (class 1), computed from the labels rather than
# hard-coded so that X1 and Y1 below always have matching lengths.
nbs = len(X[Y == 1])

# Draw, without replacement, as many majority-class (class 0) rows as there
# are minority-class rows.
# random_state=None: a fresh draw on every run. The previous
# `random_state=random.seed()` passed None as well, since random.seed()
# returns None.
echantillon_majorite = resample(
    X[Y == 0],
    replace=False,
    n_samples=nbs,
    random_state=None,
)

# Balanced data set: the majority sub-sample (labelled 0) stacked on top of
# the whole minority class (labelled 1).
X1 = np.concatenate((echantillon_majorite, X[Y == 1]), axis=0)
Y1 = np.concatenate((np.zeros(nbs), np.ones(nbs)), axis=0)

In [None]:
# Hold out 30% of the balanced sample for evaluation.
# random_state=None requests a different shuffle on every run; the previous
# `random_state=random.seed()` was equivalent because random.seed() returns
# None. Pass an integer instead if a reproducible split is wanted.
X1_train, X1_test, Y1_train, Y1_test = train_test_split(
    X1, Y1, test_size=0.3, random_state=None
)

# Refit the existing `modele` estimator on the balanced training split and
# predict the balanced held-out split.
modele.fit(X1_train, Y1_train)
Y1_predit = modele.predict(X1_test)

In [None]:
# F1 score of the predictions on the balanced held-out split.
f1_score(y_true=Y1_test, y_pred=Y1_predit)

In [None]:
# Confusion matrix of the predictions on the balanced held-out split.
confusion_matrix(y_true=Y1_test, y_pred=Y1_predit)

# Surpondération de la classe sous-échantillonnée

In [None]:
# Weight of one sub-sampled class-0 row: the number of class-0 rows in the
# full data set divided by the number kept in the sub-sample. The class-0
# count is read from Y instead of being hard-coded.
poids = len(X[Y == 0]) / nbs

# Per-sample weights for the balanced test split: `poids` where the true
# label is 0, and 1 where the true label is 1.
p = (1 - Y1_test) * poids + Y1_test

# F1 score on the balanced test split with class-0 rows up-weighted.
f1_score(Y1_test, Y1_predit, sample_weight=p)

In [None]:
# Confusion matrix on the balanced held-out split, each row counted with its
# weight from `p`.
confusion_matrix(y_true=Y1_test, y_pred=Y1_predit, sample_weight=p)