In [188]:
import os
import random

import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, roc_auc_score
from ast import literal_eval

# Move into the project directory so the relative file names used below
# ('data.csv', 'dim_red.csv', ...) resolve. The PROJET5_DIR environment
# variable overrides the original author's local path, which stays the default.
os.chdir(os.environ.get('PROJET5_DIR', 'C:\\Users\\manon\\OneDrive\\Documents\\OpenClassrooms\\Projet5'))

In [2]:
data = pd.read_csv('data.csv')
dim_red = pd.read_csv('dim_red.csv')
%store -r common_tags

In [10]:
# Keep the 20 most frequent tags as target labels
# (common_tags is presumably a collections.Counter - TODO confirm).
top_tags = common_tags.most_common(20)
classes = [entry[0] for entry in top_tags]

In [13]:
# Binarizer restricted to the selected tags; output columns follow the order of `classes`.
# NOTE(review): tags outside `classes` are presumably ignored by the binarizer - confirm.
mlb = MultiLabelBinarizer(classes=classes)

In [25]:
# The CSV stores each tag list as the text of a Python list; parse it back
# into a real list. Values that are already parsed are passed through, so the
# cell can be re-run safely (literal_eval raises on non-string input).
data.Tags = data.Tags.apply(
    lambda tags: literal_eval(tags) if isinstance(tags, str) else tags
)

In [26]:
# 0/1 indicator matrix: one row per question, one column per tag in `classes`.
targets = mlb.fit_transform(data.Tags)



In [28]:
# Inspect the parsed tag lists.
data.Tags

0        [c#, floating-point, type-conversion, double, ...
1                         [html, css, internet-explorer-7]
2                                     [c#, .net, datetime]
3        [c#, datetime, time, datediff, relative-time-s...
4        [html, browser, timezone, user-agent, timezone...
                               ...                        
99995                      [user-interface, auto-generate]
99996                         [windows-xp, path, registry]
99997       [visual-studio-2008, msbuild, aspnet-compiler]
99998        [c#, .net, com, interop, windows-server-2008]
99999                   [windows, file-descriptor, ulimit]
Name: Tags, Length: 100000, dtype: object

In [27]:
# Inspect the binarized target matrix.
targets

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [24]:
# Inspect the target tags; their order defines the column order of `targets`.
classes

['c#',
 '.net',
 'java',
 'asp.net',
 'c++',
 'javascript',
 'php',
 'python',
 'sql',
 'sql-server',
 'html',
 'windows',
 'c',
 'jquery',
 'mysql',
 'database',
 'visual-studio',
 'css',
 'asp.net-mvc',
 'ruby']

In [137]:
# Persist the ordered list of target tags. The context manager guarantees
# the file is flushed and closed even if pickling fails.
filename = 'classes.sav'
with open(filename, 'wb') as fh:
    pickle.dump(classes, fh)

In [37]:
# Seed Python's `random` module, which drives the train/test sampling below.
# This does not seed NumPy or the scikit-learn estimators.
random.seed(18)

In [42]:
# Draw 30% of the row positions, without replacement, as the test set.
n_rows = len(data)
test_size = int(n_rows / 10 * 3)
test_id = random.sample(range(n_rows), test_size)

In [64]:
# Convert the feature table to a plain ndarray; rows are selected by position below.
dim_red = np.array(dim_red)

In [78]:
# Training set: every row whose position was not drawn into `test_id`.
x_train, y_train = (
    np.delete(arr, test_id, axis=0) for arr in (dim_red, targets)
)

In [68]:
# Test set: the rows at the sampled positions, in sampling order.
x_test, y_test = (arr[test_id] for arr in (dim_red, targets))

### SVC

In [81]:
# Support-vector classifier with scikit-learn's default hyperparameters.
svc = svm.SVC()

In [82]:
%%time
# Fit on the first target column only (first entry of `classes`) to gauge training cost.
svc.fit(x_train, y_train[:,0])

Wall time: 20min 47s


SVC()

In [84]:
# SVC predictions for the first target column on the test set.
tmp = svc.predict(x_test)

In [109]:
# Accuracy of the SVC on the first target column.
accuracy_score(y_test[:,0],tmp)

0.8599333333333333

In [89]:
# F1 score of the SVC on the first target column.
f1_score(y_test[:,0], tmp)

0.05189530685920578

### Vu le temps d'exécution pour 1 des 20 tags à prédire, il serait préférable de ne pas utiliser la SVC

### Random Forest

In [90]:
# Random forest with default hyperparameters, used as a single-tag baseline.
rf0 = RandomForestClassifier()

In [91]:
%%time
# Fit on the first target column only, mirroring the SVC timing experiment.
rf0.fit(x_train, y_train[:,0])

Wall time: 2min 31s


RandomForestClassifier()

In [92]:
%%time
# Predict the first target column for the test set and time the inference.
rf0_pred = rf0.predict(x_test)

Wall time: 1.71 s


In [119]:
# Accuracy of the single-tag random forest on the first target column.
accuracy_score(y_test[:,0], rf0_pred)

0.8615333333333334

In [93]:
# F1 score of the single-tag random forest on the first target column.
f1_score(y_test[:,0], rf0_pred)

0.10589754627636677

In [100]:
# Empty containers for the per-tag forests and their test-set predictions.
rf, rf_pred = [], []

In [101]:
# One independent default RandomForestClassifier per target column (20 tags).
# The list is built directly instead of through exec(): no rf0..rf19 globals
# are created or rebound, and re-running the cell replaces the list rather
# than appending another 20 models to it.
rf = [RandomForestClassifier() for _ in range(20)]

In [120]:
# Fit forest i on target column i, then store its test-set predictions.
# rf_pred is rebuilt from scratch so that re-running the cell cannot leave
# predictions from a previous fit in positions 0-19.
rf_pred = []
for i in range(20):
    rf[i].fit(x_train, y_train[:, i])
    rf_pred.append(rf[i].predict(x_test))

In [113]:
# Per-tag accuracy and F1 of the 20 independent forests on the test set.
accuracy, f1 = [], []
for i in range(20):
    truth, guess = y_test[:, i], rf_pred[i]
    accuracy.append(accuracy_score(truth, guess))
    f1.append(f1_score(truth, guess))
    print("Groupe ", i)
    print("Accuracy : ", accuracy[-1])
    print("Score f1 : ", f1[-1])

Groupe  0
Accuracy :  0.8617333333333334
Score f1 :  0.10834049871023216
Groupe  1
Accuracy :  0.9040666666666667
Score f1 :  0.0027720027720027724
Groupe  2
Accuracy :  0.9504666666666667
Score f1 :  0.5917582417582418
Groupe  3
Accuracy :  0.9462333333333334
Score f1 :  0.3127396676608436
Groupe  4
Accuracy :  0.9484
Score f1 :  0.2085889570552147
Groupe  5
Accuracy :  0.9524666666666667
Score f1 :  0.20865704772475027
Groupe  6
Accuracy :  0.9588333333333333
Score f1 :  0.13937282229965156
Groupe  7
Accuracy :  0.9621
Score f1 :  0.0657354149548069
Groupe  8
Accuracy :  0.9638
Score f1 :  0.1822289156626506
Groupe  9
Accuracy :  0.9655333333333334
Score f1 :  0.10398613518197573
Groupe  10
Accuracy :  0.9738333333333333
Score f1 :  0.12874583795782463
Groupe  11
Accuracy :  0.9752666666666666
Score f1 :  0.018518518518518517
Groupe  12
Accuracy :  0.9781333333333333
Score f1 :  0.0030395136778115506
Groupe  13
Accuracy :  0.9785333333333334
Score f1 :  0.1320754716981132
Groupe  14


In [117]:
# Averages of the per-tag scores held in `accuracy` and `f1`.
for label, scores in (("Accuracy moyenne : ", accuracy), ("Score f1 moyen : ", f1)):
    print(label, np.mean(scores))

Accuracy moyenne :  0.9609216666666667
Score f1 moyen :  0.15816928112901912


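### Hyperparamètres : peu d'arbres au début (valeur par défaut de n_estimators) ; les 2 critères de split (gini et entropy) ; max_features « auto » et « sqrt ». Attention : pour un RandomForestClassifier, « auto » est équivalent à « sqrt » ; pour utiliser toutes les variables à chaque split, il faudrait max_features=None.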

In [187]:
%%time
criterion = ["gini", "entropy"]
n_features = ["auto", "sqrt"]
clf = []
i = 0

for crit in criterion:
    for nf in n_features:
        clf.append(MultiOutputClassifier(RandomForestClassifier(criterion=crit, max_features=nf), n_jobs=2))
        clf[i].fit(x_train, y_train)
        i+=1

Wall time: 10h 51min 1s


In [190]:
# Class probabilities and hard 0/1 predictions on the test set for every
# fitted grid model, in the same order as `clf` (no hard-coded model count).
probas = [model.predict_proba(x_test) for model in clf]
preds = [model.predict(x_test) for model in clf]

In [216]:
# Mean per-tag accuracy and F1 for each grid model. The printed
# hyperparameters are read from the fitted model itself, so the labels cannot
# drift from what was actually trained (no duplicated lists, no manual index).
for model, pred in zip(clf, preds):
    crit = model.estimator.criterion
    nf = model.estimator.max_features
    accuracy = 0
    f1 = 0
    for j in range(20):
        accuracy += accuracy_score(y_test[:, j], pred[:, j])
        f1 += f1_score(y_test[:, j], pred[:, j])
    print("Criterion ", crit, ", max_features ", nf)
    print("Accuracy moyenne : ", accuracy/20)
    print("Score f1 moyen : ", f1/20)
    print("\n")

Criterion  gini , max_features  auto
Accuracy moyenne :  0.9608700000000001
Score f1 moyen :  0.1574339378535558


Criterion  gini , max_features  sqrt
Accuracy moyenne :  0.9608533333333333
Score f1 moyen :  0.15639855558992172


Criterion  entropy , max_features  auto
Accuracy moyenne :  0.9610416666666666
Score f1 moyen :  0.16506903210800022


Criterion  entropy , max_features  sqrt
Accuracy moyenne :  0.9609916666666667
Score f1 moyen :  0.16395129824279311




### Le critère entropy donne de meilleurs résultats que gini. 
### Avec le critère entropy, l'accuracy et le score f1 sont tous deux très légèrement meilleurs avec max_features auto (f1 de 0.1651 contre 0.1640), mais l'écart est négligeable ; nous retenons donc max_features sqrt.
### Cherchons maintenant à augmenter le nombre d'arbres de la forêt.

In [220]:
%%time
rf = MultiOutputClassifier(RandomForestClassifier(n_estimators=200, criterion="entropy", max_features="sqrt"), n_jobs=6)
rf.fit(x_train, y_train)

Wall time: 24min 5s


MultiOutputClassifier(estimator=RandomForestClassifier(criterion='entropy',
                                                       max_features='sqrt',
                                                       n_estimators=200),
                      n_jobs=6)

In [221]:
# Hard 0/1 predictions and per-tag class probabilities of the final model on the test set.
predi, prob = rf.predict(x_test), rf.predict_proba(x_test)

In [241]:
# Number of test rows that received a prediction. Uses the array length
# directly instead of a column picked by `j`, a loop variable leaked from another cell.
len(predi)

30000

In [242]:
# Mean per-tag accuracy and F1 of the final model on the test set.
accuracy, f1 = 0, 0

for j in range(20):
    truth, guess = y_test[:, j], predi[:, j]
    accuracy += accuracy_score(truth, guess)
    f1 += f1_score(truth, guess)

print("Accuracy moyenne : ", accuracy/20)
print("Score f1 moyen : ", f1/20)

Accuracy moyenne :  0.9610399999999999
Score f1 moyen :  0.16225284359211709


### On voit qu'augmenter le nombre d'arbres change peu les résultats. Nous pouvons donc en rester à ce modèle.

### Il nous faut cependant diminuer le nombre de faux négatifs (le score f1 reste faible malgré une accuracy élevée), ce que nous ferons en abaissant le seuil de décision.

In [237]:
# Sweep decision thresholds from 0.05 to 0.45 and, for each one, print the
# confusion matrix (normalised over the true labels) averaged across the 20 tags.
seuils = np.arange(0.05, 0.5, 0.05)

for seuil in seuils:
    conf = np.zeros((2, 2))
    for i in range(20):
        # Second probability column of tag i (presumably class 1 - confirm via classes_).
        positive_proba = prob[i][:, 1]
        predit = (positive_proba >= seuil).astype('int')
        conf += confusion_matrix(y_test[:, i], predit, normalize="true")
    print(seuil)
    print(conf / 20)

0.05
[[0.75892955 0.24107045]
 [0.17898305 0.82101695]]
0.1
[[0.88390513 0.11609487]
 [0.33136143 0.66863857]]
0.15000000000000002
[[0.94313879 0.05686121]
 [0.47670975 0.52329025]]
0.2
[[0.96685452 0.03314548]
 [0.57197736 0.42802264]]
0.25
[[0.98111129 0.01888871]
 [0.65820401 0.34179599]]
0.3
[[0.98873965 0.01126035]
 [0.72772285 0.27227715]]
0.35000000000000003
[[0.9934158  0.0065842 ]
 [0.78567781 0.21432219]]
0.4
[[0.99556753 0.00443247]
 [0.82645979 0.17354021]]
0.45
[[0.9971851  0.0028149 ]
 [0.86328271 0.13671729]]


### D'après les matrices de confusion, un seuil de 0.1 est indiqué.

In [243]:
# Persist the fitted multi-output model. The context manager guarantees the
# file is flushed and closed even if pickling fails.
filename = 'modeles.sav'
with open(filename, 'wb') as fh:
    pickle.dump(rf, fh)