# TME 2 - 2 : Inférence collective

## Partie 1 - Classifieur local

In [95]:
import pandas as pd
import numpy as np

# Constantes : 
path = 'WebKB/'
univ = 'wisconsin'
percentage_unlabeled = 0.1

# On parse les données
#path = '/Users/ACHANGER/Desktop/UPMC/FDMS-2/WebKB/'
# Fichier 'content' contient des lignes « [URL]\t[Attributs:suite de nombres]\t[Étiquette] »
content = pd.read_csv(path + 'content/'+univ+'.content', sep ='\t', header=None)
# Fichier 'cites' contient des liens « [destination] [source] »
cites   = pd.read_csv(path + 'cites/'+univ+'.cites',     sep =' ', header=None)

# Les identifiants des sites web sont leur URL :
M = len(content)
N = content.shape[1] - 2
url = content.iloc[:,0] 
labels = content.iloc[:,-1] 
attributes = content.drop([0, N+1],axis=1)

print("\n=== url dataframe ===")
print(url.head())
print("\n=== labels dataframe ===")
print(labels.head())
print("\n=== attributes dataframe ===")
print(attributes.head())


unlabeled = np.random.choice(url, size=int(M*percentage_unlabeled))
print("\nUnlabeled items:")
print(unlabeled)
example_page = np.random.choice(url)
print("Page d'exemple:", example_page, "dans unlabeled :", example_page in unlabeled)



=== url dataframe ===
0                           http://robios8.me.wisc.edu
1    http://robios8.me.wisc.edu/~lumelsky/lumelsky....
2                      http://www.cae.wisc.edu/~ece552
3                               http://www.cs.wisc.edu
4                        http://www.cs.wisc.edu/condor
Name: 0, dtype: object

=== labels dataframe ===
0    project
1    faculty
2     course
3     course
4    project
Name: 1704, dtype: object

=== attributes dataframe ===
   1     2     3     4     5     6     7     8     9     10    ...   1694  \
0     0     0     0     0     0     0     0     0     1     0  ...      0   
1     0     0     0     0     0     0     0     0     1     0  ...      0   
2     0     0     0     0     0     0     0     0     0     0  ...      0   
3     0     0     0     0     0     0     0     0     0     0  ...      0   
4     0     0     0     0     0     0     0     0     1     0  ...      0   

   1695  1696  1697  1698  1699  1700  1701  1702  1703  
0     0    

In [96]:
# Conversion des labels depuis string vers int
url_to_labels = dict(zip(url, labels))
unique_labels = sorted(np.unique(labels))
labels_to_id = dict(zip(unique_labels, range(len(unique_labels))))
url_pos = dict(zip(url, range(M)))

# Parsing du réseau depuis fichier 'cites'
network = {}
for line in cites.iterrows():
    src = line[1][1]
    dest = line[1][0]
    if src in network:
        network[src].add(dest)
    else:
        network[src] = set([dest])

print("Voisins de "+example_page+" :")
if example_page in network:
    print("(url, label, unlabeled)")
    print([(v, labels_to_id[url_to_labels[v]], v in unlabeled) for v in network[example_page]])
else:
    print("Pas de citations depuis cette page")


Voisins de http://www.cs.wisc.edu/~cs737-1/cs737.html :
(url, label, unlabeled)
[('http://www.cs.wisc.edu/~devise', 2, False)]


In [97]:
# Comptage des classes des voisins
# Dans 'count' : 1 ligne = 1 url, 1 colonne = 1 classe (classes triées par ordre alphabétique) 
count = np.zeros((M, len(unique_labels)))

# On itère sur tous les noeuds qui ont des liens sortants
for noeud, voisins in network.items():
    position = url_pos[noeud]
    for voisin in network[noeud]:
        voisin_label = labels_to_id[url_to_labels[voisin]]
        if voisin not in unlabeled:
            count[position, voisin_label] += 1
            
print("Voisinage de la page d'exemple :", count[url_pos[example_page]])

Voisinage de la page d'exemple : [ 0.  0.  1.  0.  0.]


In [117]:
# Classification

from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier 
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

X = np.concatenate((np.array(attributes),count),axis=1)
X_scale = preprocessing.scale(X)
Y = np.array([labels_to_id[lab] for lab in labels])

idx = set(range(M))
test_idx = set([url_pos[u] for u in unlabeled])
train_idx = list(idx - test_idx)
test_idx = list(test_idx)

X_train, X_test = X_scale[train_idx], X_scale[test_idx]
Y_train, Y_test = Y[train_idx], Y[test_idx]
true_labels = np.array([labels_to_id[lab] for lab in labels])[test_idx]


estimators = {"rf":{"clf":RandomForestClassifier(),
                    "params":{"n_estimators":np.arange(2, 100, 5),
                              "max_features":("auto", "sqrt", "log2", None),
                              "max_depth":np.arange(2, 20, 5)}},
              "svc":{"clf":SVC(),
                     "params":{"C":np.logspace(-2, 2, 5),
                              "kernel":("linear", "poly", "rbf"),
                              "class_weight":(None, "balanced")}},
              "knn":{"clf":KNeighborsClassifier(),
                     "params":{"n_neighbors":np.arange(2, 100, 5), 
                               "weights":("uniform", "distance"), 
                               "algorithm":("auto", "ball_tree", "kd_tree", "brute")}},
              "adaboost":{"clf":AdaBoostClassifier(),
                          "params":{"base_estimator":(None, RandomForestClassifier()), 
                                    "n_estimators":np.linspace(5, 50, 5, dtype=int), 
                                    "learning_rate":np.logspace(-1,1,3)}}
             }

choice = "adaboost"

best_clf = None
best_score = 0
estimators_aggreg = []
for choice in estimators.keys():
    print("Searching best "+choice+" estimator")
    gridsearch = GridSearchCV(estimators[choice]["clf"], estimators[choice]["params"], n_jobs=-1, verbose=1)
    gridsearch.fit(X_train, Y_train)
    estimators_aggreg.append((choice, gridsearch.best_estimator_))
    print("Score :", gridsearch.best_score_)
    if gridsearch.best_score_ > best_score:
        best_clf = gridsearch.best_estimator_
        best_score = gridsearch.best_score_

print("Aggrégation avec VotingClassifier")
vote_clf = VotingClassifier(estimators_aggreg)
vote_clf.fit(X_train, Y_train)
#print(clf.get_params())

if vote_clf.score(X_test, true_labels) > best_score:
    best_clf = vote_clf

print("Score sur le test set :", best_clf.score(X_test, true_labels))
print("Proportion des labels :", np.unique(Y, return_counts=True))
print("Prédiction sur le test set :", best_clf.predict(X_test))
print("Vrais labels :", true_labels)



Searching best adaboost estimator
Fitting 3 folds for each of 30 candidates, totalling 90 fits
Score : 0.8091286307053942
Searching best rf estimator
Fitting 3 folds for each of 320 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    4.8s finished
[Parallel(n_jobs=-1)]: Done 293 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 569 tasks      | elapsed:   26.3s
[Parallel(n_jobs=-1)]: Done 953 out of 960 | elapsed:   51.4s remaining:    0.4s


Score : 0.8464730290456431
Searching best knn estimator
Fitting 3 folds for each of 160 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:   52.9s finished
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    2.6s


Score : 0.5684647302904564
Searching best svc estimator
Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:    9.0s finished
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    4.4s finished


Score : 0.7717842323651453
Aggrégation avec VotingClassifier
Score sur le test set : 0.708333333333
Proportion des labels : (array([0, 1, 2, 3, 4]), array([ 76,  35,  22,  10, 122]))
Prédiction sur le test set : [4 1 0 0 0 4 4 4 4 4 4 0 4 4 4 2 0 1 2 0 4 4 4 4]
Vrais labels : [1 1 0 0 4 4 4 4 4 0 0 0 4 1 2 2 0 1 2 0 4 4 4 3]


## Partie 2 : ICA

On va réutiliser les données de la partie précédente pour la 1ère phase, _bootstraping_

In [118]:
from sklearn.metrics import accuracy_score

print("Estimateur choisi :", best_clf.get_params())

ordering = np.copy(test_idx)

Y[test_idx] = best_clf.predict(X_test)
old_labels = np.zeros_like(Y[test_idx]) - 1
print(Y[test_idx], true_labels)

max_iter = 5
i = 0
print(X.shape)
while not np.allclose(old_labels, Y[test_idx]) and i<max_iter:
    print("======= Itération %d =======" % i)
    # On itère sur tous les noeuds qui ont des liens sortants
    np.random.shuffle(ordering)
    for position in ordering:
        noeud = url[position]
        X[position, M:] = 0
        #print("Noeud "+noeud+":",X[position,M:])
        if noeud in network:
            for voisin in network[noeud]:
                vlabel = Y[url_pos[voisin]]
                #print("--- Nouveau voisin de label ",vlabel)
                X[position, M+vlabel] += 1
                #print("---", X[position, M:])
    old_labels = Y[test_idx]
    Y[test_idx] = best_clf.predict(X[test_idx])
    print(Y[test_idx], true_labels)
    
    i += 1
    
print("Fini.")
print("Score : ", accuracy_score(Y[test_idx], true_labels))

Estimateur choisi : {'max_leaf_nodes': None, 'warm_start': False, 'min_samples_leaf': 1, 'random_state': None, 'n_estimators': 12, 'oob_score': False, 'verbose': 0, 'max_features': None, 'class_weight': None, 'bootstrap': True, 'min_weight_fraction_leaf': 0.0, 'n_jobs': 1, 'criterion': 'gini', 'min_samples_split': 2, 'max_depth': 7, 'min_impurity_split': 1e-07}
[4 1 0 0 0 4 4 4 4 4 4 0 4 4 4 2 0 1 2 0 4 4 4 4] [1 1 0 0 4 4 4 4 4 0 0 0 4 1 2 2 0 1 2 0 4 4 4 3]
(265, 1708)
[4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4] [1 1 0 0 4 4 4 4 4 0 0 0 4 1 2 2 0 1 2 0 4 4 4 3]
[4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4] [1 1 0 0 4 4 4 4 4 0 0 0 4 1 2 2 0 1 2 0 4 4 4 3]
Fini.
Score :  0.375
