# TME 2 - 2 : Inférence collective

## Partie 1 - Classifieur local

In [40]:
import pandas as pd
import numpy as np

# Constantes : 
path = 'WebKB/'
univ = 'wisconsin'
percentage_unlabeled = 0.2

# On parse les données
#path = '/Users/ACHANGER/Desktop/UPMC/FDMS-2/WebKB/'
# Fichier 'content' contient des lignes « [URL]\t[Attributs:suite de nombres]\t[Étiquette] »
content = pd.read_csv(path + 'content/'+univ+'.content', sep ='\t', header=None)
# Fichier 'cites' contient des liens « [destination] [source] »
cites   = pd.read_csv(path + 'cites/'+univ+'.cites',     sep =' ', header=None)

# Les identifiants des sites web sont leur URL :
M = len(content)
N = content.shape[1] - 2
url = content.iloc[:,0] 
labels = content.iloc[:,-1] 
attributes = content.drop([0, N+1],axis=1)

def clean_url(u):
    return u.replace("http://", "").replace("www.", "")

url = url.apply(clean_url)
print("\n=== url dataframe ===")
print(url.head())
print("\n=== labels dataframe ===")
print(labels.head())
print("\n=== attributes dataframe ===")
print(attributes.head())


=== url dataframe ===
0                            robios8.me.wisc.edu
1    robios8.me.wisc.edu/~lumelsky/lumelsky.html
2                           cae.wisc.edu/~ece552
3                                    cs.wisc.edu
4                             cs.wisc.edu/condor
Name: 0, dtype: object

=== labels dataframe ===
0    project
1    faculty
2     course
3     course
4    project
Name: 1704, dtype: object

=== attributes dataframe ===
   1     2     3     4     5     6     7     8     9     10    ...   1694  \
0     0     0     0     0     0     0     0     0     1     0  ...      0   
1     0     0     0     0     0     0     0     0     1     0  ...      0   
2     0     0     0     0     0     0     0     0     0     0  ...      0   
3     0     0     0     0     0     0     0     0     0     0  ...      0   
4     0     0     0     0     0     0     0     0     1     0  ...      0   

   1695  1696  1697  1698  1699  1700  1701  1702  1703  
0     0     0     0     0     0     0    

In [42]:
from collections import deque
from pprint import pprint

# Conversion des labels depuis string vers int
url_to_labels = dict(zip(url, labels))
unique_labels = sorted(np.unique(labels))
labels_to_id = dict(zip(unique_labels, range(len(unique_labels))))
url_pos = dict(zip(url, range(M)))

# Parsing du réseau depuis fichier 'cites'
network = {}
for line in cites.iterrows():
    src = clean_url(line[1][1])
    dest = clean_url(line[1][0])
    if src in network:
        network[src].add(dest)
    else:
        network[src] = set([dest])
        
# Suppression de labels
remove_cluster = True
if remove_cluster:
    rand_noeud = np.random.choice(list(network.keys()))
    voisins = deque(network[rand_noeud]) # voisins est de type "queue" : FIFO
    unlabeled = set([rand_noeud])
    while len(unlabeled) < M*percentage_unlabeled and voisins: # tq 'voisin' non vide
        #print("voisins : ", voisins)
        voisin = voisins.popleft()
        unlabeled.add(voisin)
        if voisin in network:
             voisins.extend(set(network[voisin]) - unlabeled)
        if not voisins: # si 'voisin' vide
            voisins = deque([np.random.choice(list(network.keys()))])        
else:
    unlabeled = np.random.choice(url, size=int(M*percentage_unlabeled))
    print("\nUnlabeled items:")
    print(unlabeled)
    example_page = np.random.choice(url)
    print("Page d'exemple:", example_page, "dans unlabeled :", example_page in unlabeled)

print("=== unlabeled ===")
pprint(unlabeled)

example_page = np.random.choice(url)
print("Voisins de "+example_page+" :")
if example_page in network:
    print("(url, label, unlabeled)")
    print([(v, labels_to_id[url_to_labels[v]], v in unlabeled) for v in network[example_page]])
else:
    print("Pas de citations depuis cette page")


=== unlabeled ===
{'cs.wisc.edu',
 'cs.wisc.edu/condor',
 'cs.wisc.edu/condor/next.html',
 'cs.wisc.edu/coral',
 'cs.wisc.edu/exodus',
 'cs.wisc.edu/paradise',
 'cs.wisc.edu/shore',
 'cs.wisc.edu/~agupta/agupta.html',
 'cs.wisc.edu/~ashraf/ashraf.html',
 'cs.wisc.edu/~bart/bart.html',
 'cs.wisc.edu/~ben/ben.html',
 'cs.wisc.edu/~bockrath/bockrath.html',
 'cs.wisc.edu/~carey/carey.html',
 'cs.wisc.edu/~cchin/cchin.html',
 'cs.wisc.edu/~cs132-1/cs132.html',
 'cs.wisc.edu/~cs302/course.html',
 'cs.wisc.edu/~cs302/cs302.html',
 'cs.wisc.edu/~cs367-2/cs367.html',
 'cs.wisc.edu/~cs564-1/cs564.html',
 'cs.wisc.edu/~cs640-1/cs640.html',
 'cs.wisc.edu/~cs737-1/cs737.html',
 'cs.wisc.edu/~deboor/cs412.html',
 'cs.wisc.edu/~deboor/cs717.html',
 'cs.wisc.edu/~deboor/deboor.html',
 'cs.wisc.edu/~deboor/ma887.html',
 'cs.wisc.edu/~devise',
 'cs.wisc.edu/~geery/geery.html',
 'cs.wisc.edu/~guangshu/guangshu.html',
 'cs.wisc.edu/~jherro/jherro.html',
 'cs.wisc.edu/~jignesh/jignesh.html',
 'cs.wisc.edu/

In [43]:
from graphviz import Digraph

graph = Digraph()
labeled = set(url) - unlabeled
dot = Digraph(comment='The Round Table')

dot.node('A', 'King Arthur', {'color':'blue'})
dot.node('B', 'Sir Bedevere the Wise')
dot.node('L', 'Sir Lancelot the Brave')

dot.edges(['AB', 'AL'])
dot.edge('B', 'L', constraint='false')


for n in labeled:
    graph.node(n,n)
for n in unlabeled:
    graph.node(n, n, {'color':'blue'})
for noeud, voisins in network.items():
    print(noeud, voisins)
    graph.edges([(noeud, voisin) for voisin in voisins])

graph.render("img/graph")

cs.wisc.edu/~weiruc/weiruc.html {'cs.wisc.edu/~cs737-1/cs737.html', 'cs.wisc.edu'}
cs.wisc.edu/~mscalar {'cs.wisc.edu/~mscalar'}
cs.wisc.edu/~krung/krung.html {'cs.wisc.edu'}
cs.wisc.edu/~jignesh/jignesh.html {'cs.wisc.edu/paradise', 'cs.wisc.edu'}
cs.wisc.edu/~praveen/projects/seq.html {'cs.wisc.edu', 'cs.wisc.edu/~raghu/raghu.html', 'cs.wisc.edu/shore'}
cs.wisc.edu/~zhichen/zhichen.html {'cs.wisc.edu/~larus/larus.html', 'cs.wisc.edu/~bart/bart.html', 'cs.wisc.edu/~wwt', 'cs.wisc.edu/~paradyn'}
cs.wisc.edu/~hummert/hummert.html {'cs.wisc.edu/~hummert/cs110/cs110.html'}
cs.wisc.edu/~bockrath/bockrath.html {'cs.wisc.edu/~cs737-1/cs737.html', 'cs.wisc.edu', 'cs.wisc.edu/~cs132-1/cs132.html', 'cs.wisc.edu/condor'}
cs.wisc.edu/~jussi/jussi.html {'cs.wisc.edu/~pubs/faculty-info/miron.html', 'cs.wisc.edu/~devise'}
cs.wisc.edu/~ratliff/ratliff.html {'cs.wisc.edu/~ratliff/132.html'}
cs.wisc.edu/~ferris/cs525-all.html {'cs.wisc.edu/~ferris/cs733.html', 'cs.wisc.edu/~ferris/cs719.html'}
cs.wisc.

'img/graph.pdf'

In [44]:
# Comptage des classes des voisins
# Dans 'count' : 1 ligne = 1 url, 1 colonne = 1 classe (classes triées par ordre alphabétique) 
count = np.zeros((M, len(unique_labels)))

# On itère sur tous les noeuds qui ont des liens sortants
for noeud, voisins in network.items():
    position = url_pos[noeud]
    for voisin in network[noeud]:
        voisin_label = labels_to_id[url_to_labels[voisin]]
        if voisin not in unlabeled:
            count[position, voisin_label] += 1
            
print("Voisinage de la page d'exemple :", count[url_pos[example_page]])

Voisinage de la page d'exemple : [ 0.  0.  0.  0.  0.]


In [45]:
# Classification

from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier 
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

X = np.concatenate((np.array(attributes),count),axis=1)
X_scale = preprocessing.scale(X)
Y = np.array([labels_to_id[lab] for lab in labels])

idx = set(range(M))
test_idx = set([url_pos[u] for u in unlabeled])
train_idx = list(idx - test_idx)
test_idx = list(test_idx)

X_train, X_test = X_scale[train_idx], X_scale[test_idx]
Y_train, Y_test = Y[train_idx], Y[test_idx]
true_labels = np.array([labels_to_id[lab] for lab in labels])[test_idx]


estimators = {"rf":{"clf":RandomForestClassifier(),
                    "params":{"n_estimators":np.arange(2, 100, 10),
                              "max_features":("auto", "sqrt", "log2", None),
                              "max_depth":np.arange(2, 20, 5)}},
              "svc":{"clf":SVC(),
                     "params":{"C":np.logspace(-2, 2, 7),
                              "kernel":("linear", "poly", "rbf"),
                              "class_weight":(None, "balanced")}},
              "knn":{"clf":KNeighborsClassifier(),
                     "params":{"n_neighbors":np.arange(2, 100, 5), 
                               "weights":("uniform", "distance"), 
                               "algorithm":("auto", "ball_tree", "kd_tree", "brute")}},
              "adaboost":{"clf":AdaBoostClassifier(),
                          "params":{"base_estimator":(None, RandomForestClassifier()), 
                                    "n_estimators":np.linspace(5, 50, 5, dtype=int), 
                                    "learning_rate":np.logspace(-1,1,3)}}
             }

choice = "adaboost"

best_clf = None
best_score = 0
estimators_aggreg = []
for choice in estimators.keys():
    print("Searching best "+choice+" estimator")
    gridsearch = GridSearchCV(estimators[choice]["clf"], estimators[choice]["params"], n_jobs=-1, verbose=0)
    %time gridsearch.fit(X_train, Y_train)
    estimators_aggreg.append((choice, gridsearch.best_estimator_))
    print("Score :", gridsearch.best_score_)
    if gridsearch.best_score_ > best_score:
        best_clf = gridsearch.best_estimator_
        best_score = gridsearch.best_score_

print("Aggrégation avec VotingClassifier")
vote_clf = VotingClassifier(estimators_aggreg)
vote_clf.fit(X_train, Y_train)
#print(clf.get_params())

if vote_clf.score(X_test, true_labels) > best_score:
    best_clf = vote_clf

print("Score sur le test set :", best_clf.score(X_test, true_labels))
print("Proportion des labels :", np.unique(Y, return_counts=True))
print("Prédiction sur le test set :", best_clf.predict(X_test))
print("Vrais labels :", true_labels)



Searching best knn estimator
Fitting 3 folds for each of 160 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    2.1s


CPU times: user 1.1 s, sys: 40 ms, total: 1.14 s
Wall time: 6.11 s
Score : 0.5566037735849056
Searching best rf estimator
Fitting 3 folds for each of 160 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:    5.9s finished
[Parallel(n_jobs=-1)]: Done 247 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:   20.8s finished


CPU times: user 1.81 s, sys: 28 ms, total: 1.84 s
Wall time: 21.1 s
Score : 0.8443396226415094
Searching best svc estimator
Fitting 3 folds for each of 42 candidates, totalling 126 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 126 out of 126 | elapsed:    4.6s finished


CPU times: user 808 ms, sys: 36 ms, total: 844 ms
Wall time: 4.94 s
Score : 0.7830188679245284
Searching best adaboost estimator
Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    4.2s finished


CPU times: user 244 ms, sys: 24 ms, total: 268 ms
Wall time: 4.43 s
Score : 0.8113207547169812
Aggrégation avec VotingClassifier
Score sur le test set : 0.867924528302
Proportion des labels : (array([0, 1, 2, 3, 4]), array([ 76,  35,  22,  10, 122]))
Prédiction sur le test set : [1 4 4 4 2 4 4 4 4 2 4 0 1 4 1 4 4 1 4 4 0 2 0 0 2 0 1 1 0 0 0 4 0 0 0 0 1
 0 0 4 4 1 0 0 4 4 4 4 4 0 4 4 4]
Vrais labels : [1 4 4 4 2 2 2 2 0 2 4 2 1 4 1 4 4 1 4 4 0 2 0 0 2 0 1 1 0 0 0 4 0 0 0 0 1
 0 2 4 4 1 0 0 4 4 4 4 4 0 4 4 3]


## Partie 2 : ICA

On va réutiliser les données de la partie précédente pour la 1ère phase, _bootstraping_

In [55]:
from sklearn.metrics import accuracy_score

print("Estimateur choisi :", best_clf.get_params())

test_idx = sorted(test_idx)
ordering = np.copy(test_idx)

print("Unlabeled indexes : ", test_idx)
#print("Y before :", Y)
Y[test_idx] = best_clf.predict(X_test)
#print("Y after  :", Y)
old_labels = np.zeros_like(Y[test_idx]) - 1
print("Y[test_idx] :", Y[test_idx])
print("true_labels :", true_labels)

max_iter = 5
i = 0
while not np.allclose(old_labels, Y[test_idx]) and i<max_iter:
    print("======= Itération %d =======" % i)
    # On itère sur tous les noeuds qui ont des liens sortants
    np.random.shuffle(ordering)
    for position in ordering:
        noeud = url[position]
        X[position, M:] = 0
        #print("Noeud "+noeud+":",X[position,M:])
        if noeud in network:
            for voisin in network[noeud]:
                vlabel = Y[url_pos[voisin]]
                #print("--- Nouveau voisin de label ",vlabel)
                X[position, M+vlabel] += 1
                #print("---", X[position, M:])
    old_labels = Y[test_idx]
    Y[test_idx] = best_clf.predict(X[test_idx])
    print("Y[test_idx] :", Y[test_idx])
    print("true_labels :", true_labels)
    
    i += 1
    
print("Fini.")
print("Score : ", accuracy_score(Y[test_idx], true_labels))

Estimateur choisi : {'max_depth': 12, 'min_impurity_split': 1e-07, 'verbose': 0, 'n_estimators': 32, 'min_samples_split': 2, 'n_jobs': 1, 'max_leaf_nodes': None, 'class_weight': None, 'bootstrap': True, 'warm_start': False, 'max_features': None, 'criterion': 'gini', 'random_state': None, 'min_weight_fraction_leaf': 0.0, 'oob_score': False, 'min_samples_leaf': 1}
Unlabeled indexes :  [3, 4, 5, 6, 8, 9, 11, 12, 23, 25, 27, 33, 43, 44, 47, 52, 53, 56, 63, 66, 69, 77, 78, 79, 80, 81, 102, 107, 123, 125, 131, 132, 142, 171, 177, 184, 188, 190, 195, 196, 201, 227, 230, 231, 233, 234, 243, 250, 251, 253, 254, 256, 260]
Y[test_idx] : [1 4 4 4 2 4 4 4 4 2 4 0 1 4 1 4 4 1 4 4 0 2 0 0 2 0 1 1 0 0 0 4 0 0 0 0 1
 0 0 4 4 1 0 0 4 4 4 4 4 0 4 4 4]
true_labels : [1 4 4 4 2 2 2 2 0 2 4 2 1 4 1 4 4 1 4 4 0 2 0 0 2 0 1 1 0 0 0 4 0 0 0 0 1
 0 2 4 4 1 0 0 4 4 4 4 4 0 4 4 3]
Y[test_idx] : [4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4]
true_labels