# TME 2 - 2 : Inférence collective

## Partie 1 - Classifieur local

In [87]:
import pandas as pd
import numpy as np

# Configuration constants
path = 'WebKB/'   # root of the WebKB dataset; adjust to the local copy
univ = 'cornell'
percentage_unlabeled = 0.1

# Parse the data.
# The 'content' file holds tab-separated lines: [URL] [attributes: sequence of numbers] [label]
content_file = path + 'content/' + univ + '.content'
content = pd.read_csv(content_file, sep='\t', header=None)
# The 'cites' file holds space-separated links: [destination] [source]
cites_file = path + 'cites/' + univ + '.cites'
cites = pd.read_csv(cites_file, sep=' ', header=None)

# Web pages are identified by their URL (first column); the label is the last
# column and every column in between is an attribute.
M = content.shape[0]          # number of pages
N = content.shape[1] - 2      # number of attribute columns
url = content.iloc[:, 0]
labels = content.iloc[:, -1]
attributes = content.drop(columns=[0, N + 1])

# Preview the first rows of each piece
for header, preview in (("\n=== url ===", url),
                        ("\n=== labels ===", labels),
                        ("\n=== attributes ===", attributes)):
    print(header)
    print(preview.head())


# Draw the pages whose label is hidden (the test set).
# Seed the generator so the split is reproducible on Restart & Run All.
np.random.seed(0)
# replace=False: np.random.choice samples WITH replacement by default, which
# could pick the same URL several times and silently shrink the test set.
unlabeled = np.random.choice(url, size=int(M*percentage_unlabeled), replace=False)
print("\nUnlabeled items:")
print(unlabeled)
# Fixed example page used by the following cells to inspect intermediate results
example_page = "http://simon.cs.cornell.edu/info/people/vogels" #np.random.choice(url)
print("Page d'exemple:", example_page, "dans unlabeled:", example_page in unlabeled)



=== url ===
0                 http://cam.cornell.edu/ph/index.html
1           http://cam.cornell.edu/~baggett/index.html
2                          http://cs-tr.cs.cornell.edu
3    http://cs.cornell.edu/info/courses/current/cs4...
4    http://cs.cornell.edu/info/courses/fall-95/cs4...
Name: 0, dtype: object

=== labels ===
0    student
1    student
2    project
3     course
4     course
Name: 1704, dtype: object

=== attributes ===
   1     2     3     4     5     6     7     8     9     10    ...   1694  \
0     0     0     0     0     0     0     0     0     0     0  ...      0   
1     0     0     0     0     0     0     0     0     1     0  ...      0   
2     0     0     0     0     0     0     0     0     0     0  ...      0   
3     0     0     0     0     0     0     0     0     0     0  ...      0   
4     0     0     0     0     0     0     0     0     0     0  ...      0   

   1695  1696  1697  1698  1699  1700  1701  1702  1703  
0     0     0     0     0     0     0    

In [88]:
# Lookup tables: URL -> label string, label string -> integer id, URL -> row index
url_to_labels = dict(zip(url, labels))
unique_labels = sorted(np.unique(labels))
labels_to_id = dict(zip(unique_labels, range(len(unique_labels))))
url_pos = dict(zip(url, range(M)))
# Singular aliases: other cells refer to these tables as `url_to_label` and
# `label_to_id`; without them those names raise NameError on a fresh kernel.
url_to_label = url_to_labels
label_to_id = labels_to_id

# Build the citation graph from the 'cites' frame:
# column 0 is the destination, column 1 is the source.
# network[src] is the set of pages cited by src.
network = {}
for dest, src in zip(cites.iloc[:, 0], cites.iloc[:, 1]):
    network.setdefault(src, set()).add(dest)

print("Voisins de "+example_page+" :")
if example_page in network:
    print("(url, label, unlabeled)")
    print([(v, labels_to_id[url_to_labels[v]], v in unlabeled) for v in network[example_page]])
else:
    print("Pas de citations depuis cette page")


Voisins de http://simon.cs.cornell.edu/info/people/vogels :
(url, label, unlabeled)
[('http://www.cs.cornell.edu/info/people/tve/tve.html', 1, False), ('http://www.cs.cornell.edu/info/faculty/brian_smith.html', 1, False), ('http://www.cs.cornell.edu/info/people/rvr/rvr.html', 3, False)]


In [89]:
# Count the classes of each page's neighbours.
# In 'count': one row per URL, one column per class (classes in sorted order).
count = np.zeros((M, len(unique_labels)))

# Set membership instead of a linear scan of the numpy array for every neighbour
unlabeled_urls = set(unlabeled)

# Iterate over every node that has outgoing links
for noeud, voisins in network.items():
    position = url_pos[noeud]
    for voisin in voisins:
        # Neighbours whose label is hidden must not contribute to the counts
        if voisin in unlabeled_urls:
            continue
        voisin_label = labels_to_id[url_to_labels[voisin]]
        count[position, voisin_label] += 1

print(count[url_pos[example_page]])

[ 0.  2.  0.  1.  0.]


In [90]:
# Classification: local classifier trained on page attributes + neighbour class counts

from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Older scikit-learn releases only ship the legacy module
    from sklearn.grid_search import GridSearchCV

# Feature matrix: one row per page, attribute columns followed by the count columns
X = np.concatenate((np.array(attributes),count),axis=1)
X = preprocessing.scale(X)
# Integer class id of every page, in the same row order as X
Y = np.array([labels_to_id[lab] for lab in labels])

# Test set = pages whose label was hidden; training set = all the others.
# Sorted index lists keep the row order deterministic.
test_idx = sorted(set(url_pos[u] for u in unlabeled))
train_idx = sorted(set(range(M)) - set(test_idx))

X_train, X_test = X[train_idx], X[test_idx]
Y_train, Y_test = Y[train_idx], Y[test_idx]

# Fixed random_state so the reported score is reproducible across runs
clf = RandomForestClassifier(random_state=0)

clf.fit(X_train, Y_train)
print(clf.score(X_test, Y_test))
# Class distribution over the whole dataset, for reference against the score
print(np.unique(Y, return_counts=True))
print(clf.predict(X_test))
print(Y_test)



0.736842105263
(array([0, 1, 2, 3, 4]), array([42, 32, 19, 19, 83]))
[3 0 4 2 4 1 0 4 4 4 4 1 4 4 0 4 4 0 0]
[3 0 1 4 2 3 0 4 4 4 4 1 4 4 0 4 1 0 0]
