# I) IMPORTING TRAINING AND TESTING SETS

In [1]:
import sklearn
from pandas import read_csv
import os
import time
import networkx as nx
import time
print os.popen("ls").readlines()

train_set = read_csv('r8-train-stemmed.txt', header = None, sep='\t', names = ['label', 'document'])
X_train = train_set['document']
Y_train = train_set['label']

test_set = read_csv('r8-test-stemmed.txt', header = None, sep='\t', names = ['label', 'document'])
X_test = test_set['document']
Y_test = test_set['label']

['Notebook_Graph_of_Words.ipynb\n', 'README.md\n', 'r8-test-stemmed.txt\n', 'r8-train-stemmed.txt\n']


# II) BUILDING THE BAG OF WORDS FOR TRAINING AND TESTING SETS #

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_bag_of_words(X):
    """Fit a TF-IDF vectorizer on the corpus ``X`` and vectorize that corpus.

    Returns a ``(fitted_vectorizer, tfidf_matrix)`` tuple. The fitted
    vectorizer can be reused to transform another corpus with the same
    vocabulary.
    """
    # min_df=1 keeps every term that appears in at least one document.
    tfidf = TfidfVectorizer(min_df=1)
    return (tfidf, tfidf.fit_transform(X))

# Time the construction of both TF-IDF matrices.
tic=time.time()
print '***** Building the BoW for the training set *****'
# The vocabulary and IDF weights are fitted on the training documents only.
(vectorizer_train, X_train_bag_of_words) = get_bag_of_words(X_train)
print '***** Building the BoW for the testing set *****'
# The fitted vectorizer is reused so that test columns match training columns.
X_test_bag_of_words = vectorizer_train.transform(X_test)
toc=time.time()-tic
print 'Done in {0} seconds'.format(round(toc,2))

print X_train_bag_of_words.shape, type(X_train_bag_of_words) # rows = documents / columns = features (one per vocabulary token)
# X_train_bag_of_words is a scipy.sparse CSR matrix (sparse storage)

***** Building the BoW for the training set *****
***** Building the BoW for the testing set *****
Done in 0.76 seconds
(5485, 14575) <class 'scipy.sparse.csr.csr_matrix'>


# III) BUILDING THE GRAPHS OF WORDS FOR TRAINING AND TESTING SETS

## III) 1. Defining all parameters

In [3]:
# Options shared by every graph-of-words cell below.
# isdirected: build nx.DiGraph (True) or nx.Graph (False).
isdirected=False
# isweighted: when True, repeated co-occurrences increase the edge weight.
isweighted=True
# size_window: a word is linked to the (size_window - 1) words preceding it.
size_window=4

# Names of the selectable node-centrality measures; index 0 picks 'pagerank'.
centrality_measures=['pagerank','degree']
centrality_measure=centrality_measures[0]

## III) 2. Functions to build the graphs of words for every document

In [4]:
def get_graph_from_document(doc_str, isdirected, isweighted, size_window):
    """Build the co-occurrence graph (graph of words) of a single document.

    doc_str     -- the document, as one whitespace-separated string
    isdirected  -- build an nx.DiGraph (edges point from the earlier word to
                   the later one) instead of an nx.Graph
    isweighted  -- if true, every repeated co-occurrence adds 1 to the edge
                   weight; otherwise the weight stays at 1
    size_window -- each word is linked to the (size_window - 1) words that
                   precede it; the window is clipped at the document start

    Returns the graph. Nodes exist only for words that take part in at least
    one edge.
    """
    import networkx as nx

    tokens = doc_str.split()
    graph = nx.DiGraph() if isdirected else nx.Graph()

    for position, target in enumerate(tokens):
        window_start = max(position - size_window + 1, 0)
        for source in tokens[window_start:position]:
            if not graph.has_edge(source, target):
                # First time this pair is seen: create the edge.
                graph.add_edge(source, target, weight=1)
            elif isweighted:
                # Pair already linked: count one more co-occurrence.
                graph[source][target]['weight'] += 1

    return graph

def get_gow(corpus, isdirected, isweighted, size_window):
    """Build one graph of words per document of ``corpus``.

    corpus      -- iterable of document strings (list, pandas Series, ...)
    isdirected, isweighted, size_window -- forwarded unchanged to
                   get_graph_from_document

    Returns a dict mapping each document's 0-based position in ``corpus`` to
    its graph.
    """
    dict_graph_of_words = dict()

    # Walk the corpus by position. Looking documents up with corpus[i] is a
    # label lookup on a pandas Series, which fails (or returns the wrong
    # document) as soon as the index is not exactly 0..n-1.
    for position, document in enumerate(corpus):
        dict_graph_of_words[position] = get_graph_from_document(
            document, isdirected, isweighted, size_window)

    return dict_graph_of_words

## III) 3. Building the graph of words for every training document

In [5]:
# Build (and time) one graph of words per training document.
t_begin = time.time()    
print '***** Building the graph of words for every training document *****'
# Echo the graph options so the recorded output documents the configuration.
print 'Directed Graphs : '+ str(isdirected)     
print 'Weighted Graphs : '+ str(isweighted)
print 'Window size : ' + str(size_window)
# Result: dict {document position -> graph}.
dict_graph_of_words = get_gow(X_train,isdirected,isweighted,size_window)
t_end = time.time() - t_begin
print 'Done in {0} seconds'.format(round(t_end,2)) 

***** Building the graph of words for every training document *****
Directed Graphs : False
Weighted Graphs : True
Window size : 4
Done in 4.97 seconds


## III) 4. Functions to compute the graph-based IDF values from the training corpus

In [6]:
def get_graph_from_all_documents(corpus, isdirected, isweighted, size_window):
    """Build a single co-occurrence graph aggregated over a whole corpus.

    corpus      -- iterable of document strings (e.g. X_train)
    isdirected  -- build an nx.DiGraph instead of an nx.Graph
    isweighted  -- if true, every repeated co-occurrence (within or across
                   documents) adds 1 to the edge weight
    size_window -- each word is linked to the (size_window - 1) words that
                   precede it inside the same document

    Returns the aggregated graph. Windows never span two documents.
    """
    import networkx as nx

    G = nx.DiGraph() if isdirected else nx.Graph()

    # Iterate over the documents themselves: this avoids label-based corpus[i]
    # lookups (which break on a pandas Series whose index is not 0..n-1) and
    # removes the need for a document counter that the window loop could shadow.
    for document in corpus:
        doc_array = document.split()

        for j, target in enumerate(doc_array):
            for source in doc_array[max(j - size_window + 1, 0):j]:
                if not G.has_edge(source, target):
                    # New pair: create the edge with weight 1.
                    G.add_edge(source, target, weight=1)
                elif isweighted:
                    # Known pair: count one more co-occurrence.
                    G[source][target]['weight'] += 1

    return G

def idf_graph_words_corpus(graph_all_docs, vocab, centrality_measure):
    """Compute a graph-based IDF value for every vocabulary word.

    graph_all_docs     -- graph of words aggregated over the training corpus
    vocab              -- dict {word: column index}
    centrality_measure -- 'pagerank', 'degree' (degree centrality) or
                          'betweenness' (weighted betweenness centrality)

    Returns a dict {column index: idf_graph(centrality, graph_all_docs)}.
    Words that are missing from ``vocab`` or whose centrality is not strictly
    positive are left out, because no IDF can be computed for them.

    Raises ValueError for an unknown ``centrality_measure``.
    """
    if centrality_measure == 'pagerank':
        centralities = nx.pagerank(graph_all_docs, max_iter=150)
    elif centrality_measure == 'degree':
        # 'degree' now really is degree centrality. It used to call
        # betweenness_centrality, whose zero scores make idf_graph divide by
        # zero; that computation remains available as 'betweenness'.
        centralities = nx.degree_centrality(graph_all_docs)
    elif centrality_measure == 'betweenness':
        centralities = nx.betweenness_centrality(graph_all_docs, weight='weight')
    else:
        raise ValueError('Unknown centrality measure: {0!r}'.format(centrality_measure))

    dict_idf_graph = dict()
    for word, centrality_value in centralities.items():
        # The graph is built from whitespace tokens, so it can contain words
        # that the vectorizer's vocabulary does not know: skip them.
        if word not in vocab:
            continue
        # log(N / centrality) is undefined for a non-positive centrality.
        if centrality_value <= 0:
            continue
        dict_idf_graph[vocab[word]] = idf_graph(centrality_value, graph_all_docs)

    return dict_idf_graph

def idf_graph(centrality_value,graph_all_docs):
    """Turn a node centrality into an IDF-like weight.

    Returns log(N / centrality_value), where N is the number of nodes of
    ``graph_all_docs``.
    """
    import numpy as np
    node_count = graph_all_docs.number_of_nodes()
    # float() forces true division even when both operands are integers.
    ratio = node_count / float(centrality_value)
    return np.log(ratio)

## III) 5. Create a graph for the whole corpus of documents

In [7]:
# Build (and time) the single graph aggregated over all training documents.
tic = time.time()
print '***** Building a graph of words for the whole training corpus *****'
# Echo the graph options so the recorded output documents the configuration.
print 'Directed Graphs : '+ str(isdirected)     
print 'Weighted Graphs : '+ str(isweighted)
print 'Window size : ' + str(size_window)
graph_all_docs=get_graph_from_all_documents(X_train,isdirected,isweighted,size_window)
toc = time.time() - tic
print 'Number of nodes in the graph : '+ str(graph_all_docs.number_of_nodes()) # expected to match the size of the TF-IDF vocabulary
print 'Done in {0} seconds'.format(round(toc,2))  

***** Building a graph of words for the whole training corpus *****
Directed Graphs : False
Weighted Graphs : True
Window size : 4
Number of nodes in the graph : 14575
Done in 3.42 seconds


## III) 6. Find the graph-based IDF values for the graph of documents

In [8]:
# Word -> column index mapping learned by the TF-IDF vectorizer; reusing it
# keeps graph-of-words columns aligned with bag-of-words columns.
vocab_train = vectorizer_train.vocabulary_
t_begin = time.time()  
print '***** Computing the graph-based IDF values for the training corpus *****'
# Echo the graph options so the recorded output documents the configuration.
print 'Directed Graphs : '+ str(isdirected)     
print 'Weighted Graphs : '+ str(isweighted)
print 'Window size : ' + str(size_window)
# Result: dict {column index -> graph-based IDF value}.
dict_idf_graph=idf_graph_words_corpus(graph_all_docs,vocab_train,centrality_measure)
t_end = time.time() - t_begin
print 'Done in {0} seconds'.format(round(t_end,2))

***** Computing the graph-based IDF values for the training corpus *****
Directed Graphs : False
Weighted Graphs : True
Window size : 4
Done in 28.94 seconds


## III) 7. Define a function that builds the graph-based TW-IDF matrix for a given corpus

In [9]:
def gow_to_sparse_matrix(dict_graph_of_words, shape_bag_of_words, vocab,dict_idf_graph, centrality_measure):
    """Build the graph-based TW-IDF matrix of a corpus.

    dict_graph_of_words -- dict {document position: graph of words}, with
                           keys 0..n-1
    shape_bag_of_words  -- (n_documents, n_features) shape of the result
    vocab               -- dict {word: column index}
    dict_idf_graph      -- dict {column index: graph-based IDF value}
    centrality_measure  -- 'pagerank', 'degree' (degree centrality) or
                           'betweenness' (weighted betweenness centrality)

    Entry (i, j) is centrality(word j in document i) * IDF(word j). Words
    that are not in ``vocab`` or that have no IDF value are left at zero.

    Returns a scipy.sparse CSR matrix.
    Raises ValueError for an unknown ``centrality_measure``.
    """
    from scipy.sparse import dok_matrix

    # Resolve the measure once, up front, so that a typo fails immediately
    # with a clear message instead of an unbound-variable error in the loop.
    if centrality_measure == 'pagerank':
        compute_centrality = lambda graph: nx.pagerank(graph, max_iter=150)
    elif centrality_measure == 'degree':
        # 'degree' now really is degree centrality (it used to call
        # betweenness_centrality, still available as 'betweenness').
        compute_centrality = lambda graph: nx.degree_centrality(graph)
    elif centrality_measure == 'betweenness':
        compute_centrality = lambda graph: nx.betweenness_centrality(graph, weight='weight')
    else:
        raise ValueError('Unknown centrality measure: {0!r}'.format(centrality_measure))

    X_graph_of_words = dok_matrix(shape_bag_of_words)
    nb_documents = len(dict_graph_of_words)

    for i in range(nb_documents):
        if (i%1000 == 0):
            print('{0} out of {1}'.format(i, nb_documents))

        for word, centrality_value in compute_centrality(dict_graph_of_words[i]).items():
            if word not in vocab:
                continue
            column = vocab[word]
            # A vocabulary word can lack an IDF value (e.g. it never appeared
            # in an edge of the corpus graph): skip it instead of raising.
            idf_value = dict_idf_graph.get(column)
            if idf_value is None:
                continue
            X_graph_of_words[i, column] = centrality_value * idf_value
            # A different combination of the two terms (e.g. logarithmic
            # weighting) could be used here.

    return X_graph_of_words.tocsr()

## III) 8. Compute the graph-based TW-IDF matrix for training set

In [10]:
# Build (and time) the graph-based TW-IDF matrix of the training set.
tic = time.time()
print '***** Computing the graph-TW-IDF matrix for the training corpus *****'
# Echo the options so the recorded output documents the configuration.
print 'Directed Graphs : '+ str(isdirected)     
print 'Weighted Graphs : '+ str(isweighted)
print 'Window size : ' + str(size_window)
print 'Centrality measure : ' + str(centrality_measure)
# Same shape as the TF-IDF matrix, so both representations share columns.
X_train_graph_of_words = gow_to_sparse_matrix(dict_graph_of_words, X_train_bag_of_words.shape, vocab_train, dict_idf_graph,centrality_measure)
toc = time.time() - tic
print 'Done in {0} seconds'.format(round(toc,2))

***** Computing the graph-TW-IDF matrix for the training corpus *****
Directed Graphs : False
Weighted Graphs : True
Window size : 4
Centrality measure : pagerank
0 out of 5485
1000 out of 5485
2000 out of 5485
3000 out of 5485
4000 out of 5485
5000 out of 5485
Done in 63.61 seconds


## III) 9. Compute the graph-based TW-IDF matrix for the testing set

In [12]:
# Build (and time) the graph-based TW-IDF matrix of the testing set.
tic = time.time()
print '***** Computing the graph-TW-IDF matrix for the testing corpus *****'
# Echo the options so the recorded output documents the configuration.
print 'Directed Graphs : '+ str(isdirected)     
print 'Weighted Graphs : '+ str(isweighted)
print 'Window size : ' + str(size_window)
print 'Centrality measure : ' + str(centrality_measure)
# Per-document graphs are built from the test documents ...
dict_test_graph_of_words = get_gow(X_test,isdirected,isweighted,size_window)
# ... but the vocabulary and the IDF values still come from the training set.
vocab_train = vectorizer_train.vocabulary_
X_test_graph_of_words = gow_to_sparse_matrix(dict_test_graph_of_words, (len(X_test), len(vocab_train)), vocab_train, dict_idf_graph, centrality_measure)
toc = time.time() - tic
print 'Done in {0} seconds'.format(round(toc,2))

***** Computing the graph-TW-IDF matrix for the testing corpus *****
Directed Graphs : False
Weighted Graphs : True
Window size : 4
Centrality measure : pagerank
0 out of 2189
1000 out of 2189
2000 out of 2189
Done in 26.54 seconds


# IV) CLASSIFICATION PARAMETERS

In [22]:
# Switches read by the classification cells below.
use_feature_selection = False # If True: Recursive Feature Elimination (RFE)
nb_features_to_select = 300 # number of features kept by RFE
with_SVD=False # Train on the SVD-reduced matrices instead of the full sparse ones
graph_features=False # Set to True to plot CV score vs. number of features (RFECV, no SVD)

# Available classifiers; index 0 selects 'linearSVM'.
classifiers_text=['linearSVM','gaussianSVM','logistic_reg','Adaboost']
string_classifier=classifiers_text[0]

# The RBF SVM cell reads only the SVD-reduced matrices, so SVD is forced on.
if string_classifier=='gaussianSVM':
    with_SVD=True
# Feature selection runs on the non-reduced matrices, so SVD is forced off.
# NOTE(review): with 'gaussianSVM' and use_feature_selection both set, SVD
# ends up off although the RBF cell needs the reduced matrices - confirm
# that this combination is never used.
if use_feature_selection:
    with_SVD=False

# V) DIMENSIONALITY REDUCTION : SVD Decomposition

In [21]:
# Optional dimensionality reduction: one truncated SVD per representation,
# fitted on the training matrix and then applied to train and test matrices.
if with_SVD :
    from sklearn.decomposition import TruncatedSVD # requires scikit-learn upgraded to 0.15.2
    n_components=500
    
    print '***** Computing SVD Decomposition for BoW and GoW with {0} components *****'.format(n_components)
    tic = time.time()
    # Fixed random_state so that the decomposition is reproducible.
    svd_bag = TruncatedSVD(n_components = n_components, n_iter = 5, random_state=42)
    svd_graph = TruncatedSVD(n_components = n_components, n_iter = 5, random_state=42)
    svd_bag.fit(X_train_bag_of_words)
    svd_graph.fit(X_train_graph_of_words)
    toc = time.time() - tic
    print 'Done in {0} seconds'.format(round(toc,2))
    
    # Share of the variance retained by the kept components, in percent.
    print 'Explained variance ratio for BoW: ' + str(100*svd_bag.explained_variance_ratio_.sum())+ ' %'
    print 'Explained variance ratio for GoW : ' + str(100*svd_graph.explained_variance_ratio_.sum())+ ' %'
    
    X_bag_svd_reduced = svd_bag.transform(X_train_bag_of_words)
    X_graph_svd_reduced = svd_graph.transform(X_train_graph_of_words)
    
    # Reduce the test set with the decompositions fitted on the training set
    print '***** SVD-Transform of X_test *****'
    tic = time.time()
    X_test_bag_svd_reduced = svd_bag.transform(X_test_bag_of_words)
    X_test_graph_svd_reduced = svd_graph.transform(X_test_graph_of_words)
    print 'Done in {0} seconds'.format(round(time.time() - tic,2))

***** Computing SVD Decomposition for BoW and GoW with 500 components *****
Done in 10.9 seconds
Explained variance ratio for BoW: 58.6360111278 %
Explained variance ratio for GoW : 80.5184797473 %
***** SVD-Transform of X_test *****
Done in 0.18 seconds


# VI) CLASSIFICATION TASK

In [24]:
print '***** Classification task *****'

# In[12]:

from sklearn import cross_validation
from sklearn import svm
from sklearn.cross_validation import KFold
import numpy as np
import operator

Y = Y_train
cv=KFold(X_train.shape[0], n_folds=5,shuffle=True)

***** Classification task *****


## VI) 1. RBF SVM

In [34]:
# RBF-kernel SVM on the SVD-reduced matrices: grid-search (C, gamma) by
# cross-validation, then refit on the whole training set, once for the bag of
# words and once for the graph of words.
if (string_classifier=='gaussianSVM'): 

    # BAG OF WORDS
    X = X_bag_svd_reduced
    # score_cv maps (C, gamma) -> mean cross-validated weighted F1 score.
    score_cv = dict()
    gamma_list = [0.1] # 1.0 and 10.0 were dropped: the search takes too much time otherwise
    C_list = [0.1] # 1.0 and 10.0 were dropped for the same reason
    print '***** Cross-validation for RBF SVM and BoW *****'
    tic = time.time()
    for gamma_cv in gamma_list:
        for C_cv in C_list:
            print 'Gamma : {0}/{1}, Cost : {2}/{3}'.format(gamma_list.index(gamma_cv)+1,len(gamma_list),C_list.index(C_cv)+1,len(C_list))
            clf = svm.SVC(kernel='rbf', C=C_cv, gamma = gamma_cv,class_weight='auto')
            score_cv[(C_cv, gamma_cv)] = np.mean(cross_validation.cross_val_score(clf, X, Y, cv=cv, scoring='f1_weighted'))
            
    # Keep the (C, gamma) pair with the highest mean score.
    (C_opt, gamma_opt) = max(score_cv.iteritems(), key=operator.itemgetter(1))[0]
    print 'Maximum f1 score obtained for Cost ' + str(C_opt)+','+ ' Gamma '+str(gamma_opt)+' : '+ str(100*max(score_cv.values()))+ ' %'
    
    # Refit on the full training set with the selected parameters.
    clf_bag_rbf = svm.SVC(kernel='rbf', C=C_opt, gamma = gamma_opt,class_weight='auto')
    clf_bag_rbf.fit(X,Y)
    print 'Done in {0} seconds'.format(time.time()-tic)
    
    
    # GRAPH OF WORDS
    X = X_graph_svd_reduced
    score_cv = dict()
    gamma_list = [0.1] # 1.0 and 10.0 were dropped (too slow)
    C_list = [0.1] # 1.0 and 10.0 were dropped (too slow)
    print '***** Cross-validation for RBF SVM and GoW *****'
    tic = time.time()
    for gamma_cv in gamma_list:
        for C_cv in C_list:
            print 'Gamma : {0}, Cost : {1}'.format(gamma_cv,C_cv)
            clf = svm.SVC(kernel='rbf', C=C_cv, gamma = gamma_cv,class_weight='auto') # Watch out for the class_weight setting
            score_cv[(C_cv, gamma_cv)] = np.mean(cross_validation.cross_val_score(clf, X, Y, cv=cv, scoring='f1_weighted'))
            
    (C_opt, gamma_opt) = max(score_cv.iteritems(), key=operator.itemgetter(1))[0]
    print 'Maximum f1 score obtained for Cost ' + str(C_opt)+','+ ' Gamma '+str(gamma_opt)+' : '+ str(100*max(score_cv.values()))+ ' %'
    
    clf_graph_rbf = svm.SVC(kernel='rbf', C=C_opt, gamma = gamma_opt,class_weight='auto')
    clf_graph_rbf.fit(X,Y)
    print 'Done in {0} seconds'.format(time.time()-tic)

***** Cross-validation for RBF SVM and BoW *****
Gamma : 1/1, Cost : 1/1
Maximum f1 score obtained for Cost 0.1, Gamma 0.1 : 12.437740681 %
Done in 340.555930138 seconds
***** Cross-validation for RBF SVM and GoW *****
Gamma : 0.1, Cost : 0.1
Maximum f1 score obtained for Cost 0.1, Gamma 0.1 : 78.5717272712 %
Done in 280.953822851 seconds


## VI) 2. Linear SVM without Recursive Feature Elimination

In [28]:
# Linear SVM without feature selection: cross-validate the cost C, then refit
# on the whole training set, once per representation.
if (string_classifier=='linearSVM')&(not use_feature_selection):

    # Cross-validate for cost parameter C
    # score_cv maps C -> mean cross-validated weighted F1 score. The same dict
    # serves both passes: the second pass overwrites every key of C_list.
    score_cv = dict()
    C_list = [0.1,1.0,10.0]
    
    # Bag of Words
    X=X_train_bag_of_words
    if with_SVD:
        X=X_bag_svd_reduced
        
    print '***** Cross-validation for linear SVM and BoW *****'
    tic=time.time()
    for C_cv in C_list:
            print 'Cost : {0}/{1}'.format(C_list.index(C_cv)+1,len(C_list))
            clf=svm.LinearSVC(C=C_cv,class_weight='auto')
            score_cv[(C_cv)] = np.mean(cross_validation.cross_val_score(clf, X, Y, cv=cv, scoring='f1_weighted'))
    
    # Keep the cost with the highest mean score.
    C_opt = max(score_cv.iteritems(), key=operator.itemgetter(1))[0]
    print 'Maximum f1 score obtained for Cost ' + str(C_opt)+' : '+ str(100*max(score_cv.values()))+ ' %'
    
    # Refit on the full training set with the selected cost.
    clf_bag=svm.LinearSVC(C=C_opt,class_weight='auto')
    clf_bag.fit(X,Y)
    print 'Done in {0} seconds'.format(time.time()-tic)
    
    # Graph of words
    X=X_train_graph_of_words
    if with_SVD:
        X=X_graph_svd_reduced
    print '***** Cross-validation for linear SVM and GoW *****'
    tic=time.time()
    for C_cv in C_list:
            print 'Cost : {0}/{1}'.format(C_list.index(C_cv)+1,len(C_list))
            clf=svm.LinearSVC(C=C_cv,class_weight='auto') # class_weight auto gives relative importance to classes
            score_cv[(C_cv)] = np.mean(cross_validation.cross_val_score(clf, X, Y, cv=cv, scoring='f1_weighted'))
    
    C_opt = max(score_cv.iteritems(), key=operator.itemgetter(1))[0]
    print 'Maximum f1 score obtained for Cost ' + str(C_opt)+' : '+ str(100*max(score_cv.values()))+ ' %'
    
    clf_graph=svm.LinearSVC(C=C_opt,class_weight='auto')
    clf_graph.fit(X,Y)
    print 'Done in {0} seconds'.format(time.time()-tic)

***** Cross-validation for linear SVM and BoW *****
Cost : 1/3
Cost : 2/3
Cost : 3/3
Maximum f1 score obtained for Cost 10.0 : 97.1114829082 %
Done in 3.71687793732 seconds
***** Cross-validation for linear SVM and GoW *****
Cost : 1/3
Cost : 2/3
Cost : 3/3
Maximum f1 score obtained for Cost 10.0 : 97.2384260877 %
Done in 6.9106631279 seconds


## VI) 3. Linear SVM with Recursive Feature Elimination

In [30]:
# Cross-validate for cost parameter C
score_cv = dict()
C_list = [0.1,1.0,10.0]

if (string_classifier=='linearSVM')&use_feature_selection:
    
    ###### Bag of words on feature-cleaned data #######
    from sklearn.feature_selection import RFE
    print '***** Cross-validation for linearSVM and BoW, without SVD, with feature selection *****'
    
    clf=svm.LinearSVC(C=1.0,class_weight='auto')
    tic = time.time()
    nb_features_to_remove_at_each_step = 200 # We don't want it to be too slow
    rfecv_bag = RFE(clf, nb_features_to_select, step=nb_features_to_remove_at_each_step)
    rfecv_bag.fit(X_train_bag_of_words, Y)
    toc = round(time.time() - tic,2)
    print '{0}/{1} features selected'.format(rfecv_bag.n_features_,X_train_bag_of_words.shape[1])
    print 'Done in {0} seconds'.format(toc)
    X = rfecv_bag.transform(X_train_bag_of_words)
    X_test_bag_feature_selected = rfecv_bag.transform(X_test_bag_of_words)
    
    print
    tic=time.time()
    for C_cv in C_list:
            print 'Cost : {0}/{1}'.format(C_list.index(C_cv)+1,len(C_list))
            clf=svm.LinearSVC(C=C_cv,class_weight='auto')
            score_cv[(C_cv)] = np.mean(cross_validation.cross_val_score(clf, X, Y, cv=cv, scoring='f1_weighted'))
            
    C_opt = max(score_cv.iteritems(), key=operator.itemgetter(1))[0]
    print 'Maximum f1 score obtained for Cost ' + str(C_opt)+' : '+ str(100*max(score_cv.values()))+ ' %'
    
    clf_bag_non_reduced=svm.LinearSVC(C=C_opt,class_weight='auto')
    clf_bag_non_reduced.fit(X,Y)
    print 'Done in {0} seconds'.format(time.time()-tic)

    ###### Graph of words on feature-cleaned data #######
    print '***** Cross-validation for linearSVM and GoW, without SVD, with feature selection *****'
    clf=svm.LinearSVC(C=1.0,class_weight='auto')
    tic = time.time()
    nb_features_to_remove_at_each_step = 200 # We don't want it to be too slow
    rfecv_graph = RFE(clf, nb_features_to_select, step=nb_features_to_remove_at_each_step)
    rfecv_graph.fit(X_train_graph_of_words, Y)
    toc = round(time.time() - tic,2)
    print '{0}/{1} features selected'.format(rfecv_graph.n_features_,X_train_bag_of_words.shape[1])
    print 'Done in {0} seconds'.format(toc)    
    X = rfecv_graph.transform(X_train_bag_of_words)
    X_test_graph_feature_selected = rfecv_graph.transform(X_test_graph_of_words)

    tic=time.time()
    for C_cv in C_list:
            print 'Cost : {0}/{1}'.format(C_list.index(C_cv)+1,len(C_list))
            clf=svm.LinearSVC(C=C_cv,class_weight='auto')
            score_cv[(C_cv)] = np.mean(cross_validation.cross_val_score(clf, X, Y, cv=cv, scoring='f1_weighted'))
            
    C_opt = max(score_cv.iteritems(), key=operator.itemgetter(1))[0]
    print 'Maximum f1 score obtained for Cost ' + str(C_opt)+' : '+ str(100*max(score_cv.values()))+ ' %'
    
    clf_graph_non_reduced=svm.LinearSVC(C=C_opt,class_weight='auto')
    clf_graph_non_reduced.fit(X,Y)
    print 'Done in {0} seconds'.format(time.time()-tic)

***** Cross-validation for linearSVM and BoW, without SVD, with feature selection *****
300/14575 features selected
Done in 13.81 seconds

Cost : 1/3
Cost : 2/3
Cost : 3/3
Maximum f1 score obtained for Cost 10.0 : 97.6301889293 %
Done in 1.42059397697 seconds
***** Cross-validation for linearSVM and GoW, without SVD, with feature selection *****
300/14575 features selected
Done in 33.14 seconds
Cost : 1/3
Cost : 2/3
Cost : 3/3
Maximum f1 score obtained for Cost 10.0 : 97.6482536078 %
Done in 1.35689687729 seconds


## VI) 4. Cross-validation score vs number of features selected

In [None]:
# Optional diagnostic: plot the cross-validated F1 score against the number
# of features kept by recursive feature elimination (RFECV), for the bag of
# words and then for the graph of words. Runs only when both graph_features
# and use_feature_selection are set.
if graph_features:
    ## Bag of words on non-SVD-reduced data and with LinearSVM
    X=X_train_bag_of_words
    if use_feature_selection:
        print 'Cross-validation error plot for linearSVM and Bag of Words, without SVD, with feature selection'
        # Each candidate feature count is scored with the weighted F1
        # measure (scoring='f1_weighted').
        from sklearn.feature_selection import RFECV
        from sklearn.cross_validation import StratifiedKFold
        # NOTE(review): this run uses class_weight=None and shuffled folds,
        # while the graph-of-words run below uses class_weight='auto' and
        # unshuffled folds - confirm that the difference is intended.
        clf=svm.LinearSVC(C=1.0,class_weight=None)
        # Number of features removed at each elimination step.
        step=300
        rfecv = RFECV(clf, step, cv=StratifiedKFold(Y,3, shuffle=True),scoring='f1_weighted')
        tic = time.time()
        rfecv.fit(X, Y)
        toc = round(time.time() - tic,2)
        print('Optimal number of features : {0} (in {1} seconds)'.format(rfecv.n_features_,toc))
        
        from matplotlib import pyplot as plt
        # Plot number of features VS. cross-validation scores
        plt.figure()
        plt.xlabel("Number of features selected divided by step size")
        plt.ylabel("Cross validation f1 score for Bag of Words")
        plt.plot(range(1, len(rfecv.grid_scores_)+1), rfecv.grid_scores_)
        plt.show()
    
    ## Graph of words on non-SVD-reduced data and with LinearSVM
    X=X_train_graph_of_words
    if use_feature_selection:
        print 'Cross-validation error plot for linearSVM and Graph of Words, without SVD, with feature selection'
        # Each candidate feature count is scored with the weighted F1
        # measure (scoring='f1_weighted').
        clf=svm.LinearSVC(C=1.0,class_weight='auto')
        # NOTE(review): `step` is assigned but the RFECV call below passes
        # the literal 300 instead.
        step=300
        rfecv = RFECV(clf, 300, cv=StratifiedKFold(Y,3),scoring='f1_weighted')
        tic = time.time()
        rfecv.fit(X, Y)
        toc = round(time.time() - tic,2)
        print('Optimal number of features : {0} (in {1} seconds)'.format(rfecv.n_features_,toc))
    
        # Plot number of features VS. cross-validation scores
        plt.figure()
        plt.xlabel("Number of features selected divided by step size")
        plt.ylabel("Cross validation f1 score for Graph of Words")
        plt.plot(range(1, len(rfecv.grid_scores_)+1), rfecv.grid_scores_)
        plt.show()

Cross-validation error plot for linearSVM and Bag of Words, without SVD, with feature selection
Optimal number of features : 7675 (in 28.82 seconds)

## VI) 5. Logistic regression

In [36]:
# Logistic regression on the non-reduced matrices: cross-validate the inverse
# regularization strength C, then refit on the whole training set, once per
# representation.
if string_classifier=='logistic_reg':

    from sklearn.linear_model import LogisticRegression
    # score_cv maps C -> mean cross-validated weighted F1 score. The same dict
    # serves both passes: the second pass overwrites every key of C_list.
    score_cv = dict()
    C_list = [0.1,1.0,5.0,10.0,50.0,100.0]
    
    # Bag of Words
    X=X_train_bag_of_words
    for C_cv in C_list:
            print 'Cost : {0}/{1}'.format(C_list.index(C_cv)+1,len(C_list))
            clf=LogisticRegression(C=C_cv,class_weight='auto') # author's note: class_weight=None gives better results
            score_cv[(C_cv)] = np.mean(cross_validation.cross_val_score(clf, X, Y, cv=cv, scoring='f1_weighted'))
    # Keep the C with the highest mean score.
    C_opt = max(score_cv.iteritems(), key=operator.itemgetter(1))[0]
    print 'Maximum f1 score obtained for Cost ' + str(C_opt)+' : '+ str(100*max(score_cv.values()))+ ' %'

    # Refit on the full training set with the selected C.
    clf_bag=LogisticRegression(C=C_opt,class_weight='auto')
    clf_bag.fit(X,Y)

    # Graph of Words
    X=X_train_graph_of_words
    for C_cv in C_list:
            print 'Cost : {0}/{1}'.format(C_list.index(C_cv)+1,len(C_list))
            clf=LogisticRegression(C=C_cv,class_weight='auto') # author's note: class_weight=None gives better results
            score_cv[(C_cv)] = np.mean(cross_validation.cross_val_score(clf, X, Y, cv=cv, scoring='f1_weighted'))
    C_opt = max(score_cv.iteritems(), key=operator.itemgetter(1))[0]
    print 'Maximum f1 score obtained for Cost ' + str(C_opt)+' : '+ str(100*max(score_cv.values()))+ ' %'

    clf_graph=LogisticRegression(C=C_opt,class_weight='auto')
    clf_graph.fit(X,Y)  

Cost : 1/6
Cost : 2/6
Cost : 3/6
Cost : 4/6
Cost : 5/6
Cost : 6/6
Maximum f1 score obtained for Cost 100.0 : 97.0330960226 %
Cost : 1/6
Cost : 2/6
Cost : 3/6
Cost : 4/6
Cost : 5/6
Cost : 6/6
Maximum f1 score obtained for Cost 100.0 : 97.3299228304 %


## VI) 6. Adaboost

In [38]:
# AdaBoost with scikit-learn's default settings (the author notes: 50 decision
# trees, slow method). No cross-validation is run: one model is fitted per
# representation on the non-reduced training matrices.
if string_classifier=='Adaboost':   
    from sklearn.ensemble import AdaBoostClassifier

    # Bag of words
    clf_bag=AdaBoostClassifier()
    clf_bag.fit(X_train_bag_of_words,Y)

    # Graph of words
    clf_graph=AdaBoostClassifier()
    clf_graph.fit(X_train_graph_of_words,Y)

# VII) COMPUTE THE TEST LABELS AND SCORES 

In [40]:
## Test labels
if with_SVD:
    if string_classifier=='gaussianSVM':
        Y_test_bag=clf_bag_rbf.predict(X_test_bag_svd_reduced) # Data needs to be reduced with Gaussian SVM
        Y_test_graph=clf_graph_rbf.predict(X_test_graph_svd_reduced)
    else:
        Y_test_bag=clf_bag.predict(X_test_bag_svd_reduced)
        Y_test_graph=clf_graph.predict(X_test_graph_svd_reduced)

else:
    if use_feature_selection:
        Y_test_bag=clf_bag_non_reduced.predict(X_test_bag_feature_selected)
        Y_test_graph=clf_graph_non_reduced.predict(X_test_graph_feature_selected)
    else:
        Y_test_bag=clf_bag.predict(X_test_bag_of_words)
        Y_test_graph=clf_graph.predict(X_test_graph_of_words)

## Compute the different metrics
string_svd='No SVD'
if with_SVD:
    string_svd='With SVD, '+ str(n_components)+ ' components'
string_directed='Undirected graphs'
if isdirected:
     string_directed='Directed graphs'
string_weighted='Unweighted graphs'
if isweighted:
     string_weighted='Weighted graphs'

# Print to output file
import sys
orig_stdout = sys.stdout
f = file('NewResults.txt', 'a')
sys.stdout = f

print '***************************'
print 'BAG-OF-WORDS APPROACH'
print 'SVD : '+ str(with_SVD)
print 'Feature Selection : ' + str(use_feature_selection)
print string_classifier
print
print 'Micro-averaging : ' + str(100*sklearn.metrics.precision_score(Y_test,Y_test_bag,average='micro'))+' %'
print 'Macro-averaging : ' + str(100*sklearn.metrics.precision_score(Y_test,Y_test_bag,average='macro'))+' %'
print 'Weighted-averaging : ' + str(100*sklearn.metrics.precision_score(Y_test,Y_test_bag,average='weighted'))+' %'
print '***************************'

print
print '***************************'
print 'GRAPH-OF-WORDS APPROACH'
print 'SVD : '+ str(with_SVD)
print 'Feature Selection : ' + str(use_feature_selection)
print string_classifier
print string_directed
print string_weighted
print 'Window size : '+str(size_window)
print 'Centrality measure : '+ centrality_measure
print
print 'Micro-averaging : ' + str(100*sklearn.metrics.precision_score(Y_test,Y_test_graph,average='micro'))+' %'
print 'Macro-averaging : ' + str(100*sklearn.metrics.precision_score(Y_test,Y_test_graph,average='macro'))+' %'
print 'Weighted-averaging : ' + str(100*sklearn.metrics.precision_score(Y_test,Y_test_graph,average='weighted'))+' %'
print '***************************'

sys.stdout = orig_stdout
f.close()

# Comments :
- As expected, linear SVMs perform best
- Best results for weighted, undirected graphs, 3 neighbors (window size 4), 'pagerank' centrality measure, class weighting, linear SVMs
- More computation requirements for 'pagerank', no feature selection, but $\textbf{linear SVMs are fast}$ and do not require dimensionality reduction here
- Gaussian SVMs are computationally inefficient and give poor results
- Logistic regression performs almost as well, with good cross-validation scores using the same parameters
- Adaboost reaches $ \approx$ 79\% f1-score with 50 decision trees, but slow method