# Graph of Docs

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

from GraphOfDocs import utils
from GraphOfDocs import select

from GraphOfDocs.neo4j_wrapper import Neo4jDatabase
from collections import Counter


%matplotlib inline

In [2]:
database = Neo4jDatabase('bolt://localhost:7687', 'neo4j', '123')

In [3]:
doc_communities = select.get_document_communities(database)
filtered_doc_communities = [doc_community for doc_community in doc_communities if doc_community[2] > 1]
selected_docs = sum([docs for _, docs, _ in filtered_doc_communities], [])
doc_communities_dict = {community_id: docs for community_id, docs, number_of_docs in filtered_doc_communities}
doc_to_community_dict = {doc: community_id for community_id, doc_community, _ in filtered_doc_communities for doc in doc_community}

In [4]:
# Read dataset, clean dataset and create a pandas dataframe of the dataset.
dataset = utils.read_dataset('GraphOfDocs/data/20news-18828-all/')
le = LabelEncoder()
# The class of each document can be found by simply split (character '_') its filename. E.g. comp.sys.mac.hardware_51712.
le.fit([file[0].split('_')[0] for file in dataset])
# Tuple: file identifier, file class, file class number, file text
clean_dataset = [(file[0], file[0].split('_')[0], le.transform([file[0].split('_')[0]])[0], ' '.join(utils.generate_words(file[1], extend_window=True, remove_stopwords=True, lemmatize=False, stemming=False))) for file in dataset]
df = pd.DataFrame(clean_dataset, columns=['identifier', 'class', 'class_number', 'text'])

df_not_selected = df[~df['identifier'].isin(selected_docs)]
df = df[df['identifier'].isin(selected_docs)]

#df_not_selected = shuffle(df_not_selected, random_state=42)
df = shuffle(df, random_state=42)



#df = df_not_selected


df.head(2)


Unnamed: 0,identifier,class,class_number,text
11876,rec.autos_102735,rec.autos,7,rdb1 cbnewsj att ronald deblock changing oil s...
17000,comp.windows.x_68077,comp.windows.x,5,boutilie rtsg mot eric boutilier looking windo...


In [5]:
X = df['text']
y = df['class_number']

In [6]:
def benchmark_classifier(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(accuracy_score(y_test, y_pred))
    return clf

## Running experiments with bag-of-words and widely-used classifiers.

In [7]:
cv = CountVectorizer()
# bag-of-words
X_transformed = cv.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.33, random_state=42)
print('Train size %s' % X_train.shape[0])
print('Test size %s' % X_test.shape[0])
print('Number of features %s' % X_test.shape[1])

Train size 7778
Test size 3832
Number of features 71370


In [8]:
bag_of_words_classifiers = [
    ('Naive Bayes', MultinomialNB()),
    ('Logistic Regression', LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')),
    ('5-NN', KNeighborsClassifier(n_neighbors=5)),
    ('2-NN', KNeighborsClassifier(n_neighbors=2)),
    ('1-NN', KNeighborsClassifier(n_neighbors=1)),
    ('Neural Network 100x50', MLPClassifier(solver='adam', hidden_layer_sizes=(100, 50), random_state=42)),
    ('Neural Network 500x250', MLPClassifier(solver='adam', hidden_layer_sizes=(500, 250), random_state=42)),
    ('Neural Network 1000x500', MLPClassifier(solver='adam', hidden_layer_sizes=(1000, 500), random_state=42)),
]

for classifier in bag_of_words_classifiers:
    print(classifier[0])
    benchmark_classifier(classifier[1], X_train, y_train, X_test, y_test)

#print('Naive Bayes 0.8699710331509495')
#print('Logistic Regression 0.8896041197296427')
#print('5-NN 0.46057289990344386')
#print('2-NN 0.4652397811393627')
#print('1-NN 0.6094303186353396')
#print('Neural Network 100x50 0.9087544254908272')
#print('Neural Network 500x250 0.9140650144834245')
#print('Neural Network 1000x500 0.9013517862890248')

Naive Bayes
0.9368475991649269
Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9389352818371608
5-NN
0.5474947807933194
2-NN
0.6109081419624217
1-NN
0.7596555323590815
Neural Network 100x50
0.9548538622129437
Neural Network 500x250
0.9598121085594989
Neural Network 1000x500
0.9590292275574113


## Running experiments with bag-of-words, feature selection and widely-used classifiers.

### Performing feature selection using interpretable models.

#### Deciding which is the best model to use as feature selector (LinearSVC vs RidgeClassifier vs AdaBoostClassifier).

In [None]:
#benchmark_classifier(LinearSVC(), X_train, y_train, X_test, y_test)
#benchmark_classifier(RidgeClassifier(), X_train, y_train, X_test, y_test)
#benchmark_classifier(AdaBoostClassifier(), X_train, y_train, X_test, y_test)

# best LinearSVC
print('LinearSVC 0.8879948503379466')
print('RidgeClassifier 0.8558094625040231')
print('AdaBoostClassifier 0.5413582233665916')

In [None]:
selected_features = SelectFromModel(estimator=LinearSVC()).fit_transform(X_transformed, y)

In [None]:
X_selected_train, X_selected_test, y_selected_train, y_selected_test = train_test_split(selected_features, y, test_size=0.33, random_state=42)
print('Train size %s' % X_selected_train.shape[0])
print('Test size %s' % X_selected_test.shape[0])
print('Number of features %s' % X_selected_test.shape[1])

In [None]:
bag_of_words_selected_classifiers = [
    #('Naive Bayes', MultinomialNB()),
    #('Logistic Regression', LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')),
    #('5-NN', KNeighborsClassifier(n_neighbors=5)),
    #('2-NN', KNeighborsClassifier(n_neighbors=2)),
    #('1-NN', KNeighborsClassifier(n_neighbors=1)),
    #('Neural Network 100x50', MLPClassifier(solver='adam', hidden_layer_sizes=(100, 50), random_state=42)),
    #('Neural Network 500x250', MLPClassifier(solver='adam', hidden_layer_sizes=(500, 250), random_state=42)),
    #('Neural Network 1000x500', MLPClassifier(solver='adam', hidden_layer_sizes=(1000, 500), random_state=42)),
]

for classifier in bag_of_words_selected_classifiers:
    print(classifier[0])
    benchmark_classifier(classifier[1], X_selected_train, y_selected_train, X_selected_test, y_selected_test)

print('Naive Bayes 0.8944319279047312')
print('Logistic Regression 0.8925008046346958')
print('5-NN 0.4787576440296106')
print('2-NN 0.48455101383971677')
print('1-NN 0.6221435468297393')
print('Neural Network 100x50 0.8978113936272932')
print('Neural Network 500x250 0.9110074026392018')
print('Neural Network 1000x500 0.8759253299002253')

### Performing feature selection by removing features with low variance.

In [None]:
sel = VarianceThreshold(threshold=0.8)

In [None]:
selected_variance_features = sel.fit_transform(X_transformed)
X_selected_variance_train, X_selected_variance_test, y_selected_variance_train, y_selected_variance_test = train_test_split(selected_variance_features, y, test_size=0.33, random_state=42)
print('Train size %s' % X_selected_variance_train.shape[0])
print('Test size %s' % X_selected_variance_test.shape[0])
print('Number of features %s' % X_selected_variance_test.shape[1])

In [None]:
bag_of_words_selected_variance_classifiers = [
    #('Naive Bayes', MultinomialNB()),
    #('Logistic Regression', LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')),
    #('5-NN', KNeighborsClassifier(n_neighbors=5)),
    #('2-NN', KNeighborsClassifier(n_neighbors=2)),
    #('1-NN', KNeighborsClassifier(n_neighbors=1)),
    #('Neural Network 100x50', MLPClassifier(solver='adam', hidden_layer_sizes=(100, 50), random_state=42)),
    #('Neural Network 500x250', MLPClassifier(solver='adam', hidden_layer_sizes=(500, 250), random_state=42)),
    #('Neural Network 1000x500', MLPClassifier(solver='adam', hidden_layer_sizes=(1000, 500), random_state=42)),
]

for classifier in bag_of_words_selected_variance_classifiers:
    print(classifier[0])
    benchmark_classifier(classifier[1], X_selected_variance_train, y_selected_variance_train, X_selected_variance_test, y_selected_variance_test)

## Running experiments with graph-of-words.

## Running experiments with graph-of-docs.

In [9]:
_, test_docs = train_test_split(df['identifier'], test_size=0.33, random_state=42)
test_docs = list(test_docs)

In [10]:
class_true = []
class_pred = []
for test_doc in test_docs:
    community_id = doc_to_community_dict[test_doc]
    community_docs = doc_communities_dict[community_id]
    classes = [doc.split('_')[0] for doc in community_docs if doc != test_doc]
    
    correct_class = test_doc.split('_')[0]
    classified_class = Counter(classes).most_common(1)[0][0]
    class_true.append(correct_class)
    class_pred.append(classified_class)
    #print(correct_class, classified_class)
print('Accuracy: %s' % (accuracy_score(class_true, class_pred)))
# accuracy: 0.9752087682672234

Accuracy: 0.9752087682672234


In [None]:
X = ['fo ha', 'foo bar', 'foo bar', 'foo bar']
y = le.transform(['alt.atheism', 'comp.graphics', 'comp.graphics', 'comp.graphics'])

cv = CountVectorizer()
# bag-of-words
X_transformed = cv.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.33, random_state=42)
print('Train size %s' % X_train.shape[0])
print('Test size %s' % X_test.shape[0])
print('Number of features %s' % X_test.shape[1])

In [None]:
select.get_all_tags_per_community

In [None]:
cv.transform(['foo ha']).toarray()

In [None]:
cv.vocabulary_

In [None]:
le.classes_

In [None]:
_, test_docs = train_test_split(df['identifier'], test_size=0.33, random_state=42)
test_docs = list(test_docs)

In [None]:
class_true = []
class_pred = []

for test_doc in test_docs[:]:
    doc_terms = select.get_document_terms(database, test_doc, group_by_word_community_id=True)
    #print(doc_terms)
    #print(doc_terms)
    #most_common_word_community = Counter([community_id for _, _, community_id in doc_terms]).most_common()[0][0]
    #print(most_common_word_community)
    #selected_terms = sorted([(term, pagerank) for term, pagerank, community_id in doc_terms if community_id == most_common_word_community], key=lambda t: t[1], reverse=True)
    
    #selected_terms = [(term, pagerank) for term, pagerank, community_id in doc_terms if community_id == most_common_word_community]
    #selected_terms = doc_terms[0][1][:50]
    #print(selected_terms)
    #selected_terms = sum([word_community[1][:25] for word_community in doc_terms], [])
    selected_terms = []
    for word_community in doc_terms[:1]:
        selected_terms += word_community[1][:25]

    selected_terms = [term for term, _ in selected_terms]
    #print(selected_terms)
    #print(selected_terms)
    docs = []
    communities = select.get_communities_by_tags(database, selected_terms)
    if communities is None:
        print('communities is none')
        continue
    docs += sum([community_docs for _, community_docs in communities], [])
    classes = [doc.split('_')[0] for doc in docs]

    correct_class = test_doc.split('_')[0]
    if len(classes) == 0:
        print('zero class, ignore this element')
        continue
    classified_class = Counter(classes).most_common(1)[0][0]
    class_true.append(correct_class)
    class_pred.append(classified_class)
    #print(correct_class, classified_class)
print('Accuracy: %s' % (accuracy_score(class_true, class_pred)))

## Discussion

In [None]:
community_tags = {key: array for key, array in select.get_all_tags_per_community(database)}

In [None]:
documents_per_community = {key: array for key, array, _ in select.get_document_communities(database)}

In [None]:
len(documents_per_community.keys())

In [None]:
len(documents_per_community.keys())

In [None]:
for key in documents_per_community.keys():
    p = community_tags[key]

In [None]:
len(community_tags)