# Graph of Docs

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

from GraphOfDocs import utils
from GraphOfDocs import select
from GraphOfDocs.neo4j_wrapper import Neo4jDatabase

from collections import Counter

%matplotlib inline

In [2]:
database = Neo4jDatabase('bolt://localhost:7687', 'neo4j', '123')

In [3]:
doc_communities = select.get_document_communities(database)
filtered_doc_communities = [doc_community for doc_community in doc_communities if doc_community[2] > 1]
selected_docs = sum([docs for _, docs, _ in filtered_doc_communities], [])
doc_communities_dict = {community_id: docs for community_id, docs, number_of_docs in filtered_doc_communities}
doc_to_community_dict = {doc: community_id for community_id, doc_community, _ in filtered_doc_communities for doc in doc_community}

In [4]:
# Read dataset, clean dataset and create a pandas dataframe of the dataset.
dataset = utils.read_dataset('GraphOfDocs/data/20news-18828-all/')
le = LabelEncoder()
# The class of each document can be found by simply split (character '_') its filename. E.g. comp.sys.mac.hardware_51712.
le.fit([file[0].split('_')[0] for file in dataset])
# Tuple: file identifier, file class, file class number, file text
clean_dataset = [(file[0], file[0].split('_')[0], le.transform([file[0].split('_')[0]])[0], ' '.join(utils.generate_words(file[1], extend_window=True, remove_stopwords=True, lemmatize=False, stemming=False))) for file in dataset]
df = pd.DataFrame(clean_dataset, columns=['identifier', 'class', 'class_number', 'text'])

df_not_selected = df[~df['identifier'].isin(selected_docs)]
df = df[df['identifier'].isin(selected_docs)]

#df_not_selected = shuffle(df_not_selected, random_state=42)
df = shuffle(df, random_state=42)
df.head(2)


Unnamed: 0,identifier,class,class_number,text


In [5]:
X = df['text']
y = df['class_number']

In [6]:
def benchmark_classifier(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(accuracy_score(y_test, y_pred))
    return clf

## Running experiments with bag-of-words and widely-used classifiers.

In [7]:
cv = CountVectorizer()
# bag-of-words
X_transformed = cv.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.33, random_state=42)
print('Train size %s' % X_train.shape[0])
print('Test size %s' % X_test.shape[0])
print('Number of features %s' % X_test.shape[1])

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
bag_of_words_classifiers = [
    ('Naive Bayes', MultinomialNB()),
    #('Logistic Regression', LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')),
    #('5-NN', KNeighborsClassifier(n_neighbors=5)),
    #('2-NN', KNeighborsClassifier(n_neighbors=2)),
    #('1-NN', KNeighborsClassifier(n_neighbors=1)),
    #('Neural Network 100x50', MLPClassifier(solver='adam', hidden_layer_sizes=(100, 50), random_state=42)),
    #('Neural Network 500x250', MLPClassifier(solver='adam', hidden_layer_sizes=(500, 250), random_state=42)),
    #('Neural Network 1000x500', MLPClassifier(solver='adam', hidden_layer_sizes=(1000, 500), random_state=42)),
]

for classifier in bag_of_words_classifiers:
    print(classifier[0])
    benchmark_classifier(classifier[1], X_train, y_train, X_test, y_test)

## Running experiments with graph-of-docs.

In [None]:
_, test_docs = train_test_split(df['identifier'], test_size=0.33, random_state=42)
test_docs = list(test_docs)

In [None]:
class_true = []
class_pred = []
for test_doc in test_docs:
    community_id = doc_to_community_dict[test_doc]
    community_docs = doc_communities_dict[community_id]
    classes = [doc.split('_')[0] for doc in community_docs if doc != test_doc]
    
    correct_class = test_doc.split('_')[0]
    classified_class = Counter(classes).most_common(1)[0][0]
    class_true.append(correct_class)
    class_pred.append(classified_class)
print('Accuracy: %s' % (accuracy_score(class_true, class_pred)))
# accuracy: 0.9752087682672234