# Graph of Docs

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

from GraphOfDocs import utils
from GraphOfDocs import select
from GraphOfDocs.neo4j_wrapper import Neo4jDatabase

from collections import Counter

%matplotlib inline

In [2]:
database = Neo4jDatabase('bolt://localhost:7687', 'neo4j', '123')

In [3]:
doc_communities = select.get_document_communities(database)
filtered_doc_communities = [doc_community for doc_community in doc_communities if doc_community[2] > 1]
selected_docs = sum([docs for _, docs, _ in filtered_doc_communities], [])
doc_communities_dict = {community_id: docs for community_id, docs, number_of_docs in filtered_doc_communities}
doc_to_community_dict = {doc: community_id for community_id, doc_community, _ in filtered_doc_communities for doc in doc_community}

In [4]:
len(selected_docs)

11610

In [5]:
# Read dataset, clean dataset and create a pandas dataframe of the dataset.
dataset = utils.read_dataset('/home/nkanak/Desktop/phd/experiments/GraphOfDocs/GraphOfDocs/data/20news-18828-all/')
le = LabelEncoder()
# The class of each document can be found by simply split (character '_') its filename. E.g. comp.sys.mac.hardware_51712.
le.fit([file[0].split('_')[0] for file in dataset])
# Tuple: file identifier, file class, file class number, file text
clean_dataset = [(file[0], file[0].split('_')[0], le.transform([file[0].split('_')[0]])[0], ' '.join(utils.generate_words(file[1], extend_window=True, insert_stopwords=False, lemmatize=False, stem=False))) for file in dataset]
df = pd.DataFrame(clean_dataset, columns=['identifier', 'class', 'class_number', 'text'])
df_all = df

#df_not_selected = df[~df['identifier'].isin(selected_docs)]
df = df[df['identifier'].isin(selected_docs)]

#df_not_selected = shuffle(df_not_selected, random_state=42)
df = shuffle(df, random_state=42)
df.head(2)

Unnamed: 0,identifier,class,class_number,text
11876,rec.autos_102735,rec.autos,7,rdb1 cbnewsj att ronald deblock changing oil s...
17000,comp.windows.x_68077,comp.windows.x,5,boutilie rtsg mot eric boutilier looking windo...


In [6]:
X = df['text']
y = df['class_number']

In [7]:
def benchmark_classifier(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(accuracy_score(y_test, y_pred))
    return clf

## Running experiments with bag-of-words and widely-used classifiers.

In [8]:
positions = [i for i in range(len(X))]

In [9]:
cv = CountVectorizer()
# bag-of-words
X_transformed = cv.fit_transform(X)

X_train, X_test, y_train, y_test, positions_train, positions_test = train_test_split(X_transformed, y, positions, test_size=0.33, random_state=42)
print('Train size %s' % X_train.shape[0])
print('Test size %s' % X_test.shape[0])
print('Number of features %s' % X_test.shape[1])

Train size 7778
Test size 3832
Number of features 71361


In [10]:
bag_of_words_classifiers = [
    ('Naive Bayes', MultinomialNB()),
    ('Logistic Regression', LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')),
    ('5-NN', KNeighborsClassifier(n_neighbors=5, weights='distance')),
    ('2-NN', KNeighborsClassifier(n_neighbors=2, weights='distance')),
    ('1-NN', KNeighborsClassifier(n_neighbors=1, weights='distance')),
    #('Neural Network 100x50', MLPClassifier(solver='adam', hidden_layer_sizes=(100, 50), random_state=42)),
    #('Neural Network 500x250', MLPClassifier(solver='adam', hidden_layer_sizes=(500, 250), random_state=42)),
    #('Neural Network 1000x500', MLPClassifier(solver='adam', hidden_layer_sizes=(1000, 500), random_state=42)),
]

In [11]:
for classifier in bag_of_words_classifiers:
    print(classifier[0])
    benchmark_classifier(classifier[1], X_train, y_train, X_test, y_test)

Naive Bayes
0.9368475991649269
Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9386743215031316
5-NN
0.6427453027139874
2-NN
0.755741127348643
1-NN
0.7596555323590815


## Running experiments with graph-of-docs.

### Graph-of-docs classifier

In [12]:
_, test_docs = train_test_split(df['identifier'], test_size=0.33, random_state=42)
test_docs = list(test_docs)

In [13]:
class_true = []
class_pred = []
for test_doc in test_docs:
    community_id = doc_to_community_dict[test_doc]
    community_docs = doc_communities_dict[community_id]
    classes = [doc.split('_')[0] for doc in community_docs if doc != test_doc]
    
    correct_class = test_doc.split('_')[0]
    classified_class = Counter(classes).most_common(1)[0][0]
    class_true.append(correct_class)
    class_pred.append(classified_class)
print('Accuracy: %s' % (accuracy_score(class_true, class_pred)))
# accuracy: 0.9752087682672234

Accuracy: 0.9752087682672234


### Feature selection using classical methods

### Feature selection using graph-of-docs

#### Create a vocabulary with the TOP N words of each community of docs

In [14]:
train_docs = list(df.iloc[positions_train]['identifier'])

In [15]:
vocabulary = []
for doc in train_docs:
    for word in select.get_community_tags(database, doc_to_community_dict[doc], top_terms=250):
        vocabulary.append(word)
vocabulary = list(set(vocabulary))

In [16]:
cv = CountVectorizer(vocabulary=vocabulary)
# bag-of-words
X_transformed = cv.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.33, random_state=42)
print('Train size %s' % X_train.shape[0])
print('Test size %s' % X_test.shape[0])
print('Number of features %s' % X_test.shape[1])

Train size 7778
Test size 3832
Number of features 37281


In [17]:
for classifier in bag_of_words_classifiers:
    print(classifier[0])
    benchmark_classifier(classifier[1], X_train, y_train, X_test, y_test)

Naive Bayes
0.941544885177453
Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9399791231732777
5-NN
0.6907620041753654
2-NN
0.7549582463465553
1-NN
0.7630480167014614


### [tag1, tag2, ... tagN] -> class (Do this for each community of docs)

In [18]:
train_docs = list(df.iloc[positions_train]['identifier'])
test_docs = list(df.iloc[positions_test]['identifier'])
all_docs = test_docs + train_docs
unique_community_ids = list(set([doc_to_community_dict[doc] for doc in all_docs]))

In [19]:
communities_y = []
communities_tags = []
for community_id in unique_community_ids:
    # Find the most common community class
    community_docs = doc_communities_dict[community_id]
    classes = [doc.split('_')[0] for doc in community_docs]
    classified_class = Counter(classes).most_common(1)[0][0]
    communities_y.append(classified_class)
    # Get the most important tags of each community.
    communities_tags.append(' '.join(select.get_community_tags(database, community_id, top_terms=250)))

In [20]:
cv = CountVectorizer()
X_transformed = cv.fit_transform(communities_tags)

print('Number of features %s' % X_transformed.shape[1])
communities_y_encoded = le.transform(communities_y)

Number of features 38368


In [21]:
X_test_docs = []
for doc in list(df[df['identifier'].isin(test_docs)]['text']):
    X_test_docs.append(' '.join(list(set(doc.split()))))

X_test_docs_transformed = cv.transform(X_test_docs)
y_test = list(df[df['identifier'].isin(test_docs)]['class_number'])

In [22]:
X_transformed.shape

(4024, 38368)

In [23]:
for classifier in bag_of_words_classifiers:
    print(classifier[0])
    benchmark_classifier(classifier[1], X_transformed, communities_y_encoded, X_test_docs_transformed, y_test)

Naive Bayes
0.9611169102296451
Logistic Regression
0.9725991649269311
5-NN
0.759133611691023
2-NN
0.8812630480167014
1-NN
0.880741127348643
