# Graph of Docs

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from GraphOfDocs import utils

%matplotlib inline

In [2]:
# Read dataset, clean dataset and create a pandas dataframe of the dataset.
dataset = utils.read_datasets('GraphOfDocs/data/20news-18828-all/')
le = LabelEncoder()
le.fit([file[0].split('_')[0] for file in dataset])
# Tuple: file identifier, file class, file class number, file text
clean_dataset = [(file[0], file[0].split('_')[0], le.transform([file[0].split('_')[0]])[0], ' '.join([token for token in utils.generate_words(file[1], remove_stopwords=True, lemmatize=False, stemming=False) if token != 'e5'])) for file in dataset]
df = pd.DataFrame(clean_dataset, columns=['identifier', 'class', 'class_number', 'text'])
df = shuffle(df, random_state=42)
df.head()

Unnamed: 0,identifier,class,class_number,text
2571,sci.space_60960,sci.space,14,jgreen trumpet calpoly james thomas green keep...
3143,comp.sys.mac.hardware_51712,comp.sys.mac.hardware,4,ebodin pearl tufts screen death mac plus 512 o...
409,sci.electronics_53973,sci.electronics,12,rgc3679 bcstec ca boeing robert g carpenter ra...
11847,rec.autos_101674,rec.autos,7,celeste express freedom msfc nasa gov celeste ...
16313,sci.crypt_15719,sci.crypt,11,smb research att steven bellovin new reason cl...


In [3]:
X = df['text']
y = df['class_number']

## Run experiments with bag-of-words and widely-used classifiers.

In [4]:
cv = CountVectorizer()
# bag-of-words
X_transformed = cv.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.33, random_state=42)
print('Train size %s' % X_train.shape[0])
print('Test size %s' % X_test.shape[0])
print('Number of features %s' % X_test.shape[1])

Train size 12614
Test size 6214
Number of features 169617


In [5]:
bag_of_words_classifiers = [
    ('Naive bayes', MultinomialNB()),
    ('Logistic Regression', LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')),
    ('5-NN', KNeighborsClassifier(n_neighbors=5)),
    ('2-NN', KNeighborsClassifier(n_neighbors=2)),
    ('1-NN', KNeighborsClassifier(n_neighbors=1)),
    #('Neural Network 500x250', MLPClassifier(solver='adam', hidden_layer_sizes=(500, 250), random_state=42)),
    #('Neural Network 1000x500', MLPClassifier(solver='adam', hidden_layer_sizes=(1000, 500), random_state=42)),
]

for classifier in bag_of_words_classifiers:
    print(classifier[0])
    clf = classifier[1]
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))

Naive bayes
0.8604763437399421
Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8925008046346958
5-NN
0.48567750241390406
2-NN
0.4908271644673318
1-NN
0.6076601223044737


## Run experiments with bag-of-words, feature selection and widely-used classifiers.

## Run experiments with graph-of-words.