# Graph of Docs

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel

from GraphOfDocs import utils

%matplotlib inline

In [2]:
# Read dataset, clean dataset and create a pandas dataframe of the dataset.
dataset = utils.read_datasets('GraphOfDocs/data/20news-18828-all/')
le = LabelEncoder()
# The class of each document can be found by simply split (character '_') its filename. E.g. comp.sys.mac.hardware_51712.
le.fit([file[0].split('_')[0] for file in dataset])
# Tuple: file identifier, file class, file class number, file text
clean_dataset = [(file[0], file[0].split('_')[0], le.transform([file[0].split('_')[0]])[0], ' '.join(utils.generate_words(file[1], extend_window=True, remove_stopwords=True, lemmatize=False, stemming=False))) for file in dataset]
df = pd.DataFrame(clean_dataset, columns=['identifier', 'class', 'class_number', 'text'])
df = shuffle(df, random_state=42)
df.head(2)

Unnamed: 0,identifier,class,class_number,text
2571,sci.space_60960,sci.space,14,jgreen trumpet calpoly james thomas green keep...
3143,comp.sys.mac.hardware_51712,comp.sys.mac.hardware,4,ebodin pearl tufts screen death mac plus 512 o...


In [3]:
X = df['text']
y = df['class_number']

In [4]:
def benchmark_classifier(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))
    return clf

## Running experiments with bag-of-words and widely-used classifiers.

In [5]:
cv = CountVectorizer()
# bag-of-words
X_transformed = cv.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.33, random_state=42)
print('Train size %s' % X_train.shape[0])
print('Test size %s' % X_test.shape[0])
print('Number of features %s' % X_test.shape[1])

Train size 12614
Test size 6214
Number of features 169616


In [6]:
bag_of_words_classifiers = [
    #('Naive Bayes', MultinomialNB()),
    #('Logistic Regression', LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')),
    #('5-NN', KNeighborsClassifier(n_neighbors=5)),
    #('2-NN', KNeighborsClassifier(n_neighbors=2)),
    #('1-NN', KNeighborsClassifier(n_neighbors=1)),
    #('Neural Network 100x50', MLPClassifier(solver='adam', hidden_layer_sizes=(100, 50), random_state=42)),
    #('Neural Network 500x250', MLPClassifier(solver='adam', hidden_layer_sizes=(500, 250), random_state=42)),
    #('Neural Network 1000x500', MLPClassifier(solver='adam', hidden_layer_sizes=(1000, 500), random_state=42)),
]

for classifier in bag_of_words_classifiers:
    print(classifier[0])
    benchmark_classifier(classifier[1], X_train, y_train, X_test, y_test)

print('Naive Bayes 0.8604763437399421')
print('Logistic Regression 0.892018023817187')
print('5-NN 0.48728677180560026')
print('2-NN 0.4914708722240103')
print('1-NN 0.607981976182813')
print('Neural Network 500x250 0.9137431606050853')
print('Neural Network 1000x500 0.9076279369166399')

Naive Bayes 0.8604763437399421
Logistic Regression 0.892018023817187
5-NN 0.48728677180560026
2-NN 0.4914708722240103
1-NN 0.607981976182813
Neural Network 500x250 0.9137431606050853
Neural Network 1000x500 0.9076279369166399


## Running experiments with bag-of-words, feature selection and widely-used classifiers.

### Performing feature selection using interpretable models.

#### Deciding which is the best model to use as feature selector (LinearSVC vs RidgeClassifier vs AdaBoostClassifier).

In [7]:
#benchmark_classifier(LinearSVC(), X_train, y_train, X_test, y_test)
#benchmark_classifier(RidgeClassifier(), X_train, y_train, X_test, y_test)
#benchmark_classifier(AdaBoostClassifier(), X_train, y_train, X_test, y_test)

# best LinearSVC
print('LinearSVC 0.8896041197296427')
print('RidgeClassifier 0.8606372706791117')
print('AdaBoostClassifier 0.5413582233665916')

LinearSVC 0.8896041197296427
RidgeClassifier 0.8606372706791117
AdaBoostClassifier 0.5413582233665916


In [8]:
selected_features = SelectFromModel(estimator=LinearSVC()).fit_transform(X_transformed, y)



In [9]:
X_selected_train, X_selected_test, y_selected_train, y_selected_test = train_test_split(selected_features, y, test_size=0.33, random_state=42)
print('Train size %s' % X_selected_train.shape[0])
print('Test size %s' % X_selected_test.shape[0])
print('Number of features %s' % X_selected_test.shape[1])

Train size 12614
Test size 6214
Number of features 32496


In [10]:
bag_of_words_selected_classifiers = [
    #('Naive Bayes', MultinomialNB()),
    #('Logistic Regression', LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')),
    #('5-NN', KNeighborsClassifier(n_neighbors=5)),
    #('2-NN', KNeighborsClassifier(n_neighbors=2)),
    #('1-NN', KNeighborsClassifier(n_neighbors=1)),
    #('Neural Network 100x50', MLPClassifier(solver='adam', hidden_layer_sizes=(100, 50), random_state=42)),
    #('Neural Network 500x250', MLPClassifier(solver='adam', hidden_layer_sizes=(500, 250), random_state=42)),
    #('Neural Network 1000x500', MLPClassifier(solver='adam', hidden_layer_sizes=(1000, 500), random_state=42)),
]

for classifier in bag_of_words_selected_classifiers:
    print(classifier[0])
    benchmark_classifier(classifier[1], X_selected_train, y_selected_train, X_selected_test, y_selected_test)

print('Naive Bayes 0.8706147409076279')
print('Logistic Regression 0.8944319279047312')
print('5-NN 0.505471515931767')
print('2-NN 0.5125523012552301')
print('1-NN 0.6226263276472481')
print('Neural Network 100x50 0.9159961377534599')
print('Neural Network 500x250 ')
print('Neural Network 1000x500 ')

Naive Bayes 0.8706147409076279
Logistic Regression 0.8944319279047312
5-NN 0.505471515931767
2-NN 0.5125523012552301
1-NN 0.6226263276472481
Neural Network 100x50 0.9159961377534599
Neural Network 500x250 
Neural Network 1000x500 


### Performing feature selection by removing features with low variance.

In [33]:
sel = VarianceThreshold(threshold=0.8)

In [34]:
selected_variance_features = sel.fit_transform(X_transformed)
X_selected_variance_train, X_selected_variance_test, y_selected_variance_train, y_selected_variance_test = train_test_split(selected_variance_features, y, test_size=0.33, random_state=42)
print('Train size %s' % X_selected_variance_train.shape[0])
print('Test size %s' % X_selected_variance_test.shape[0])
print('Number of features %s' % X_selected_variance_test.shape[1])

Train size 12614
Test size 6214
Number of features 106


In [35]:
bag_of_words_selected_variance_classifiers = [
    ('Naive Bayes', MultinomialNB()),
    #('Logistic Regression', LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')),
    #('5-NN', KNeighborsClassifier(n_neighbors=5)),
    #('2-NN', KNeighborsClassifier(n_neighbors=2)),
    #('1-NN', KNeighborsClassifier(n_neighbors=1)),
    #('Neural Network 100x50', MLPClassifier(solver='adam', hidden_layer_sizes=(100, 50), random_state=42)),
    #('Neural Network 500x250', MLPClassifier(solver='adam', hidden_layer_sizes=(500, 250), random_state=42)),
    #('Neural Network 1000x500', MLPClassifier(solver='adam', hidden_layer_sizes=(1000, 500), random_state=42)),
]

for classifier in bag_of_words_selected_variance_classifiers:
    print(classifier[0])
    benchmark_classifier(classifier[1], X_selected_variance_train, y_selected_variance_train, X_selected_variance_test, y_selected_variance_test)

Naive Bayes
0.33118764081107177


## Running experiments with graph-of-words.

## Running experiments with graph-of-docs.

## Discussion