In [None]:
# This is the first cell to run, so nltk must be imported here; otherwise a
# fresh kernel fails with NameError before the import cell is reached.
import nltk

# Fetch the 'reuters' resource used by nltk.corpus.reuters in the cells below
nltk.download('reuters')

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import reuters
import numpy as np
import pandas as pd

# Raw text of every article, in the order given by reuters.fileids()
Documents = list(map(reuters.raw, reuters.fileids()))

# One list of labels per article (an article may carry more than one category)
Categories = list(map(reuters.categories, reuters.fileids()))

# Flatten the per-article lists into a single list, then keep the distinct labels
CategoriesList = [label for doc_labels in Categories for label in doc_labels]
CategoriesSet = np.unique(CategoriesList)

print(f'N documents= {len(Documents):d}, K unique categories= {len(CategoriesSet):d}')

In [None]:
from collections import Counter

# Tally how often each category occurs; most_common() yields (category, count)
# pairs ordered from the most to the least frequent
counts = Counter(CategoriesList).most_common()

print(counts[:10])

In [None]:
# Target labels: the five most frequent categories plus a catch-all 'other'
yCategories = [name for name, _count in counts[:5]] + ['other']

# Sanity check
print(f'K categories for classification= {len(yCategories):d} categories, {yCategories}')

In [None]:
# Give every article exactly one target label: the first entry of yCategories
# that appears among the article's own categories, falling back to 'other'
yCat = [
    next((label for label in yCategories if label in doc_labels), 'other')
    for doc_labels in Categories
]

# Sanity check
print(f'N target categories={len(yCat):d}')

In [None]:
# Encode each label as its position in yCategories, collected into a numeric np.array
# NOTE(review): the visible cross-validation call passes the string labels in
# Categories rather than ydocs — confirm whether ydocs is used further down.
ydocs = np.array(list(map(yCategories.index, yCat)))

In [None]:
# The cross-validation helper indexes its inputs with fold index arrays, so wrap
# the texts and their single target labels in pandas Series.
# NOTE(review): this rebinds Categories, which until here held the per-article
# label lists; re-running the label-assignment cell afterwards would iterate
# over these strings instead.
Docs = pd.Series(Documents)
Categories = pd.Series(yCat)

# Sanity check: label and first 150 characters of the first three articles
_preview = [(Categories[i], '-->', Docs[i][:150]) for i in range(3)]
print(*_preview[0], '\n', *_preview[1], '\n', *_preview[2])

# Size of the problem
print(f'N={len(Docs)} documents')

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def kfold_eval_docs(_clf, _Xdocs, _ydocs, n_splits=10):
    """Return the per-fold accuracy of `_clf` under stratified k-fold CV.

    Parameters
    ----------
    _clf : estimator exposing fit(X, y) and predict(X)
        Refit in place on every training fold; after the call it is left
        fitted on the last fold's training data.
    _Xdocs, _ydocs : pandas Series/DataFrame or numpy-style array
        Samples and their labels, aligned by position.
    n_splits : int, default 10
        Number of folds handed to StratifiedKFold (no shuffling).

    Returns
    -------
    np.ndarray
        One accuracy score per fold, in fold order.
    """
    def _rows(data, positions):
        # StratifiedKFold yields integer *positions*. Plain [] on a pandas
        # object treats an integer array as index *labels*, which selects the
        # wrong rows (or raises KeyError) unless the index is 0..N-1.
        return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]

    kf = StratifiedKFold(n_splits=n_splits, shuffle=False)
    accuracies = []
    for train_index, test_index in kf.split(_Xdocs, _ydocs):
        _clf.fit(_rows(_Xdocs, train_index), _rows(_ydocs, train_index))
        y_pred = _clf.predict(_rows(_Xdocs, test_index))
        accuracies.append(accuracy_score(_rows(_ydocs, test_index), y_pred))

    return np.array(accuracies)

In [None]:
# Check the size of the dataset matrix X for this Tf-Idf feature extraction - raw number of features
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorise the whole corpus with default settings; rows are documents and
# columns are the extracted features
X_tfidf = TfidfVectorizer().fit_transform(Documents)
_n_points, _n_features = X_tfidf.shape
print(f'N data points= {_n_points}, M features= {_n_features}')

In [None]:
# Docs.items() is a lazy iterator, so printing it only shows "<zip object ...>",
# and printing Categories dumps the whole Series. Show a short side-by-side
# preview of each label with the start of its text instead.
print(pd.DataFrame({'category': Categories, 'text': Docs.str.slice(0, 80)}).head(10))

In [None]:
%%time
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def kfold_eval_docs(_clf, _Xdocs, _ydocs, n_splits=10):
    """Return the per-fold accuracy of `_clf` under stratified k-fold CV.

    NOTE(review): this repeats a definition from an earlier cell; being the
    later one, it is the definition in effect when this cell has run.

    Parameters
    ----------
    _clf : estimator exposing fit(X, y) and predict(X)
        Refit in place on every training fold; after the call it is left
        fitted on the last fold's training data.
    _Xdocs, _ydocs : pandas Series/DataFrame or numpy-style array
        Samples and their labels, aligned by position.
    n_splits : int, default 10
        Number of folds handed to StratifiedKFold (no shuffling).

    Returns
    -------
    np.ndarray
        One accuracy score per fold, in fold order.
    """
    def _rows(data, positions):
        # StratifiedKFold yields integer *positions*. Plain [] on a pandas
        # object treats an integer array as index *labels*, which selects the
        # wrong rows (or raises KeyError) unless the index is 0..N-1.
        return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]

    kf = StratifiedKFold(n_splits=n_splits, shuffle=False)
    accuracies = []
    for train_index, test_index in kf.split(_Xdocs, _ydocs):
        _clf.fit(_rows(_Xdocs, train_index), _rows(_ydocs, train_index))
        y_pred = _clf.predict(_rows(_Xdocs, test_index))
        accuracies.append(accuracy_score(_rows(_ydocs, test_index), y_pred))

    return np.array(accuracies)

# NOTE(review): N_FEATURES is not referenced anywhere in this cell — presumably
# it is meant for a later feature-selection step; confirm.
N_FEATURES=1
from sklearn.svm import LinearSVC

# Pipeline runs its steps in order and every step except the last must be a
# transformer, so the classifier has to come last. The raw text also has to be
# turned into token counts before TfidfTransformer can reweight them.
svm_lin = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(class_weight='balanced')),
])
acc = kfold_eval_docs(svm_lin, Docs, Categories)
# Reports the mean followed by the standard deviation of the per-fold accuracies
print(f'Support Vector Machine (linear SVC) CV accuracy={np.mean(acc):.3f} {np.std(acc):.3f}')

In [None]:
N_FEATURES=1000