In [1]:
from bs4 import BeautifulSoup
import os
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_20newsgroups
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Collect every Reuters-21578 SGML file (extension '.sgm') found anywhere
# below ./reuters21578/ relative to the current working directory.
filepaths = []
for root, _dirs, files in os.walk(os.path.join(os.getcwd(), "reuters21578")):
    for file in files:
        if os.path.splitext(file)[1] == '.sgm':
            filepaths.append(os.path.join(root, file))

# os.walk gives no ordering guarantee; sort so the parse order (and therefore
# everything derived from soup_list) is the same on every run.
filepaths.sort()

# Parse each file inside a `with` block so its handle is closed as soon as
# BeautifulSoup has consumed it, instead of leaving every file open for the
# lifetime of the kernel.
soup_list = []
for path in filepaths:
    with open(path, 'r', encoding='ISO-8859-1') as sgm_file:
        soup_list.append(BeautifulSoup(sgm_file, 'lxml'))

In [3]:
def find_topics(soup):
    """Collect (newid, topic) pairs from every 'topics' tag in a parsed file.

    Args:
        soup: parsed document exposing ``find_all``.

    Returns:
        A list of ``(newid, topic_string)`` tuples, where ``newid`` is the
        'newid' attribute of the tag enclosing the 'topics' tag. A 'topics'
        tag with several strings yields one tuple per string; one with no
        strings yields nothing.
    """
    tuple_topics = []
    for topics_tag in soup.find_all('topics'):
        for label in topics_tag.strings:
            tuple_topics.append((topics_tag.parent.get('newid'), label))
    return tuple_topics

def find_texts(soup):
    """Map each document's 'newid' to its text with title/dateline removed.

    Only 'text' tags whose enclosing document has a non-empty 'topics' tag
    are kept. For each of them the title and dateline strings (when present)
    are removed from the text, and all newline characters are dropped.

    Args:
        soup: parsed document exposing ``find_all``.

    Returns:
        A dict ``{newid: cleaned_text}``. If two documents share a newid the
        later one wins.
    """
    dic_text = {}
    for text_tag in soup.find_all('text'):
        topics_tag = text_tag.parent.topics
        # Skip documents without topics (also guards against a missing tag,
        # which would otherwise raise AttributeError on `.contents`).
        if topics_tag is None or not topics_tag.contents:
            continue

        cleaned = text_tag.text
        # Title first, then dateline, matching their order in the text.
        for header_tag in (text_tag.title, text_tag.dateline):
            # `.string` is None for an empty tag or one with several children;
            # passing None to str.replace would raise TypeError.
            fragment = header_tag.string if header_tag is not None else None
            if fragment:
                # Remove only the first occurrence (the header itself) so a
                # repetition of the same words inside the body is preserved.
                cleaned = cleaned.replace(fragment, "", 1)

        dic_text[text_tag.parent.get('newid')] = cleaned.replace("\n", "")
    return dic_text

def get_strs(soup):
    """Build one '<topic>_label_<text>' string per (topic, document) pair.

    A document that carries several topics therefore contributes several
    strings, all sharing the same text.

    Args:
        soup: parsed document accepted by ``find_topics`` and ``find_texts``.

    Returns:
        A list of strings of the form ``topic + "_label_" + text``.
    """
    topics = find_topics(soup)
    texts_by_id = find_texts(soup)
    strs = []
    for newid, topic in topics:
        body = texts_by_id.get(newid)
        # A topic whose newid has no extracted text cannot form a sample;
        # concatenating None would raise TypeError, so skip the pair.
        if body is None:
            continue
        strs.append(topic + "_label_" + body)
    return strs

def write_to_txt(strs):
    """Write each string in ``strs`` as one line of 'raw_y_X.txt'.

    The file is created (or overwritten) in the current working directory
    and encoded as UTF-8.

    Args:
        strs: iterable of strings; a newline is appended to each.
    """
    # `with` guarantees the handle is closed even if a write raises.
    with open('raw_y_X.txt', 'w', encoding='utf-8') as file:
        for i in strs:
            file.write(i + '\n')

In [4]:
# Gather the "<topic>_label_<text>" samples of every parsed file.
strs_s = []
for soup in soup_list:
    strs_s.extend(get_strs(soup))

# Shuffle with a dedicated, seeded generator so the file written below is
# identical on every run and the global `random` state is left untouched.
random.Random(42).shuffle(strs_s)
write_to_txt(strs_s)

In [5]:
# Read the samples back: y_raw holds the topic of each line, X_raw its text
# (the text keeps the line's trailing newline).
X_raw = []
y_raw = []
# The file is written as UTF-8, so read it as UTF-8 rather than with the
# platform default encoding.
with open("raw_y_X.txt", "r", encoding="utf-8") as infile:
    for line in infile:
        # Split on the first marker only, so a text that itself contains
        # "_label_" is kept whole instead of being cut at the second marker.
        label, text = line.split("_label_", 1)
        y_raw.append(label)
        X_raw.append(text)

In [17]:
# Each dataset gets its own TF-IDF vectorizer (unigrams + bigrams, English
# stop words removed). Re-fitting one shared instance would overwrite the
# vocabulary learned on the first dataset, leaving no fitted vectorizer that
# matches X_news.
# NOTE(review): both vectorizers are fitted on all documents before the
# train/test split performed in later cells, so IDF weights also see the test
# documents — consider fitting on the training portion only.

##################20newsgroups########################

vectorizer_news = TfidfVectorizer(ngram_range=(1,2), stop_words="english")
newsgroups_train = fetch_20newsgroups(subset="train")
X_news = vectorizer_news.fit_transform(newsgroups_train.data)
y_news = newsgroups_train.target

##################Reuters###############################

vectorizer_reuters = TfidfVectorizer(ngram_range=(1,2), stop_words="english")
X_reuters = vectorizer_reuters.fit_transform(X_raw)
label_encoder = LabelEncoder()
y_reuters = label_encoder.fit_transform(y_raw)

# Backward-compatible alias: `vectorizer` previously ended up fitted on Reuters.
vectorizer = vectorizer_reuters

In [12]:
# Hold out 25% of the 20 newsgroups samples for testing. A fixed random_state
# makes the split, and therefore the reported scores, reproducible.
X_news_train, X_news_test, y_news_train, y_news_test = train_test_split(
    X_news, y_news, test_size=0.25, random_state=42
)

# Linear SVM: squared hinge loss, L2 penalty, one-vs-rest multi-class scheme.
lsvc_news = LinearSVC(loss="squared_hinge", penalty="l2", C=1, multi_class="ovr")
lsvc_news.fit(X_news_train, y_news_train)
print(classification_report(y_news_test, lsvc_news.predict(X_news_test)))


              precision    recall  f1-score   support

           0       0.94      0.97      0.95       128
           1       0.88      0.91      0.89       163
           2       0.88      0.92      0.90       137
           3       0.83      0.79      0.81       136
           4       0.92      0.92      0.92       135
           5       0.92      0.89      0.91       152
           6       0.82      0.90      0.86       146
           7       0.92      0.93      0.93       133
           8       0.96      0.96      0.96       163
           9       0.97      0.95      0.96       134
          10       0.93      0.97      0.95       116
          11       0.97      0.97      0.97       159
          12       0.91      0.90      0.91       139
          13       0.99      0.95      0.97       148
          14       0.96      0.97      0.97       158
          15       0.94      0.98      0.96       167
          16       0.94      0.97      0.96       138
          17       0.97    

In [13]:
# Hold out 25% of the Reuters samples for testing. A fixed random_state makes
# the split, and therefore the reported scores, reproducible.
X_reuters_train, X_reuters_test, y_reuters_train, y_reuters_test = train_test_split(
    X_reuters, y_reuters, test_size=0.25, random_state=42
)

# Linear SVM: squared hinge loss, L2 penalty, one-vs-rest multi-class scheme.
lsvc_reuters = LinearSVC(loss="squared_hinge", penalty="l2", C=1, multi_class="ovr")
lsvc_reuters.fit(X_reuters_train, y_reuters_train)
print(classification_report(y_reuters_test, lsvc_reuters.predict(X_reuters_test)))

              precision    recall  f1-score   support

           0       0.65      0.96      0.77       605
           1       0.73      0.57      0.64        14
           3       0.00      0.00      0.00         9
           5       0.35      0.18      0.24        33
           6       0.00      0.00      0.00         2
           7       0.14      0.19      0.16        16
          11       0.67      0.71      0.69        17
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         2
          14       0.83      0.72      0.77        40
          15       0.67      0.67      0.67        15
          16       0.00      0.00      0.00         0
          17       0.00      0.00      0.00        64
          18       0.00      0.00      0.00         1
          20       0.23      0.23      0.23        13
          21       0.00      0.00      0.00         1
          23       0.56      0.38      0.45        26
          24       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Complement Naive Bayes (alpha=1) trained and evaluated on the same
# 20 newsgroups split used for the linear SVM.
cnb_news = ComplementNB(alpha=1).fit(X_news_train, y_news_train)
print(
    classification_report(
        y_true=y_news_test,
        y_pred=cnb_news.predict(X_news_test),
    )
)

              precision    recall  f1-score   support

           0       0.89      0.98      0.93       128
           1       0.91      0.82      0.86       163
           2       0.84      0.86      0.85       137
           3       0.76      0.75      0.76       136
           4       0.94      0.89      0.92       135
           5       0.92      0.89      0.91       152
           6       0.88      0.81      0.84       146
           7       0.86      0.92      0.89       133
           8       0.95      0.95      0.95       163
           9       0.91      0.94      0.93       134
          10       0.86      0.97      0.91       116
          11       0.95      0.98      0.97       159
          12       0.91      0.83      0.87       139
          13       0.96      0.92      0.94       148
          14       0.93      0.97      0.95       158
          15       0.91      0.97      0.94       167
          16       0.87      0.97      0.92       138
          17       0.91    

In [15]:
# Complement Naive Bayes (alpha=1) trained and evaluated on the same
# Reuters split used for the linear SVM.
cnb_reuters = ComplementNB(alpha=1).fit(X_reuters_train, y_reuters_train)
print(
    classification_report(
        y_true=y_reuters_test,
        y_pred=cnb_reuters.predict(X_reuters_test),
    )
)

              precision    recall  f1-score   support

           0       0.64      0.96      0.77       605
           1       0.88      0.50      0.64        14
           3       0.00      0.00      0.00         9
           5       0.38      0.09      0.15        33
           6       0.00      0.00      0.00         2
           7       0.07      0.06      0.06        16
          11       0.67      0.59      0.62        17
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         2
          14       0.84      0.68      0.75        40
          15       0.64      0.60      0.62        15
          16       0.00      0.00      0.00         0
          17       0.00      0.00      0.00        64
          18       0.00      0.00      0.00         1
          20       0.25      0.08      0.12        13
          21       0.00      0.00      0.00         1
          23       0.38      0.35      0.36        26
          24       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Preprocessing**

I used BeautifulSoup to parse the Reuters data instead of Regular Expressions, which turned out to be more difficult than expected.
When parsing the data, only documents with a non-empty `<TOPICS>` element were chosen, and each text was read in with its title and dateline stripped.
The `newid` attribute of each document is used to match a text with its topics.
The X_raw would be all the texts, and the y_raw would be their corresponding topics.
The topic and text of every document were stored in a .txt file, one line per topic–text pair, with a `_label_` marker separating them.

A TF-IDF vectorizer was then used to encode both datasets with unigram and bigram features, excluding English stop words. sklearn's LabelEncoder was used to encode the target labels as integers between 0 and n_classes-1.

**Model Selection**

Two models were implemented, Linear Support Vector Classifier as the non-probabilistic one and Complement Naive Bayes as the probabilistic one. 
LinearSVC has more flexibility in the choice of penalties and loss functions, and Complement NB is suitable for imbalanced datasets such as, in our case, the Reuters dataset. The inductive bias of an SVM is that distinct classes tend to be separated by wide margins (maximum margin).
The naive Bayes classifier assumes that the input features are conditionally independent of each other given the output label.

The train and test sets were split by sklearn. I tried several test sizes and found out that the performance did not vary a lot, and I went with 0.25.

For hyperparameters, I chose to use the default values for both classifiers.

**Evaluation**

The evaluation metric I chose was the classification report in sklearn. It shows the precision, recall, and F1 score for each label, together with the overall accuracy and averaged scores.
The results I got from the experiments showed that both classifiers performed much better on the 20 newsgroups dataset than on Reuters.
The overall accuracy for 20 newsgroups dataset was about 0.9, while the overall accuracy for Reuters was around 0.6.
When classifying the Reuters dataset, labels with high frequency were predicted with higher precision and recall, whereas rare labels got nearly 0.
I think it is because the Reuters dataset is not as well-formed as the other one. There's a lot of "noise" in the texts (e.g. many texts are like \*\*\*\*\*Blah Blah Blah).

Also, the 20 newsgroups dataset only has 20 labels whereas the Reuters dataset has 120 labels, making the classification task harder. With so many labels to classify, more input data is needed for the chosen model to learn.

As for the zero-division warning, some labels have zero precision or recall (they are never predicted, or never occur in the test split), so it is natural to see it.