# 20 Newsgroups

In [None]:
# Mount Google Drive (Colab only) so that files under /content/drive can be
# read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from xml.dom import minidom
import os, nltk, re
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim import corpora
from gensim import models
from gensim import similarities
from smart_open import smart_open
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Punctuation characters stripped from the raw text before pre-processing.
characters_to_remove = '!()#@~,."><*=-'
# Escape the characters before placing them in the character class so that
# regex metacharacters (a '-' that is not last, ']', '^', '\') added to the
# list later are still matched literally instead of changing the pattern.
pattern = "[" + re.escape(characters_to_remove) + "]"
# Stemmer instance handed to process() for every document.
p = PorterStemmer()
# Tokenizer built from the regex \w+ (tokens are runs of word characters).
tokenizer = nltk.RegexpTokenizer(r'\w+')
# Passed as `no_below` to Dictionary.filter_extremes when the vocabulary is built.
freq_to_remove = 1

In [None]:
def process(p, tokenizer, text):
    """Normalise a raw document and split it into stemmed tokens.

    The text is lowercased, stripped of stopwords, stemmed and finally
    tokenized, in that order.

    Args:
        p: stemmer exposing ``stem_sentence`` (a gensim ``PorterStemmer``
            in this notebook).
        tokenizer: tokenizer exposing ``tokenize`` (an nltk
            ``RegexpTokenizer`` in this notebook).
        text (str): raw document text.

    Returns:
        The tokens produced by ``tokenizer.tokenize`` on the stemmed text.
    """
    # TODO: add a lemmatization step (not implemented yet).
    without_stopwords = remove_stopwords(text.lower())
    stemmed = p.stem_sentence(without_stopwords)
    return tokenizer.tokenize(stemmed)

In [None]:
# os.listdir returns entries in arbitrary order; sort them so that the
# category -> index mapping (and therefore the labels stored in the saved
# matrices) is the same on every run and every machine.
categories = sorted(os.listdir('/content/drive/MyDrive/data/20news'))
# Each category name maps to its position in the sorted list.
category_index = {cat: i for i, cat in enumerate(categories)}
print(category_index)

{'talk.politics.mideast': 0, 'rec.motorcycles': 1, 'rec.autos': 2, 'comp.windows.x': 3, 'comp.sys.mac.hardware': 4, 'misc.forsale': 5, 'comp.graphics': 6, 'rec.sport.baseball': 7, 'rec.sport.hockey': 8, 'sci.electronics': 9, 'talk.politics.guns': 10, 'talk.religion.misc': 11, 'comp.os.ms-windows.misc': 12, 'sci.space': 13, 'sci.crypt': 14, 'comp.sys.ibm.pc.hardware': 15, 'soc.religion.christian': 16, 'talk.politics.misc': 17, 'sci.med': 18, 'alt.atheism': 19}


In [None]:
# Read every post, normalise it and record its category name alongside.
news_dir = '/content/drive/MyDrive/data/20news'
listed_text = []
listed_categories = []
for category in categories:
    category_dir = os.path.join(news_dir, category)
    # Sorted so that document order (and hence matrix rows) is reproducible.
    for file in sorted(os.listdir(category_dir)):
        # The context manager closes the file even if reading fails.
        with open(os.path.join(category_dir, file), encoding='ISO-8859-1', mode='r') as doc:
            raw = doc.read()
        # Collapse newlines and repeated whitespace into ONE space. Deleting
        # them outright would glue the last word of a line to the first word
        # of the next one and create bogus tokens.
        single_spaced = re.sub(r'\s+', ' ', raw)
        # Punctuation listed in `pattern` is removed, as before.
        text = re.sub(pattern, "", single_spaced)
        listed_text.append(process(p, tokenizer, text))
        listed_categories.append(category)

In [None]:
# Build the vocabulary from the tokenized documents, prune it with
# filter_extremes (freq_to_remove is the `no_below` threshold) and persist it.
dictionary = corpora.Dictionary(listed_text)
dictionary.filter_extremes(no_below=freq_to_remove)
dictionary.save('/content/drive/MyDrive/resources/20news/vocab20news.dict')
# One doc2bow encoding per document, in the same order as listed_text.
doc_corpus = [dictionary.doc2bow(tokens) for tokens in listed_text]
print('Dictionary length: ' + str(len(dictionary)))

In [None]:
# Dense document-term matrices: one row per document, one column per
# dictionary id, plus a final column holding the document's category index.
matrix_dtype = np.int8
# Largest value the matrices can hold. A term that occurs more often than this
# in a single document would overflow the cell (silent wrap-around or an
# OverflowError, depending on the NumPy version), so counts are saturated.
max_count = np.iinfo(matrix_dtype).max
n_docs = len(doc_corpus)
n_terms = len(dictionary)
bool_bow = np.zeros((n_docs, n_terms + 1), dtype=matrix_dtype)
bow = np.zeros((n_docs, n_terms + 1), dtype=matrix_dtype)
for index, doc in enumerate(doc_corpus):
    label = category_index[listed_categories[index]]
    bool_bow[index, -1] = label
    bow[index, -1] = label
    for term_id, count in doc:
        bool_bow[index, term_id] = 1               # presence / absence
        bow[index, term_id] = min(count, max_count)  # saturated term count
np.save('/content/drive/MyDrive/resources/20news/bool_bow_matrix.npy', bool_bow)
np.save('/content/drive/MyDrive/resources/20news/bow_matrix.npy', bow)

In [None]:
def data_split(X, y, train_size=0.6, val_size=0.1, test_size=0.3):
    """Split features and labels into train, validation and test partitions.

    Args:
        X: feature rows.
        y: labels aligned with ``X``.
        train_size (float): fraction of the data used for training.
        val_size (float): fraction of the data used for validation.
        test_size (float): fraction of the data used for testing.

    Returns:
        tuple: ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises:
        ValueError: if the three fractions do not add up to 1.0.
    """
    # Compare with a tolerance: an exact ``== 1`` rejects valid splits because
    # of floating-point rounding (0.6 + 0.3 + 0.1 evaluates to 0.9999999999999999).
    if abs(train_size + val_size + test_size - 1.0) > 1e-9:
        raise ValueError('Sizes must add up to 1.0')
    # First cut: training set versus everything else.
    X_train, X_rest, y_train, y_rest = train_test_split(X, y, train_size=train_size, random_state=15)
    # val_size is a fraction of the whole data set, so rescale it to the
    # remainder before cutting validation from test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_rest, y_rest, train_size=val_size / (val_size + test_size), random_state=15)
    return X_train, X_val, X_test, y_train, y_val, y_test

def train_validate_evaluate(classifier, X_train, X_val,y_train, y_val, feature):
    """Fit the requested classifier and score it on the validation split.

    Args:
        classifier (str): classifier code, ``'NB'`` (Gaussian naive Bayes)
            or ``'LR'`` (logistic regression).
        X_train: training features.
        X_val: validation features.
        y_train: training labels.
        y_val: validation labels.
        feature (str): name of the feature representation; only used to
            label the metrics row.

    Returns:
        tuple: ``(clf, df)`` where ``clf`` is the fitted classifier and
        ``df`` is a one-row ``pandas.DataFrame`` with accuracy and
        macro-averaged precision, recall and F1, indexed by
        ``"<classifier> <feature>"``.

    Raises:
        ValueError: if ``classifier`` is not one of the supported codes.
    """
    if classifier == 'NB':
        clf = GaussianNB()
    elif classifier == 'LR':
        clf = LogisticRegression(random_state=0, max_iter=500)
    else:
        # Fail with a clear message instead of an UnboundLocalError on `clf`.
        raise ValueError("Unknown classifier '%s'; expected 'NB' or 'LR'" % classifier)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    accuracy = metrics.accuracy_score(y_val, y_pred)
    precision = metrics.precision_score(y_val, y_pred, average='macro')
    recall = metrics.recall_score(y_val, y_pred, average='macro')
    f1 = metrics.f1_score(y_val, y_pred, average='macro')
    metrics_data = {'classifier': [classifier], 'model': [feature],
        'accuracy': [accuracy], 'precision': [precision], 'recall': [recall], 'f1_score': [f1]}
    df = pd.DataFrame(data=metrics_data)
    df.index = [classifier + ' ' + feature]
    return clf, df

# Pending: train each classifier and add custom feature extraction

In [None]:
# Train and validate every (feature matrix, classifier) combination.
# The matrices are loaded from the directory the feature-extraction cell saves
# them to; their last column holds the category index, the rest are features.
feature_files = (
    ('bow', '/content/drive/MyDrive/resources/20news/bow_matrix.npy'),
    ('bool_bow', '/content/drive/MyDrive/resources/20news/bool_bow_matrix.npy'),
)
metrics_frames = []
clfs = []
for feature, matrix_path in feature_files:
    data = np.load(matrix_path)
    X_train, X_val, X_test, y_train, y_val, y_test = data_split(data[:, :-1], data[:, -1])
    for classifier in ('NB', 'LR'):
        # The helper labels each metrics row with the classifier that actually
        # produced it, so logistic-regression runs are no longer reported as 'NB'.
        clf, df = train_validate_evaluate(classifier, X_train, X_val, y_train, y_val, feature)
        metrics_frames.append(df)
        clfs.append(clf)
# One row per run (stacked along axis 0) so the results can be compared column
# by column; concatenating along axis 1 produced a single row with duplicated
# column names.
metrics_df = pd.concat(metrics_frames, axis=0)
metrics_df