In [1]:
import itertools
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import(
    datasets, feature_extraction, model_selection, 
    pipeline, naive_bayes, metrics,
)
import matplotlib.pyplot as plt

In [2]:
def extract_features(corpus):
    '''
        Extract TF-IDF features from a corpus.

        Parameters:
        corpus: An iterable of raw text documents (strings)

        Returns the TF-IDF weighted document-term matrix produced by fitting
        a CountVectorizer followed by a TfidfTransformer on corpus.
        The fitted vectorizers are not returned, so the result describes
        corpus only.
    '''
    # NLTK's stop words contain contractions such as "you'd", which
    # nltk.word_tokenize splits into ["you", "'d"]. Run the stop words through
    # the same tokenizer so they match the tokens the vectorizer actually sees.
    stop_words = sorted({
        token
        for stop_word in nltk.corpus.stopwords.words('english')
        for token in nltk.word_tokenize(stop_word)
    })
    count_vectorizer = feature_extraction.text.CountVectorizer(
        tokenizer=nltk.word_tokenize,
        stop_words=stop_words,
        token_pattern=None,  # Unused with a custom tokenizer; None stops the warning
        min_df=2,  # Keep only terms that appear in at least 2 documents
        # The higher ngram range is, the higher the vector space and computing cost
        ngram_range=(1, 2),  # Allows for 1 and 2 word combinations
    )
    term_counts = count_vectorizer.fit_transform(corpus)
    # Can change params of TfidfTransformer if it lowers performance
    return feature_extraction.text.TfidfTransformer().fit_transform(term_counts)

In [3]:
# Root folder of the extracted 20 Newsgroups corpus
data_filepath = '20_news_groups/20_newsgroups'

# Read every file under the root; the seed keeps the shuffled order repeatable
newsgroups_data = datasets.load_files(
    container_path=data_filepath,
    encoding='ISO-8859-1',
    shuffle=True,
    random_state=42,
)

# Summarise what was loaded and show the first raw document as a sanity check
print(
    f'{len(newsgroups_data.data)} files loaded.',
    f'They contain the following classes: {newsgroups_data.target_names}',
    '',
    newsgroups_data.data[0],
    sep='\n',
)

19997 files loaded.
They contain the following classes: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

Newsgroups: rec.sport.hockey
Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!fs7.ece.cmu.edu!europa.eng.gtefsd.com!howland.reston.ans.net!zaphod.mps.ohio-state.edu!uwm.edu!cs.utexas.edu!utnut!alchemy.chem.utoronto.ca!golchowy
From: golchowy@alchemy.chem.utoronto.ca (Gerald Olchowy)
Subject: Re: RUMOUR - Keenan signs with Rangers?
Message-ID: <1993Apr16.222232.17393@alchemy.chem.utoronto.ca>
Organization: University of Toronto Chemistry Department
References: <1993Apr16.171347.784@news.columbia.edu> <1993Apr16.183110.838@alchemy.chem.u

In [4]:
def test_validate_train_split(X, y,
                              test_size: float = None,
                              random_state: int = None, **kwargs
                              ) -> tuple:
    '''
        Completes Sci-Kit Learn's test_train_split twice to split data into
        three sections
        
        Paramaters:
        X: The dataset without the target present
        y: The target values of the dataset
        test_size: The proportion of the data set in the test set.
            It is also the proportion of the remainder used for the validation set
        random_state: Controls the shuffling 
        **kwargs: These are passed to both test_train_split functions
        
        Returns a tuple of:
            X_train, X_validate, X_test, y_train, y_validate, y_test
    '''
    # Complete test_train split
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, **kwargs)

    # Complete the test_validation split
    X_train, X_validate, y_train, y_validate = train_test_split(
        X_train_val, y_train_val, test_size=test_size,
        random_state=random_state, **kwargs)
    
    return X_train, X_validate, X_test, y_train, y_validate, y_test

In [5]:
# Carve the corpus into train / validation / test partitions
X_train, X_val, X_test, y_train, y_val, y_test = test_validate_train_split(
    X=newsgroups_data.data,
    y=newsgroups_data.target,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Load NLTK's English stop words
stop_words = nltk.corpus.stopwords.words('english')
# Our tokenizer splits contractions, which breaks matching against stop_words:
# "you'd" is in our stop_words, but is tokenised to ["you", "'d"] and would be
# missed. Tokenize every stop word so the list matches the tokenizer's output.
nested_tokenized_stop_words = list(map(nltk.word_tokenize, stop_words))
# Each stop word became its own list of tokens; flatten them into one list
tokenized_stop_words = list(
    itertools.chain.from_iterable(nested_tokenized_stop_words))

In [10]:
# Assemble the classification pipeline.
# Each step is a (name, estimator) tuple; the names are what GridSearchCV
# style parameter keys such as 'counts__ngram_range' refer to.
model = pipeline.Pipeline(steps=[
    (
        'counts',
        feature_extraction.text.CountVectorizer(
            stop_words=tokenized_stop_words,
            tokenizer=nltk.word_tokenize,
            token_pattern=None,  # This is needed to stop warning
            ngram_range=(1, 2),
            min_df=2,
        ),
    ),
    ('tfidf', feature_extraction.text.TfidfTransformer()),
    ('naivebayes', naive_bayes.MultinomialNB()),
])

In [11]:
# Train on the training split, then predict the validation split
y_pred = model.fit(X_train, y_train).predict(X_val)
# Accuracy is the fraction of validation predictions that match the labels
print(f'Accuracy of multinomial naive bayes = {(y_pred == y_val).mean()}')

Accuracy of multinomial naive bayes = 0.8865625


In [14]:
# See the per-class classification report for the validation predictions
print(
    metrics.classification_report(
        y_true=y_val,
        y_pred=y_pred,
        target_names=newsgroups_data.target_names,
    )
)

                          precision    recall  f1-score   support

             alt.atheism       0.81      0.75      0.78       166
           comp.graphics       0.94      0.78      0.86       189
 comp.os.ms-windows.misc       0.84      0.90      0.87       149
comp.sys.ibm.pc.hardware       0.84      0.86      0.85       156
   comp.sys.mac.hardware       0.89      0.93      0.91       152
          comp.windows.x       0.77      0.92      0.84       142
            misc.forsale       0.82      0.85      0.83       158
               rec.autos       0.98      0.90      0.94       168
         rec.motorcycles       0.97      0.93      0.95       163
      rec.sport.baseball       0.99      0.91      0.95       153
        rec.sport.hockey       0.93      0.98      0.95       166
               sci.crypt       0.93      0.98      0.95       173
         sci.electronics       0.94      0.87      0.91       156
                 sci.med       0.96      0.88      0.92       170
         

In [17]:
# Set up a grid search over the pipeline's n-gram range and smoothing alpha
grid_search_model = model_selection.GridSearchCV(
    estimator=model,
    param_grid={
        'counts__ngram_range': [(1, 1), (1, 2)],
        'naivebayes__alpha': (0.1, 3.0),
    },
    n_jobs=-1,  # -1 runs the search on all available cores
)

In [18]:
# Run the grid search on the training split only; the validation and test
# splits are not passed in here.
grid_search_model.fit(X_train, y_train)

In [20]:
# Display the raw per-candidate results (timings, parameters and split scores)
grid_search_model.cv_results_

{'mean_fit_time': array([23.93198729, 28.45220838, 35.43191319, 34.91831837]),
 'std_fit_time': array([2.32528936, 1.66499682, 1.49796747, 0.60804837]),
 'mean_score_time': array([7.98953762, 6.79919391, 6.62329254, 5.73562279]),
 'std_score_time': array([0.58800561, 0.47493752, 0.2865435 , 0.4000883 ]),
 'param_counts__ngram_range': masked_array(data=[(1, 1), (1, 1), (1, 2), (1, 2)],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_naivebayes__alpha': masked_array(data=[0.1, 3.0, 0.1, 3.0],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'counts__ngram_range': (1, 1), 'naivebayes__alpha': 0.1},
  {'counts__ngram_range': (1, 1), 'naivebayes__alpha': 3.0},
  {'counts__ngram_range': (1, 2), 'naivebayes__alpha': 0.1},
  {'counts__ngram_range': (1, 2), 'naivebayes__alpha': 3.0}],
 'split0_test_score': array([0.8890625 , 0.8703125 , 0.88789063, 0.86835938]),
 'split1

In [21]:
# Display the parameter combination the grid search selected as best
grid_search_model.best_params_

{'counts__ngram_range': (1, 1), 'naivebayes__alpha': 0.1}