In [1]:
import nltk
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from itertools import chain

In [2]:
# Read the newsgroup corpus from disk. load_files returns a bunch whose
# .data holds the decoded documents and .target_names the class labels;
# shuffling is seeded so the document order is repeatable.
data_filepath = '20_news_groups/20_newsgroups'

newsgroups_data = load_files(
    container_path=data_filepath,
    encoding='ISO-8859-1',
    shuffle=True,
    random_state=42,
)
print(
    f'{len(newsgroups_data.data)} files loaded.',
    'They contain the following classes:',
    sep='\n',
)
# Last expression of the cell: display the list of class names.
newsgroups_data.target_names

19997 files loaded.
They contain the following classes:


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# Hold out 20% of the documents as a test set; the remaining 80% is used for
# training. Only a train/test split is made here (no separate validation set).
# The split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups_data.data,
    newsgroups_data.target,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Start from NLTK's English stop-word list (the NLTK stopwords corpus must be
# available locally).
stop_words = nltk.corpus.stopwords.words('english')
# The vectorizer tokenizes with nltk.word_tokenize, which splits contractions:
# "you'd" is in stop_words but is tokenized to ["you", "'d"], so the raw entry
# would never match a produced token. Run every stop word through the same
# tokenizer and flatten the pieces so the stop list lines up with the tokens.
tokenized_stop_words = list(
    chain.from_iterable(map(nltk.word_tokenize, stop_words)))
 

In [6]:
# Text-classification pipeline: TF-IDF features over unigrams and bigrams,
# followed by a linear classifier trained with stochastic gradient descent.
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(
        # Use the same tokenizer that was applied to the stop-word list.
        tokenizer=nltk.word_tokenize,
        # A custom tokenizer is supplied, so the default regex token pattern
        # is not used; passing None makes that explicit.
        token_pattern=None,
        stop_words=tokenized_stop_words,
        ngram_range=(1, 2),   # unigrams and bigrams
        min_df=2,             # ignore terms seen in fewer than 2 documents
        norm='l2',
    )),
    ('clf', SGDClassifier(
        alpha=0.0001,
        penalty='l1',         # L1 penalty drives many coefficients to exactly 0
        # SGD shuffles the training data each epoch; pin the seed (same value
        # used for loading and splitting) so the fitted model is reproducible.
        random_state=42,
    )),
])

In [7]:
# Fit the vectorizer and the classifier on the training split only.
pipe.fit(X=X_train, y=y_train)

In [10]:
# Inspect the fitted classifier's weight matrix (one row per class).
pipe.named_steps['clf'].coef_

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])