In [13]:
%config IPCompleter.greedy=True

In [14]:
import numpy as np
from xgboost import XGBClassifier
import pickle
import nltk
from nltk.stem.snowball import SnowballStemmer

from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV

In [15]:
def load_model(file):
  with open(file, 'rb') as f:
    while True:
      try:
        model = pickle.load(f)
        print('Model was loaded successfully!')
      except EOFError:
        break
      
  return model

In [16]:
def run_tests(model, dataset):
  predicted = model.predict(dataset.data)
  return np.mean(predicted == dataset.target)

In [17]:
# download required datasets

train_set = fetch_20newsgroups(subset='train', shuffle=True)
test_set = fetch_20newsgroups(subset='test', shuffle=True)

In [18]:
# download data which will be used for stemming

nltk.download('snowball_data')
nltk.download('stopwords')

[nltk_data] Downloading package snowball_data to
[nltk_data]     /home/jakub/nltk_data...
[nltk_data]   Package snowball_data is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jakub/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# create stemming based CountVectorized

class StemmedCountVectorizer(CountVectorizer):
  def build(self):
    analyzer = super(StemmedCountVectorizer, self).build_analyzer()
    return lambda text: ([stemmer.stem(word) for word in anlayzer(text)])

In [20]:
stemmer = SnowballStemmer('english', ignore_stopwords=True)
stemmed_count_vector = StemmedCountVectorizer(stop_words='english')

# Random forest

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
pipe_rt = Pipeline([
  ('vect', stemmed_count_vector),
  ('tfidf', TfidfTransformer()),
  ('dtc', RandomForestClassifier())
])

pipe_rt.set_params(
    vect__ngram_range=(1,2),
    tfidf__use_idf=True,
    dtc__n_estimators=100,
    dtc__max_depth=100, 
    dtc__class_weight='balanced'
)

Pipeline(steps=[('vect',
                 StemmedCountVectorizer(ngram_range=(1, 2),
                                        stop_words='english')),
                ('tfidf', TfidfTransformer()),
                ('dtc',
                 RandomForestClassifier(class_weight='balanced',
                                        max_depth=100))])

In [25]:
pipe_rt.fit(train_set.data, train_set.target)

Pipeline(steps=[('vect',
                 StemmedCountVectorizer(ngram_range=(1, 2),
                                        stop_words='english')),
                ('tfidf', TfidfTransformer()),
                ('dtc',
                 RandomForestClassifier(class_weight='balanced',
                                        max_depth=100))])

In [26]:
predicted_rt = pipe_rt.predict(test_set.data)

print('Accuracy after stemming: %s' % np.mean(predicted_rt == test_set.target))

Accuracy after stemming: 0.7831917153478491
