In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from pprint import pprint
from time import time
import logging
from sklearn.pipeline import Pipeline, FeatureUnion
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
from sklearn import metrics



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abhis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]

In [3]:
data = fetch_20newsgroups(subset='train', categories=categories)
print(f"{len(data.filenames)} documents")
print(f"{len(data.target_names)} categories")
print()

857 documents
2 categories


In [4]:
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25, random_state=42)

In [5]:
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class Word2VecWrapper(BaseEstimator, TransformerMixin):
    def __init__(self, size=100, window=5, min_count=1, workers=4):
        self.size = size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.model = None

    def fit(self, X, y=None):
        self.model = Word2Vec(X, vector_size=self.size, window=self.window, min_count=self.min_count, workers=self.workers)
        return self

    def transform(self, X):
        # Return the average word vector for each document
        return np.array([
            np.mean([self.model.wv[word] for word in words if word in self.model.wv]
                    or [np.zeros(self.size)], axis=0)
            for words in X
        ])

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        return self

    def get_params(self, deep=True):
        return {"size": self.size, "window": self.window, "min_count": self.min_count, "workers": self.workers}

class Doc2VecTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vector_size=100, window=5, min_count=1, workers=4):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.model = None

    def fit(self, X, y=None):
        tagged_data = [TaggedDocument(words=doc.split(), tags=[i]) for i, doc in enumerate(X)]
        self.model = Doc2Vec(tagged_data, vector_size=self.vector_size, window=self.window, min_count=self.min_count, workers=self.workers)
        return self

    def transform(self, X):
        return np.array([self.model.infer_vector(doc.split()) for doc in X])


In [6]:
def tokenize_text(text):
    return word_tokenize(text.lower())
feature_extractors = [
    ('CountVectorizer', CountVectorizer()),
    # ('Word2Vec',  Word2VecWrapper()),
    # ('Doc2Vec', Doc2Vec(vector_size=50, min_count=2, epochs=40))
]
classifiers = [
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('Logistic Regression', LogisticRegression()),
    ('Support Vector Machine', SVC()),
    ('Decision Tree', DecisionTreeClassifier())
]
results_table = pd.DataFrame(columns=['Feature Extractor', 'Classifier', 'Accuracy', 'Best Params'])

In [7]:
# Define parameter grids for each pipeline
param_grids = {
    'Multinomial Naive Bayes': [{
        'vect__ngram_range': [(1, 1), (1, 2)],
        'clf__alpha': [1.0, 0.1, 0.01]
    }],
    'Logistic Regression': [{
        'vect__ngram_range': [(1, 1), (1, 2)],
        'clf__C': [0.1, 1, 10]
    }],
    'Support Vector Machine': [{
        'vect__ngram_range': [(1, 1), (1, 2)],
        'clf__C': [0.1, 1, 10]
    }],
    'Decision Tree': [{
        'vect__ngram_range': [(1, 1), (1, 2)],
        'clf__max_depth': [10, 20, None],
        'clf__min_samples_split': [2, 5, 10]
    }]
}

In [8]:
for extractor_name, extractor_model in feature_extractors:
    for clf_name, clf_model in classifiers:
        pipelines = Pipeline([
            ('vect', extractor_model),
            ('clf', clf_model),
        ])
        grid_search = GridSearchCV(pipelines, param_grids[clf_name], cv=5, n_jobs=-1, verbose=1)
        grid_search.fit(data.data, data.target)
        print(f"Best score for {clf_name}: {grid_search.best_score_:.3f}")
        print("Best parameters set:")
        
        


        # Iterate over the parameter names specific to the current model's grid
        # for param_name in param_grids[clf_name]:
        #     # Corrected access to the best parameters
        #     print(f"\t{param_name}: {best_parameters[param_name]}")


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best score for Multinomial Naive Bayes: 0.946
Best parameters set:
Fitting 5 folds for each of 6 candidates, totalling 30 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best score for Logistic Regression: 0.945
Best parameters set:
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best score for Support Vector Machine: 0.938
Best parameters set:
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best score for Decision Tree: 0.891
Best parameters set:
