In [None]:
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [None]:
# Categories
categories = ['comp.graphics','comp.os.ms-windows.misc','comp.sys.ibm.pc.hardware','comp.sys.mac.hardware','comp.windows.x',
              'rec.autos','rec.motorcycles','rec.sport.baseball','rec.sport.hockey',
              'sci.crypt','sci.electronics','sci.med','sci.space',
              'misc.forsale','talk.politics.misc','talk.politics.guns','talk.politics.mideast',
              'talk.religion.misc','alt.atheism','soc.religion.christian']

# Load train and test data 
twenty_train = fetch_20newsgroups(subset='train',
    categories=categories, shuffle=True, random_state=42)
twenty_test = fetch_20newsgroups(subset='test',
    categories=categories, shuffle=True, random_state=42)

In [None]:
def printResults(classifier_name, predicted_data, test_data):
    print(f"The {classifier_name} classifier was able to recognize the test set with this accuracy: "+ str(np.mean(predicted_data == test_data.target)))
    
    print(f"Metrics Results for {classifier_name}")
    print(metrics.classification_report(test_data.target, predicted_data,target_names=test_data.target_names))
    
    conf_matrix = metrics.confusion_matrix(test_data.target, predicted_data)
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            ax.text(x=j, y=i,s=conf_matrix[i, j], va='center', ha='center', size='medium')
 
    plt.xlabel('Predictions', fontsize=9)
    plt.ylabel('Actuals', fontsize=9)
    plt.title(f'Confusion Matrix for {classifier_name}', fontsize=9)
    plt.show()
    

In [None]:
def comparePlainClassifiers(train_data, test_data):
    clfByaes = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
        ])
    clfByaes.fit(train_data.data,train_data.target)
    printResults("MultinomialNB",clfByaes.predict(test_data.data), test_data)

    clfSGD = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
        ])
    clfSGD.fit(train_data.data,train_data.target)
    printResults("SGDClassifier",clfSGD.predict(test_data.data), test_data)

    clfRidgeClassifier = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', RidgeClassifier()),
        ])
    clfRidgeClassifier.fit(train_data.data,train_data.target)
    printResults("RidgeClassifier", clfRidgeClassifier.predict(test_data.data), test_data)

comparePlainClassifiers(twenty_train,twenty_test)

In [None]:
def compareFeatureClassifiers(train_data, test_data):
    classifiers = [
        ('MultinomialNB', MultinomialNB()),
        ('SGDClassifier', SGDClassifier()),
        ('RidgeClassifier', RidgeClassifier())
    ]
    vectorizers = [
        ('CountVectorizer', CountVectorizer()),
        ('TfidfVectorizer', TfidfVectorizer()),
        ('TF', TfidfVectorizer(use_idf=False))
    ]
    for vec_name, vectorizer in vectorizers:
        print(f"Feature Representation: {vec_name}")
        for clf_name, clf in classifiers:
            print(f"Classifier: {clf_name}")
            allpipeline = Pipeline([
                ('vect', vectorizer),
                ('tfidf', TfidfTransformer() if vec_name != 'TF' else None),
                ('clf', clf),
            ])
            allpipeline.fit(train_data.data, train_data.target)
            allpipeline.predict(test_data.data)
            target_names = test_data.target_names
            accuracy = accuracy_score(test_data.target, allpipeline.predict(test_data.data))
            classification_rep = classification_report(test_data.target, allpipeline.predict(test_data.data), target_names=target_names)
            printResults(clf_name, allpipeline.predict(test_data.data), test_data)
            # print(f"Accuracy: {accuracy}","Classification Report:\n", classification_rep)

compareFeatureClassifiers(twenty_train,twenty_test)

In [80]:
def CompareVectorizerFeatures(train_data, test_data):
    parameter_grid = {
    "vect__lowercase": (True,False),
    "vect__stop_words": ('english', None),
    "vect__analyzer": ('word','char_wb'),  
    "vect__max_features":[10,20,50],
    "clf__alpha": [0.1, 0.1, 0.1],
    "clf__penalty": ['l2', 'l1']
    }

    clfSGD = Pipeline(steps=[
        ('vect', TfidfVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
        ])
    clfSGD.fit(train_data.data,train_data.target)
    
    from sklearn.model_selection import RandomizedSearchCV
    from pprint import pprint

    random_search = RandomizedSearchCV(
        estimator=clfSGD,
        param_distributions=parameter_grid,
        n_iter=100,
        random_state=42,
        n_jobs=-1,
        verbose=1,
    )

    print("Performing grid search...")
    print("Hyperparameters to be evaluated:")
    pprint(parameter_grid)
    from time import time

    t0 = time()
    random_search.fit(twenty_train.data, twenty_train.target)
    print(f"Done in {time() - t0:.3f}s")

    classification_rep = metrics.classification_report(test_data.target, clfSGD.predict(test_data.data), target_names=test_data.target_names, zero_division=1)

    print("Classification Report:")
    print(classification_rep)

    print("Best parameters combination found:")
    best_parameters = random_search.best_estimator_.get_params()
    for param_name in sorted(parameter_grid.keys()):
        print(f"{param_name}: {best_parameters[param_name]}")

    test_accuracy = random_search.score(twenty_test.data, twenty_test.target)
    print(
        "Accuracy of the best parameters using the inner CV of "
        f"the random search: {random_search.best_score_:.3f}"
    )
    print(f"Accuracy on test set: {test_accuracy:.3f}")

CompareVectorizerFeatures(twenty_train,twenty_test)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/gio/Desktop/LeidenUni/Text_mining/Assigment1/venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/__/zhw3c12n75bf1nhs96_vydlw0000gn/T/ipykernel_29288/3740442320.py", line 56, in <module>
    CompareVectorizerFeatures(twenty_train,twenty_test)
  File "/var/folders/__/zhw3c12n75bf1nhs96_vydlw0000gn/T/ipykernel_29288/3740442320.py", line 16, in CompareVectorizerFeatures
    clfSGD.fit(train_data.data,train_data.target)
  File "/Users/gio/Desktop/LeidenUni/Text_mining/Assigment1/venv/lib/python3.11/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/gio/Desktop/LeidenUni/Text_mining/Assigment1/venv/lib/python3.11/site-packages/sklearn/pipeline.py", line 423, in fit
    Xt = self._fit(X, y, **fit_params_steps)
    