# Assignment 1 Text Mining Emma Vonk, Julius Ruijgrok
## Question 1 The tutorial classifies between only four categories of the 20newsgroups data set. Change your script so that it addresses all 20 categories.

In [1]:
# Assignment 1 of text mining

# code from https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html

from time import time

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neural_network import MLPClassifier
import pandas as pd



def size_mb(docs):
    return sum(len(s.encode("utf-8")) for s in docs) / 1e6


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def load_dataset(vectorizer, verbose=False, remove=()):
    """Load and vectorize the 20 newsgroups dataset using the specified vectorizer."""
    
    data_train = fetch_20newsgroups(
        subset="train",
        shuffle=True,
        random_state=42,
        remove=remove,
    )

    data_test = fetch_20newsgroups(
        subset="test",
        shuffle=True,
        random_state=42,
        remove=remove,
    )

    # Order of labels in `target_names` can be different from `categories`
    target_names = data_train.target_names

    # Split target in a training set and a test set
    y_train, y_test = data_train.target, data_test.target

    # Extracting features from the training data using the specified vectorizer
    t0 = time()
    X_train = vectorizer.fit_transform(data_train.data)
    duration_train = time() - t0

    # Extracting features from the test data using the same vectorizer
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration_test = time() - t0

    feature_names = vectorizer.get_feature_names_out()

    if verbose:
        # Compute size of loaded data
        data_train_size_mb = size_mb(data_train.data)
        data_test_size_mb = size_mb(data_test.data)

        print(f"{len(data_train.data)} documents - {data_train_size_mb:.2f}MB (training set)")
        print(f"{len(data_test.data)} documents - {data_test_size_mb:.2f}MB (test set)")
        print(f"{len(target_names)} categories")
        print(f"Vectorize training done in {duration_train:.3f}s at {data_train_size_mb / duration_train:.3f}MB/s")
        print(f"n_samples: {X_train.shape[0]}, n_features: {X_train.shape[1]}")
        print(f"Vectorize testing done in {duration_test:.3f}s at {data_test_size_mb / duration_test:.3f}MB/s")
        print(f"n_samples: {X_test.shape[0]}, n_features: {X_test.shape[1]}")

    return X_train, X_test, y_train, y_test, feature_names, target_names

In [2]:
# Different vectorizers to be used later
count_vectorizer = CountVectorizer()
tf_vectorizer = TfidfVectorizer(sublinear_tf=False, max_df=0.5, min_df=5, stop_words="english")
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=5, stop_words="english")

In [3]:
X_train, X_test, y_train, y_test, feature_names, target_names = load_dataset(tf_vectorizer, verbose=True)

11314 documents - 22.05MB (training set)
7532 documents - 13.80MB (test set)
20 categories
Vectorize training done in 2.325s at 9.486MB/s
n_samples: 11314, n_features: 25631
Vectorize testing done in 1.337s at 10.320MB/s
n_samples: 7532, n_features: 25631


## Question 2 Compare three classifiers in sklearn on this multi-class classification task, including at least Naïve Bayes.

In [4]:
# Function to perform classification and print results
def classify_and_print_results(X_train, y_train, X_test, y_test, classifier_name):
    if classifier_name == "Naive Bayes":
        clf = MultinomialNB()
    elif classifier_name == "Logistic Regression":
        clf = LogisticRegression(random_state=0)
    elif classifier_name == "SVM":
        clf = svm.SVC()
    elif classifier_name == "MLP":
        clf = MLPClassifier(random_state=0, max_iter=300)
    
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_predict)
    precision = precision_score(y_test, y_predict, average='weighted')
    recall = recall_score(y_test, y_predict, average='weighted')
    f1 = f1_score(y_test, y_predict, average='weighted')
    
    print(f"{classifier_name} Accuracy: {accuracy:.4f}")
    print(f"{classifier_name} Precision: {precision:.4f}")
    print(f"{classifier_name} Recall: {recall:.4f}")
    print(f"{classifier_name} F1 Score: {f1:.4f}")
    print(classification_report(y_test, y_predict))
    
    return accuracy, precision, recall, f1

In [5]:
# TF-vectorizer Naive Bayes Accuracy
TFNBAcc, TFNBPre, TFNBRe, TFNBF1 = classify_and_print_results(X_train, y_train, X_test, y_test, "Naive Bayes")

Naive Bayes Accuracy: 0.8173
Naive Bayes Precision: 0.8302
Naive Bayes Recall: 0.8173
Naive Bayes F1 Score: 0.8118
              precision    recall  f1-score   support

           0       0.81      0.70      0.75       319
           1       0.72      0.74      0.73       389
           2       0.77      0.72      0.74       394
           3       0.64      0.78      0.70       392
           4       0.85      0.81      0.83       385
           5       0.85      0.79      0.82       395
           6       0.84      0.82      0.83       390
           7       0.88      0.90      0.89       396
           8       0.93      0.94      0.94       398
           9       0.92      0.92      0.92       397
          10       0.89      0.98      0.93       399
          11       0.84      0.95      0.89       396
          12       0.82      0.66      0.73       393
          13       0.92      0.80      0.86       396
          14       0.84      0.93      0.88       394
          15       0

In [6]:
# TF-vectorizer Logistic Regression Accuracy
TFLRAcc, TFLRPre, TFLRRe, TFLRF1 = classify_and_print_results(X_train, y_train, X_test, y_test, "Logistic Regression")

Logistic Regression Accuracy: 0.8278
Logistic Regression Precision: 0.8305
Logistic Regression Recall: 0.8278
Logistic Regression F1 Score: 0.8262
              precision    recall  f1-score   support

           0       0.79      0.72      0.75       319
           1       0.70      0.80      0.75       389
           2       0.75      0.75      0.75       394
           3       0.70      0.73      0.72       392
           4       0.80      0.82      0.81       385
           5       0.84      0.74      0.79       395
           6       0.78      0.86      0.82       390
           7       0.89      0.88      0.89       396
           8       0.94      0.94      0.94       398
           9       0.90      0.92      0.91       397
          10       0.94      0.96      0.95       399
          11       0.95      0.90      0.92       396
          12       0.72      0.77      0.74       393
          13       0.88      0.85      0.87       396
          14       0.89      0.91      0.9

In [7]:
# TF-vectorizer Support Vector Machine Accuracy
TFSVMAcc, TFSVMPre, TFSVMRe, TFSVMF1 = classify_and_print_results(X_train, y_train, X_test, y_test, "SVM")

SVM Accuracy: 0.8210
SVM Precision: 0.8321
SVM Recall: 0.8210
SVM F1 Score: 0.8216
              precision    recall  f1-score   support

           0       0.84      0.70      0.76       319
           1       0.63      0.83      0.72       389
           2       0.79      0.72      0.75       394
           3       0.69      0.78      0.73       392
           4       0.83      0.82      0.83       385
           5       0.84      0.72      0.77       395
           6       0.76      0.89      0.82       390
           7       0.88      0.87      0.87       396
           8       0.97      0.92      0.94       398
           9       0.92      0.92      0.92       397
          10       0.96      0.94      0.95       399
          11       0.97      0.86      0.91       396
          12       0.63      0.82      0.71       393
          13       0.86      0.84      0.85       396
          14       0.91      0.87      0.89       394
          15       0.81      0.93      0.86       39

In [8]:
# TF-vectorizer Multilayer Perceptron Accuracy
TFMLPAcc, TFMLPPre, TFMLPRe, TFMLPF1 = classify_and_print_results(X_train, y_train, X_test, y_test, "MLP")

MLP Accuracy: 0.8475
MLP Precision: 0.8503
MLP Recall: 0.8475
MLP F1 Score: 0.8472
              precision    recall  f1-score   support

           0       0.84      0.77      0.81       319
           1       0.73      0.82      0.77       389
           2       0.77      0.72      0.74       394
           3       0.69      0.76      0.72       392
           4       0.80      0.85      0.83       385
           5       0.87      0.77      0.82       395
           6       0.79      0.89      0.84       390
           7       0.92      0.89      0.91       396
           8       0.95      0.95      0.95       398
           9       0.93      0.95      0.94       397
          10       0.97      0.97      0.97       399
          11       0.94      0.92      0.93       396
          12       0.78      0.76      0.77       393
          13       0.91      0.86      0.89       396
          14       0.92      0.91      0.91       394
          15       0.85      0.92      0.88       39

## Question 3 Compare three types of features for your classifiers: counts, tf, and tf-idf. Keep the best combination of a classifier and a feature type for the next task.

In [9]:
# The normal TF vectorizer was used in questions 1 and 2, below is the count vectroizer and tf-idf vectorizer
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=5, stop_words="english")

# Load datasets with different vectorizers
X_train_count, X_test_count, y_train_count, y_test_count, feature_names_count, target_names_count = load_dataset(count_vectorizer, verbose=True)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf, feature_names_tfidf, target_names_tfidf = load_dataset(tfidf_vectorizer, verbose=True)

11314 documents - 22.05MB (training set)
7532 documents - 13.80MB (test set)
20 categories
Vectorize training done in 2.658s at 8.297MB/s
n_samples: 11314, n_features: 130107
Vectorize testing done in 1.329s at 10.383MB/s
n_samples: 7532, n_features: 130107
11314 documents - 22.05MB (training set)
7532 documents - 13.80MB (test set)
20 categories
Vectorize training done in 2.318s at 9.514MB/s
n_samples: 11314, n_features: 25631
Vectorize testing done in 1.426s at 9.677MB/s
n_samples: 7532, n_features: 25631


In [10]:
print("Naive Bayes Counts:")
CNBAcc, CNBPre, CBNRe, CNBF1 = classify_and_print_results(X_train_count, y_train_count, X_test_count, y_test_count, "Naive Bayes")
print("Naive Bayes TF-IDF:")
IDFNBAcc, IDFNBPre, IDFNBRe, IDFNBF1 = classify_and_print_results(X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf, "Naive Bayes")

Naive Bayes Counts:
Naive Bayes Accuracy: 0.7728
Naive Bayes Precision: 0.7617
Naive Bayes Recall: 0.7728
Naive Bayes F1 Score: 0.7511
              precision    recall  f1-score   support

           0       0.79      0.77      0.78       319
           1       0.67      0.74      0.70       389
           2       0.20      0.00      0.01       394
           3       0.56      0.77      0.65       392
           4       0.84      0.75      0.79       385
           5       0.65      0.84      0.73       395
           6       0.93      0.65      0.77       390
           7       0.87      0.91      0.89       396
           8       0.96      0.92      0.94       398
           9       0.96      0.87      0.91       397
          10       0.93      0.96      0.95       399
          11       0.67      0.95      0.78       396
          12       0.79      0.66      0.72       393
          13       0.87      0.82      0.85       396
          14       0.83      0.89      0.86       394


In [11]:
print("Logistic Regression Counts:")
CLRAcc, CLRPre, CLRRe, CLRF1 = classify_and_print_results(X_train_count, y_train_count, X_test_count, y_test_count, "Logistic Regression")
print("Logistic Regression TF-IDF:")
IDFLRAcc, IDFLRPre, IDFLRRe, IDFLRF1 = classify_and_print_results(X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf, "Logistic Regression")

Logistic Regression Counts:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.7894
Logistic Regression Precision: 0.7918
Logistic Regression Recall: 0.7894
Logistic Regression F1 Score: 0.7887
              precision    recall  f1-score   support

           0       0.73      0.72      0.73       319
           1       0.66      0.75      0.71       389
           2       0.72      0.67      0.69       394
           3       0.67      0.68      0.68       392
           4       0.76      0.81      0.78       385
           5       0.81      0.69      0.75       395
           6       0.81      0.88      0.85       390
           7       0.81      0.84      0.83       396
           8       0.91      0.92      0.91       398
           9       0.84      0.88      0.86       397
          10       0.92      0.91      0.92       399
          11       0.89      0.87      0.88       396
          12       0.69      0.73      0.71       393
          13       0.82      0.71      0.76       396
          14       0.90      0.89      0.8

In [12]:
print("SVM Counts:")
CSVMAcc, CSVMPre, CSVMRe, CSVMF1 = classify_and_print_results(X_train_count, y_train_count, X_test_count, y_test_count, "SVM")
print("SVM TF-IDF:")
IDFSVMAcc, IDFSVMPre, IDFSVMRe, IDFSVMF1 = classify_and_print_results(X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf, "SVM")

SVM Counts:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM Accuracy: 0.1511
SVM Precision: 0.4219
SVM Recall: 0.1511
SVM F1 Score: 0.1372
              precision    recall  f1-score   support

           0       0.33      0.02      0.03       319
           1       0.05      0.17      0.08       389
           2       0.33      0.02      0.03       394
           3       0.63      0.03      0.06       392
           4       1.00      0.00      0.01       385
           5       0.64      0.05      0.09       395
           6       0.09      0.94      0.17       390
           7       0.39      0.10      0.15       396
           8       0.10      0.25      0.15       398
           9       0.52      0.11      0.18       397
          10       0.58      0.08      0.14       399
          11       0.41      0.16      0.23       396
          12       0.21      0.02      0.03       393
          13       0.28      0.10      0.15       396
          14       0.56      0.05      0.09       394
          15       0.42      0.37      0.39       39

In [13]:
print("MLP Counts:")
CMLPAcc, CMLPPre, CMLPRe, CMLPF1 = classify_and_print_results(X_train_count, y_train_count, X_test_count, y_test_count, "MLP")
print("MLP TF-IDF:")
IDFMLPAcc, IDFMLPPre, IDFMLPRe, IDFMLPF1 = classify_and_print_results(X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf, "MLP")

MLP Counts:
MLP Accuracy: 0.8285
MLP Precision: 0.8326
MLP Recall: 0.8285
MLP F1 Score: 0.8284
              precision    recall  f1-score   support

           0       0.80      0.77      0.79       319
           1       0.70      0.80      0.75       389
           2       0.73      0.74      0.74       394
           3       0.65      0.76      0.70       392
           4       0.82      0.83      0.83       385
           5       0.88      0.73      0.80       395
           6       0.79      0.88      0.83       390
           7       0.90      0.90      0.90       396
           8       0.97      0.93      0.95       398
           9       0.93      0.94      0.93       397
          10       0.94      0.97      0.95       399
          11       0.92      0.91      0.91       396
          12       0.81      0.70      0.75       393
          13       0.92      0.77      0.84       396
          14       0.90      0.89      0.90       394
          15       0.83      0.93      0

In [14]:
data = {
    'NB': [TFNBAcc, IDFNBAcc, CNBAcc],
    'LR': [TFLRAcc, IDFLRAcc, CLRAcc],
    'SVM': [TFSVMAcc, IDFSVMAcc, CSVMAcc],
    'MLP': [TFMLPAcc, IDFMLPAcc, CMLPAcc]
}

# Create a DataFrame for Accuracy
accuracy_table = pd.DataFrame(data, index=['TF', 'IDF', 'C'])

# Create a dictionary for Precision
precision_data = {
    'NB': [TFNBPre, IDFNBPre, CNBPre],
    'LR': [TFLRPre, IDFLRPre, CLRPre],
    'SVM': [TFSVMPre, IDFSVMPre, CSVMPre],
    'MLP': [TFMLPPre, IDFMLPPre, CMLPPre]
}

# Create a DataFrame for Precision
precision_table = pd.DataFrame(precision_data, index=['TF', 'IDF', 'C'])

# Create a dictionary for Recall
recall_data = {
    'NB': [TFNBRe, IDFNBRe, CBNRe],
    'LR': [TFLRRe, IDFLRRe, CLRRe],
    'SVM': [TFSVMRe, IDFSVMRe, CSVMRe],
    'MLP': [TFMLPRe, IDFMLPRe, CMLPRe]
}

# Create a DataFrame for Recall
recall_table = pd.DataFrame(recall_data, index=['TF', 'IDF', 'C'])

# Create a dictionary for F1 Score
f1_data = {
    'NB': [TFNBF1, IDFNBF1, CNBF1],
    'LR': [TFLRF1, IDFLRF1, CLRF1],
    'SVM': [TFSVMF1, IDFSVMF1, CSVMF1],
    'MLP': [TFMLPF1, IDFMLPF1, CMLPF1]
}

# Create a DataFrame for F1 Score
f1_table = pd.DataFrame(f1_data, index=['TF', 'IDF', 'C'])


# Display the DataFrame
print("Accuracy Table:")
print(accuracy_table)
print("\nPrecision Table:")
print(precision_table)
print("\nRecall Table:")
print(recall_table)
print("\nF1 Score Table:")
print(f1_table)

Accuracy Table:
           NB        LR       SVM       MLP
TF   0.817313  0.827801  0.821030  0.847451
IDF  0.823818  0.840813  0.837228  0.858603
C    0.772836  0.789432  0.151089  0.828465

Precision Table:
           NB        LR       SVM       MLP
TF   0.830226  0.830501  0.832077  0.850299
IDF  0.837685  0.843504  0.847194  0.861315
C    0.761668  0.791764  0.421916  0.832577

Recall Table:
           NB        LR       SVM       MLP
TF   0.817313  0.827801  0.821030  0.847451
IDF  0.823818  0.840813  0.837228  0.858603
C    0.772836  0.789432  0.151089  0.828465

F1 Score Table:
           NB        LR       SVM       MLP
TF   0.811799  0.826237  0.821644  0.847185
IDF  0.817977  0.839117  0.837894  0.858213
C    0.751113  0.788702  0.137244  0.828368


## Question 4 Look up the documentation of the CountVectorizer function and experiment with different values for the following parameters for your best classifier-feature combination. For each of these parameters compare different values and store the results.
a. Lowercasing (true or false)
b. stop_words (with or without)
c. analyzer (in combination with ngram_range), try out a few values
d. max_features, try out a few values

In [15]:
# a. Lowercasing (true or false)
# Create an empty list to store results
results = []

# 1. Test Lowercasing
for lowercase in [True, False]:
    tfidf_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        max_df=0.5,
        min_df=5,
        lowercase=lowercase,
        stop_words="english",
        analyzer='word',
        ngram_range=(1, 1),
        max_features=None
    )
    
    X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf, feature_names_tfidf, target_names_tfidf = load_dataset(tfidf_vectorizer, verbose=True)
    CMLPAcc, CMLPPre, CMLPRe, CMLPF1 = classify_and_print_results(X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf, "MLP")
    
    results.append({
        'test': 'Lowercasing',
        'value': lowercase,
        'accuracy': CMLPAcc,
        'precision': CMLPPre,
        'recall': CMLPRe,
        'f1_score': CMLPF1
    })

11314 documents - 22.05MB (training set)
7532 documents - 13.80MB (test set)
20 categories
Vectorize training done in 2.568s at 8.588MB/s
n_samples: 11314, n_features: 25631
Vectorize testing done in 1.293s at 10.672MB/s
n_samples: 7532, n_features: 25631
MLP Accuracy: 0.8586
MLP Precision: 0.8613
MLP Recall: 0.8586
MLP F1 Score: 0.8582
              precision    recall  f1-score   support

           0       0.84      0.81      0.82       319
           1       0.75      0.83      0.79       389
           2       0.80      0.74      0.77       394
           3       0.68      0.78      0.73       392
           4       0.82      0.86      0.84       385
           5       0.87      0.80      0.83       395
           6       0.83      0.88      0.86       390
           7       0.92      0.91      0.91       396
           8       0.97      0.96      0.96       398
           9       0.94      0.96      0.95       397
          10       0.97      0.98      0.98       399
          11

In [16]:
# 2. Test Stop Words
for stop_words in [None, 'english']:
    tfidf_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        max_df=0.5,
        min_df=5,
        lowercase=True,
        stop_words=stop_words,
        analyzer='word',
        ngram_range=(1, 1),
        max_features=None
    )
    
    X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf, feature_names_tfidf, target_names_tfidf = load_dataset(tfidf_vectorizer, verbose=True)
    CMLPAcc, CMLPPre, CMLPRe, CMLPF1 = classify_and_print_results(X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf, "MLP")
    
    results.append({
        'test': 'Stop Words',
        'value': stop_words,
        'accuracy': CMLPAcc,
        'precision': CMLPPre,
        'recall': CMLPRe,
        'f1_score': CMLPF1
    })

11314 documents - 22.05MB (training set)
7532 documents - 13.80MB (test set)
20 categories
Vectorize training done in 2.479s at 8.895MB/s
n_samples: 11314, n_features: 25914
Vectorize testing done in 1.455s at 9.486MB/s
n_samples: 7532, n_features: 25914
MLP Accuracy: 0.8540
MLP Precision: 0.8563
MLP Recall: 0.8540
MLP F1 Score: 0.8534
              precision    recall  f1-score   support

           0       0.84      0.80      0.82       319
           1       0.75      0.81      0.78       389
           2       0.75      0.75      0.75       394
           3       0.69      0.74      0.71       392
           4       0.83      0.85      0.84       385
           5       0.88      0.77      0.83       395
           6       0.81      0.89      0.85       390
           7       0.93      0.90      0.92       396
           8       0.96      0.96      0.96       398
           9       0.93      0.96      0.95       397
          10       0.97      0.98      0.97       399
          11 

In [17]:
# 3. Test Analyzer and Ngram Range
for analyzer in ['word', 'char']:
    for ngram_range in [(1, 1), (1, 2)]:
        tfidf_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            max_df=0.5,
            min_df=5,
            lowercase=True,
            stop_words='english',
            analyzer=analyzer,
            ngram_range=ngram_range,
            max_features=None
        )
        
        X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf, feature_names_tfidf, target_names_tfidf = load_dataset(tfidf_vectorizer, verbose=True)
        CMLPAcc, CMLPPre, CMLPRe, CMLPF1 = classify_and_print_results(X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf, "MLP")
        
        results.append({
            'test': 'Analyzer and Ngram Range',
            'analyzer': analyzer,
            'ngram_range': ngram_range,
            'accuracy': CMLPAcc,
            'precision': CMLPPre,
            'recall': CMLPRe,
            'f1_score': CMLPF1
        })

11314 documents - 22.05MB (training set)
7532 documents - 13.80MB (test set)
20 categories
Vectorize training done in 2.310s at 9.549MB/s
n_samples: 11314, n_features: 25631
Vectorize testing done in 1.326s at 10.406MB/s
n_samples: 7532, n_features: 25631
MLP Accuracy: 0.8586
MLP Precision: 0.8613
MLP Recall: 0.8586
MLP F1 Score: 0.8582
              precision    recall  f1-score   support

           0       0.84      0.81      0.82       319
           1       0.75      0.83      0.79       389
           2       0.80      0.74      0.77       394
           3       0.68      0.78      0.73       392
           4       0.82      0.86      0.84       385
           5       0.87      0.80      0.83       395
           6       0.83      0.88      0.86       390
           7       0.92      0.91      0.91       396
           8       0.97      0.96      0.96       398
           9       0.94      0.96      0.95       397
          10       0.97      0.98      0.98       399
          11



MLP Accuracy: 0.8602
MLP Precision: 0.8626
MLP Recall: 0.8602
MLP F1 Score: 0.8600
              precision    recall  f1-score   support

           0       0.89      0.78      0.83       319
           1       0.74      0.84      0.78       389
           2       0.77      0.78      0.78       394
           3       0.74      0.76      0.75       392
           4       0.82      0.84      0.83       385
           5       0.86      0.80      0.83       395
           6       0.80      0.90      0.85       390
           7       0.93      0.89      0.91       396
           8       0.95      0.95      0.95       398
           9       0.92      0.95      0.94       397
          10       0.97      0.97      0.97       399
          11       0.93      0.92      0.93       396
          12       0.79      0.78      0.79       393
          13       0.88      0.87      0.88       396
          14       0.91      0.91      0.91       394
          15       0.88      0.93      0.90       39



11314 documents - 22.05MB (training set)
7532 documents - 13.80MB (test set)
20 categories
Vectorize training done in 2.979s at 7.403MB/s
n_samples: 11314, n_features: 28
Vectorize testing done in 3.032s at 4.551MB/s
n_samples: 7532, n_features: 28


In [9]:
# 4. Test Max Features
for max_features in [None, 1000, 5000]:
    tfidf_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        max_df=0.5,
        min_df=5,
        lowercase=True,
        stop_words='english',
        analyzer='word',
        ngram_range=(1, 1),
        max_features=max_features
    )
    
    X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf, feature_names_tfidf, target_names_tfidf = load_dataset(tfidf_vectorizer, verbose=True)
    CMLPAcc, CMLPPre, CMLPRe, CMLPF1 = classify_and_print_results(X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf, "MLP")
    
    results.append({
        'test': 'Max Features',
        'value': max_features,
        'accuracy': CMLPAcc,
        'precision': CMLPPre,
        'recall': CMLPRe,
        'f1_score': CMLPF1
    })

11314 documents - 22.05MB (training set)
7532 documents - 13.80MB (test set)
20 categories
Vectorize training done in 2.739s at 8.052MB/s
n_samples: 11314, n_features: 25631
Vectorize testing done in 1.636s at 8.433MB/s
n_samples: 7532, n_features: 25631
MLP Accuracy: 0.8586
MLP Precision: 0.8613
MLP Recall: 0.8586
MLP F1 Score: 0.8582
              precision    recall  f1-score   support

           0       0.84      0.81      0.82       319
           1       0.75      0.83      0.79       389
           2       0.80      0.74      0.77       394
           3       0.68      0.78      0.73       392
           4       0.82      0.86      0.84       385
           5       0.87      0.80      0.83       395
           6       0.83      0.88      0.86       390
           7       0.92      0.91      0.91       396
           8       0.97      0.96      0.96       398
           9       0.94      0.96      0.95       397
          10       0.97      0.98      0.98       399
          11 

NameError: name 'pd' is not defined

In [10]:
# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

# Display the DataFrame
print(results_df)

                        test    value  accuracy  precision    recall  \
0                Lowercasing     True  0.858603   0.861315  0.858603   
1                Lowercasing    False  0.858471   0.861084  0.858471   
2                 Stop Words     None  0.853956   0.856261  0.853956   
3                 Stop Words  english  0.858603   0.861315  0.858603   
4   Analyzer and Ngram Range      NaN  0.858603   0.861315  0.858603   
5   Analyzer and Ngram Range      NaN  0.860860   0.863019  0.860860   
6   Analyzer and Ngram Range      NaN  0.159984   0.154684  0.159984   
7   Analyzer and Ngram Range      NaN  0.637812   0.642718  0.637812   
8               Max Features     None  0.858603   0.861315  0.858603   
9               Max Features     1000  0.646973   0.651936  0.646973   
10              Max Features     5000  0.809878   0.813018  0.809878   

    f1_score analyzer ngram_range  
0   0.858213      NaN         NaN  
1   0.858194      NaN         NaN  
2   0.853414      NaN      

In [5]:
# Optimal combination
tfidf_vectorizer = TfidfVectorizer(
    sublinear_tf=True, 
    max_df=0.5, 
    min_df=5, 
    stop_words="english", 
    ngram_range=(1, 2),
    lowercase=True
)

X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf, feature_names_tfidf, target_names_tfidf = load_dataset(tfidf_vectorizer, verbose=True)
CMLPAcc, CMLPPre, CMLPRe, CMLPF1 = classify_and_print_results(X_train_tfidf, y_train_tfidf, X_test_tfidf, y_test_tfidf, "MLP")


11314 documents - 22.05MB (training set)
7532 documents - 13.80MB (test set)
20 categories
Vectorize training done in 5.090s at 4.333MB/s
n_samples: 11314, n_features: 64194
Vectorize testing done in 2.197s at 6.282MB/s
n_samples: 7532, n_features: 64194
MLP Accuracy: 0.8609
MLP Precision: 0.8630
MLP Recall: 0.8609
MLP F1 Score: 0.8606
              precision    recall  f1-score   support

           0       0.88      0.79      0.83       319
           1       0.75      0.82      0.78       389
           2       0.78      0.77      0.78       394
           3       0.73      0.76      0.74       392
           4       0.82      0.84      0.83       385
           5       0.87      0.81      0.83       395
           6       0.82      0.89      0.86       390
           7       0.92      0.90      0.91       396
           8       0.95      0.96      0.96       398
           9       0.93      0.95      0.94       397
          10       0.97      0.97      0.97       399
          11 

In [6]:
print(CMLPAcc, CMLPPre, CMLPRe, CMLPF1)

0.8608603292618162 0.8630192877572318 0.8608603292618162 0.8606036212727489
