<a href="https://colab.research.google.com/github/Lauralug0/GBC/blob/main/LauraLugo_Task0_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Classification
## This notebook outlines the usage of NLP Feature extraction (CountVectorizer, TfidfVectorizer) in classification of text documents

### Import all the necessary libraries

In [1]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [2]:
from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import pandas as pd
from gensim.models import Word2Vec
import numpy as np

### Choose a few categories fro the entire 20 categories

In [3]:
# Load some categories from the training set
categories = [
    'alt.atheism',
    'talk.religion.misc',
]

In [4]:
print("Loading 20 newsgroups dataset for categories:")
print(categories)

data_train = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
data_test = fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'))

print(f"{len(data_train.data)} documentos de entrenamiento")
print(f"{len(data_train.target_names)} categorías")



Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']
857 documentos de entrenamiento
2 categorías


### Fetch documents for these 2 categories

In [5]:
data = fetch_20newsgroups(subset='train', categories=categories)
print(f"{len(data.filenames)} documents")
print(f"{len(data.target_names)} categories")
print()

857 documents
2 categories



### Define a pipeline combining a text feature extractor with a simple classifier

In [6]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(tol=1e-3)),
])

### Specify parameter grid
- 'vect__max_df': (0.5, 0.75, 1.0)
- 'vect__max_features': (None, 5000, 10000, 50000)
- 'vect__ngram_range': ((1, 1), (1, 2))
- 'tfidf__use_idf': (True, False)
- 'tfidf__norm': ('l1', 'l2')
- 'clf__max_iter': (20,)
- 'clf__alpha': (0.00001, 0.000001)
- 'clf__penalty': ('l2', 'elasticnet')
- 'clf__max_iter': (10, 50, 80)

In [7]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__max_iter': (10, 50, 80),
}

### Find the best parameters for both the feature extraction and the classifier

### Build a GridSearch with the pipeline and parameter grid

In [8]:
grid_search = GridSearchCV(pipeline, parameters, cv=5,
                           n_jobs=-1, verbose=1)








### Start the grid search

In [9]:
print("Starting Grid Search...")
t0 = time()
grid_search.fit(data_train.data, data_train.target)

Starting Grid Search...
Fitting 5 folds for each of 24 candidates, totalling 120 fits


### Best Score

In [10]:
print("Best score: %0.3f" % grid_search.best_score_)

Best score: 0.796


### Best Parameter

In [11]:
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best parameters set:
	clf__alpha: 1e-06
	clf__max_iter: 20
	clf__penalty: 'elasticnet'
	vect__max_df: 0.75
	vect__ngram_range: (1, 2)


### Test

In [12]:
y_pred = grid_search.predict(data_test.data)
acc_test = accuracy_score(data_test.target, y_pred)
print(f"Accuracy: {acc_test}")


Accuracy: 0.6894736842105263


### Use the model to classify a piece of text

In [13]:
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': LinearSVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'SGD': SGDClassifier()
}

# Feature extractors
vectorizers = {
    'CountVectorizer': CountVectorizer(stop_words='english'),
    'TfidfVectorizer': TfidfVectorizer(stop_words='english')
}

# List to store results
results = []

# Train and test each combination
for vect_name, vect in vectorizers.items():
    X_train_vect = vect.fit_transform(data_train.data)
    X_test_vect = vect.transform(data_test.data)

    for model_name, model in models.items():
        model.fit(X_train_vect, data_train.target)
        predictions = model.predict(X_test_vect)
        acc = accuracy_score(data_test.target, predictions)

        results.append({
            'Feature Extractor': vect_name,
            'Model': model_name,
            'Accuracy': round(acc, 4)
        })

# Display results in a sorted table
table = pd.DataFrame(results)
table = table.sort_values(by='Accuracy', ascending=False)

print("\n=== Final Benchmark Results ===")
print(table)


=== Final Benchmark Results ===
  Feature Extractor                Model  Accuracy
0   CountVectorizer          Naive Bayes    0.7018
7   TfidfVectorizer                  SVM    0.7018
9   TfidfVectorizer                  SGD    0.6912
6   TfidfVectorizer  Logistic Regression    0.6807
1   CountVectorizer  Logistic Regression    0.6789
5   TfidfVectorizer          Naive Bayes    0.6509
4   CountVectorizer                  SGD    0.6491
2   CountVectorizer                  SVM    0.6421
3   CountVectorizer        Decision Tree    0.6105
8   TfidfVectorizer        Decision Tree    0.5877


W2v

In [14]:
sentences = [text.split() for text in data_train.data]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=2)

# Get average word vectors per document
def document_vector(doc):
    words = [w for w in doc.split() if w in w2v_model.wv]
    if len(words) == 0:
        return np.zeros(100)
    return np.mean(w2v_model.wv[words], axis=0)

X_train_w2v = np.array([document_vector(doc) for doc in data_train.data])
X_test_w2v = np.array([document_vector(doc) for doc in data_test.data])

Saving Results

In [15]:
table.to_csv("LauraLugo_Task0_Text_Classification.txt", sep='\t', index=False)
print("Saved!")


Saved!
