# Task 1: Text Classification
## Syed Hamza Ali

### Import all the necessary libraries

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
from sklearn.model_selection import GridSearchCV
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import numpy as np

### Note: running on all 20 categories took too long, so only four categories are used below

### Fetch Data

In [2]:
# Load the corpus with subset="all" purely to display the available newsgroup names.
data = fetch_20newsgroups(subset="all")
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [3]:
# Limit the experiment to four newsgroups and load only the 'train' subset for them.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'rec.motorcycles',
    'sci.med',
]
data = fetch_20newsgroups(subset='train', categories=categories)

# Turn every raw post into a list of tokens with gensim's simple_preprocess.
tokenized_data = list(map(simple_preprocess, data.data))

# Hold out 20% of the documents for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    tokenized_data,
    data.target,
    test_size=0.2,
    random_state=2,
)

## Word2Vec Model

In [4]:
# Train Word2Vec embeddings on the training documents only.
# A fixed seed together with a single worker thread makes the learned vectors (and therefore
# the reported accuracies) repeatable between runs; with several workers the training order
# depends on thread scheduling.
# NOTE(review): gensim also documents that PYTHONHASHSEED must be fixed for results to be
# identical across separate interpreter launches.
word2vec_model = Word2Vec(
    sentences=X_train,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=2,
)

# Function to transform documents to Word2Vec vectors
def document_vector(model, doc):
    """
    Build one fixed-length feature vector for a tokenised document.

    The vectors of all tokens found in ``model.wv`` are summed and divided by the
    number of such tokens. Tokens missing from ``model.wv`` are ignored. When no
    token is found (including an empty document) the all-zero vector is returned.

    Parameters
    ----------
    model : object exposing ``vector_size`` and a ``wv`` lookup (e.g. a trained Word2Vec model)
    doc : iterable of str
        Tokens of a single document.

    Returns
    -------
    numpy.ndarray of shape ``(model.vector_size,)`` and dtype float32.
    """
    lookup = model.wv
    total = np.zeros((model.vector_size,), dtype="float32")
    hits = 0
    for token in doc:
        # Skip tokens the model has no vector for
        if token not in lookup:
            continue
        total += lookup[token]
        hits += 1
    # Nothing matched: hand back the zero vector instead of dividing by zero
    if hits == 0:
        return total
    total /= hits
    return total

In [5]:
# Convert each tokenised document of both splits into its averaged Word2Vec vector
# (training split first, then the test split).
X_train_w2v, X_test_w2v = (
    np.array([document_vector(word2vec_model, tokens) for tokens in split])
    for split in (X_train, X_test)
)

In [6]:
# Classifiers evaluated on the Word2Vec features.
# NOTE: Multinomial Naive Bayes is left out for the Word2Vec and Doc2Vec embeddings because it
# had issues with them - presumably because it expects non-negative features while embedding
# vectors contain negative components (TODO confirm).
algorithms = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Support Vector Machines': SVC(),
    # Seeded so the tree, and therefore its reported accuracy, is repeatable between runs.
    'Decision Trees': DecisionTreeClassifier(random_state=2)
}

### Iterating Through each Algorithm with Word2Vec

In [7]:
# Accuracy table shared by all experiments, keyed by (algorithm name, feature extractor).
results = {}

# Train every classifier on the Word2Vec features and score it on the held-out split.
for algorithm_name, algorithm in algorithms.items():
    # fit() returns the estimator itself, so training and prediction can be chained
    y_pred_w2v = algorithm.fit(X_train_w2v, y_train).predict(X_test_w2v)
    accuracy_w2v = accuracy_score(y_test, y_pred_w2v)
    results[algorithm_name, 'Word2Vec'] = accuracy_w2v

## Doc2Vec

In [8]:
# Split data into train and test sets.
# The seed matches the one used for the Word2Vec and CountVectorizer splits so that every
# feature extractor is scored on the same held-out documents; otherwise the accuracies
# collected in `results` are not comparable with each other.
X_train, X_test, y_train, y_test = train_test_split(tokenized_data, data.target, test_size=0.2, random_state=2)

# Tag the documents with their position inside their own split (as a string).
# Only the training documents are used to train Doc2Vec; the test documents are wrapped the
# same way so that both splits expose the tokens through `.words`.
tagged_train_docs = [TaggedDocument(words=text, tags=[str(i)]) for i, text in enumerate(X_train)]
tagged_test_docs = [TaggedDocument(words=text, tags=[str(i)]) for i, text in enumerate(X_test)]

In [9]:
# Train Doc2Vec on the tagged training documents.
# A fixed seed together with a single worker thread makes training repeatable between runs;
# with several workers the training order depends on thread scheduling.
# NOTE(review): gensim also documents that PYTHONHASHSEED must be fixed for results to be
# identical across separate interpreter launches.
doc2vec_model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=1, epochs=20, seed=2)
doc2vec_model.build_vocab(tagged_train_docs)
doc2vec_model.train(tagged_train_docs, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

In [10]:
# Infer one Doc2Vec vector per document from its tokens
# (training split first, then the test split).
X_train_d2v, X_test_d2v = (
    np.array([doc2vec_model.infer_vector(tagged.words) for tagged in tagged_docs])
    for tagged_docs in (tagged_train_docs, tagged_test_docs)
)

In [11]:
# Fresh, unfitted classifiers for the Doc2Vec experiment.
algorithms = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Support Vector Machines': SVC(),
    # Seeded so the tree, and therefore its reported accuracy, is repeatable between runs.
    'Decision Trees': DecisionTreeClassifier(random_state=2)
}

In [12]:
# Train every classifier on the Doc2Vec features and record its held-out accuracy.
for algorithm_name, algorithm in algorithms.items():
    # fit() returns the estimator itself, so training and prediction can be chained
    y_pred = algorithm.fit(X_train_d2v, y_train).predict(X_test_d2v)
    accuracy = accuracy_score(y_test, y_pred)
    results[algorithm_name, 'Doc2Vec'] = accuracy

## Count Vectorizer

### Count Vectorizer Train and Test Data

In [13]:
# Load the 'train' subset of the selected newsgroups.
data = fetch_20newsgroups(categories=categories, subset='train')

# Split the raw (untokenised) posts; CountVectorizer does its own tokenisation.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=2,
)

# Learn the vocabulary from the training posts only, then count terms in both splits.
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

In [14]:
# Fresh, unfitted classifiers for the CountVectorizer experiment.
# Multinomial Naive Bayes is included here, on the term-count features.
algorithms = {
    'Multinomial Naïve Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Support Vector Machines': SVC(),
    # Seeded so the tree, and therefore its reported accuracy, is repeatable between runs.
    'Decision Trees': DecisionTreeClassifier(random_state=2)
}

In [15]:
# Train every classifier on the term-count matrix and record its held-out accuracy.
for algorithm_name, algorithm in algorithms.items():
    # fit() returns the estimator itself, so training and prediction can be chained
    y_pred_counts = algorithm.fit(X_train_counts, y_train).predict(X_test_counts)
    accuracy_counts = accuracy_score(y_test, y_pred_counts)
    results[algorithm_name, 'CountVectorizer'] = accuracy_counts

### Writing the results to a text file in tabular format

In [16]:
# Write the accuracy table as tab-separated text: one header row, then one row per
# (algorithm, feature extractor) configuration stored in `results`.
# The encoding is set explicitly because the "Multinomial Naïve Bayes" label contains a
# non-ASCII character; without it the output depends on the platform's default encoding.
with open("Syed_Hamza_Task1_Text_Classification.txt", "w", encoding="utf-8") as file:
    file.write("Algorithm\tFeature Extractor\tAccuracy\n")
    for configuration, accuracy in results.items():
        file.write(f"{configuration[0]}\t{configuration[1]}\t{accuracy}\n")

### Best Configuration

In [17]:
# Select the (algorithm, feature extractor) pair with the highest accuracy; on a tie the
# configuration that was stored first wins.
best_configuration, best_accuracy = max(results.items(), key=lambda item: item[1])

# Report the winning configuration
print("Best performing configuration:")
print("Algorithm:", best_configuration[0])
print("Feature Extractor:", best_configuration[1])
print("Accuracy:", best_accuracy)

Best performing configuration:
Algorithm: Multinomial Naïve Bayes
Feature Extractor: CountVectorizer
Accuracy: 0.9707317073170731
