In [1]:
# Mount Google Drive at /content/drive; every path used later in this notebook
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec, KeyedVectors
import numpy as np

In [3]:
# Read the preprocessed corpus from Drive into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/NLPCoursework/Task2/preprocessed_data.csv"
)

In [4]:
# Join topic and body text into a single document per row.
# pandas reads empty CSV fields back as NaN, and `str + NaN` is NaN, which would
# later break both the sklearn vectorizers and `doc.split()` in the Word2Vec step.
# Treat a missing topic/text as an empty string so every row stays a valid
# document and the row order (and thus alignment with the labels) is preserved.
df["Combined_Text"] = df["Topic"].fillna("") + " " + df["Text"].fillna("")

In [5]:
# Plain Python list of the combined documents, one entry per DataFrame row.
text_data = df["Combined_Text"].to_list()

In [6]:
# Count Vectorization
def count_vectorization(data, max_features=1000):
    """Fit a CountVectorizer on ``data`` and return its matrix and vocabulary.

    Args:
        data: Iterable of text documents handed to ``fit_transform``.
        max_features: Forwarded to ``CountVectorizer(max_features=...)``.

    Returns:
        A ``(matrix, feature_names)`` tuple: the result of ``fit_transform`` and
        the result of ``get_feature_names_out()`` from the fitted vectorizer.
    """
    bag_of_words = CountVectorizer(max_features=max_features)
    doc_term_counts = bag_of_words.fit_transform(data)
    vocabulary_terms = bag_of_words.get_feature_names_out()
    return doc_term_counts, vocabulary_terms

In [7]:
# TF-IDF Vectorization
def tfidf_vectorization(data, max_features=1000):
    """Fit a TfidfVectorizer on ``data`` and return its matrix and vocabulary.

    Args:
        data: Iterable of text documents handed to ``fit_transform``.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.

    Returns:
        A ``(matrix, feature_names)`` tuple: the result of ``fit_transform`` and
        the result of ``get_feature_names_out()`` from the fitted vectorizer.
    """
    tfidf = TfidfVectorizer(max_features=max_features)
    doc_term_weights = tfidf.fit_transform(data)
    vocabulary_terms = tfidf.get_feature_names_out()
    return doc_term_weights, vocabulary_terms

In [8]:
# Word2vec Vectorization
def word2vec_vectorization(data, embedding_dim=100, min_count=1, seed=1, workers=3):
    """Train Word2Vec on ``data`` and return one averaged vector per document.

    Each document is whitespace-tokenised, a Word2Vec model is trained on the
    whole corpus, and every document is represented by the mean of the vectors
    of its in-vocabulary tokens.

    Args:
        data: Iterable of documents. Entries that are not ``str`` (e.g. NaN or
            None from missing CSV fields) are treated as empty documents so the
            output stays row-aligned with the input.
        embedding_dim: Size of the word vectors (``vector_size`` in gensim).
        min_count: Minimum token frequency kept in the vocabulary.
        seed: Seed passed to Word2Vec. Per the gensim docs a fully reproducible
            run additionally needs ``workers=1`` and a fixed ``PYTHONHASHSEED``.
        workers: Number of training threads passed to Word2Vec.

    Returns:
        ``numpy.ndarray`` of shape ``(n_documents, embedding_dim)``, dtype
        float64. Documents with no in-vocabulary tokens get an all-zero row.
    """
    # Tokenise; anything that is not a string becomes an empty token list
    # instead of raising AttributeError on ``.split()``.
    sentences = [doc.split() if isinstance(doc, str) else [] for doc in data]

    # Word2Vec cannot be trained on an empty vocabulary. With no tokens at all
    # every document vector would be zero anyway, so return that directly.
    if not any(sentences):
        return np.zeros((len(sentences), embedding_dim))

    model = Word2Vec(
        sentences=sentences,
        vector_size=embedding_dim,
        min_count=min_count,
        seed=seed,
        workers=workers,
    )

    doc_vectors = np.zeros((len(sentences), embedding_dim))
    for row, tokens in enumerate(sentences):
        # Tokens pruned by ``min_count`` are absent from ``model.wv``.
        known_tokens = [tok for tok in tokens if tok in model.wv]
        if not known_tokens:
            continue  # leave the zero vector in place
        for tok in known_tokens:
            doc_vectors[row] += model.wv[tok]
        doc_vectors[row] /= len(known_tokens)

    return doc_vectors

In [9]:
# Perform vectorizations
# Build all three feature representations from the same list of documents.
X_counts, feature_names_counts = count_vectorization(data=text_data)
X_tfidf, feature_names_tfidf = tfidf_vectorization(data=text_data)
X_word2vec = word2vec_vectorization(data=text_data)

In [10]:
# Save the vectorized data into NumPy files.
# The sklearn vectorizers return SciPy sparse matrices. Passing one straight to
# np.save wraps it in a 0-d object array and pickles it, so the file is not a
# numeric array and np.load needs allow_pickle=True plus .item() to recover it.
# Densify first so each .npy file holds a plain 2-D array that np.load reads
# directly. The dense size is n_documents x max_features; for a corpus too
# large for that, switch to scipy.sparse.save_npz instead.
# NOTE(review): any existing loader that calls .item() on these files must be
# updated to use the loaded array as-is.
np.save("/content/drive/MyDrive/NLPCoursework/Task2/X_counts.npy", X_counts.toarray())
np.save("/content/drive/MyDrive/NLPCoursework/Task2/X_tfidf.npy", X_tfidf.toarray())
np.save("/content/drive/MyDrive/NLPCoursework/Task2/X_word2vec.npy", X_word2vec)

In [11]:
# Save the feature names into NumPy files.
# get_feature_names_out() yields an object-dtype array, which np.save can only
# store by pickling (np.load then requires allow_pickle=True). Casting to a
# unicode dtype stores the names as a regular string array that loads without
# pickle and still loads fine when allow_pickle=True is passed.
np.save(
    "/content/drive/MyDrive/NLPCoursework/Task2/feature_names_counts.npy",
    np.asarray(feature_names_counts, dtype=str),
)
np.save(
    "/content/drive/MyDrive/NLPCoursework/Task2/feature_names_tfidf.npy",
    np.asarray(feature_names_tfidf, dtype=str),
)

In [12]:
# Pull the labels out as a list, store them as a NumPy file, and write the
# combined text together with its category to a CSV (without the index column).
y = df["Category"].to_list()
np.save(file="/content/drive/MyDrive/NLPCoursework/Task2/y.npy", arr=y)
df.loc[:, ["Combined_Text", "Category"]].to_csv(
    path_or_buf="/content/drive/MyDrive/NLPCoursework/Task2/text_and_categories.csv",
    index=False,
)