In [295]:
# from google.colab import drive
# drive.mount('/content/drive')

In [296]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec, KeyedVectors
import numpy as np

In [297]:
# Read the preprocessed training and testing splits from the working directory
df_training, df_testing = map(
    pd.read_csv,
    ("./preprocessed_training_data.csv", "./preprocessed_testing_data.csv"),
)

In [298]:
# Preview the training frame (columns shown: "Unnamed: 0", "Text", "Category").
df_training

Unnamed: 0,Text,Category
0,argentin 198687 grainoilse registr argentin gr...,wheat
1,champion product ch approv stock split champio...,earn
2,comput termin system cpml complet sale comput ...,acq
3,cobanco inc cbco year net shr 34 ct v 119 dlr ...,earn
4,ohio mattress omt may lower 1st qtr net ohio m...,acq
...,...,...
6144,poehl warn dollar fall bundesbank presid karl ...,money-supply
6145,bank japan buy dollar shortli open around 1453...,money-fx
6146,bank japan interven soon tokyo open bank japan...,money-fx
6147,south korean fix 25month high bank korea said ...,money-fx


In [299]:
# Preview the testing frame (columns shown: "Unnamed: 0", "Text", "Category").
df_testing

Unnamed: 0,Text,Category
0,asian export fear damag usjapan rift mount tra...,trade
1,australian foreign ship ban end nsw port hit t...,ship
2,sri lanka get usda approv wheat price food dep...,wheat
3,sumitomo bank aim quick recoveri merger sumito...,acq
4,bundesbank alloc 61 billion mark tender bundes...,interest
...,...,...
2405,philippin trade gap widen januaryaugust philip...,trade
2406,new zealand impos sanction fiji new zealand im...,sugar
2407,iran soviet union swap crude refin product sov...,crude
2408,nz chase corp make offer entregrowth chase cor...,acq


In [300]:
# Pull the document text out of each frame as a plain list of strings.
# A document that ended up empty is read back from CSV as NaN (a float) by default,
# which would break `str.split` and the scikit-learn vectorizers further down;
# map such entries to "" so every element is a string.
text_data_training = df_training["Text"].fillna("").tolist()
text_data_testing = df_testing["Text"].fillna("").tolist()

In [301]:
# Peek at the first few training documents only; echoing the whole list
# floods the notebook output and bloats the saved file.
text_data_training[:3]

['argentin 198687 grainoilse registr argentin grain board figur show crop registr grain oilse product februari 11 thousand tonn show futur shipment month 198687 total 198586 total februari 12 1986 bracket bread wheat prev 16558 feb 8720 march 1646 total 26924 41610 maiz mar 480 total 480 nil sorghum nil nil oilse export registr sunflowerse total 150 79 soybean may 200 total 200 nil board also detail export registr subproduct follow subproduct wheat prev 399 feb 487 march 132 apr 100 total 1118 827 linse prev 348 feb 329 mar 68 apr 63 total 808 874 soybean prev 1009 feb 451 mar nil apr nil may 200 total 1661 2185 sunflowerse prev 486 feb 615 mar 251 apr 145 total 1498 1453 veget oil registr sunoil prev 374 feb 1073 mar 245 apr 32 may nil jun 100 total 1824 1176 linoil prev 159 feb 236 mar 204 apr 20 total 618 761 soybean oil prev 37 feb 211 mar nil apr 20 may 90 jun 130 jul 70 total 558 337 reuter',
 'champion product ch approv stock split champion product inc said board director approv

In [302]:
# Peek at the first few testing documents only; echoing the whole list
# floods the notebook output and bloats the saved file.
text_data_testing[:3]

['asian export fear damag usjapan rift mount trade friction u japan rais fear among mani asia export nation row could inflict farreach econom damag businessmen offici said told reuter correspond asian capit u move japan might boost protectionist sentiment u lead curb american import product export said conflict would hurt longrun shortterm tokyo loss might gain u said impos 300 mln dlr tariff import japanes electron good april 17 retali japan alleg failur stick pact sell semiconductor world market cost unoffici japanes estim put impact tariff 10 billion dlr spokesman major electron firm said would virtual halt export product hit new tax wouldnt abl busi said spokesman lead japanes electron firm matsushita electr industri co ltd mct tariff remain place length time beyond month mean complet eros export good subject tariff u said tom murtha stock analyst tokyo offic broker jame capel co taiwan businessmen offici also worri awar serious u threat japan serv warn u said senior taiwanes trade

In [303]:
# Show the first five rows of the training frame.
df_training.head()

Unnamed: 0,Text,Category
0,argentin 198687 grainoilse registr argentin gr...,wheat
1,champion product ch approv stock split champio...,earn
2,comput termin system cpml complet sale comput ...,acq
3,cobanco inc cbco year net shr 34 ct v 119 dlr ...,earn
4,ohio mattress omt may lower 1st qtr net ohio m...,acq


In [304]:
# Show the first five rows of the testing frame.
df_testing.head()

Unnamed: 0,Text,Category
0,asian export fear damag usjapan rift mount tra...,trade
1,australian foreign ship ban end nsw port hit t...,ship
2,sri lanka get usda approv wheat price food dep...,wheat
3,sumitomo bank aim quick recoveri merger sumito...,acq
4,bundesbank alloc 61 billion mark tender bundes...,interest


In [305]:
# Count Vectorization
def count_vectorization(data, max_features=1000):
    """Turn raw documents into a bag-of-words count matrix.

    Args:
        data: Iterable of document strings.
        max_features: Value passed to ``CountVectorizer`` as ``max_features``
            (cap on the vocabulary size).

    Returns:
        A ``(matrix, terms)`` pair: the document-term matrix obtained by
        fitting the vectorizer on ``data``, and the feature names reported by
        the fitted vectorizer.
    """
    bow = CountVectorizer(max_features=max_features)
    doc_term_counts = bow.fit_transform(data)
    terms = bow.get_feature_names_out()
    return doc_term_counts, terms

In [306]:
# TF-IDF Vectorization
def tfidf_vectorization(data, max_features=1000):
    """Turn raw documents into a TF-IDF weighted document-term matrix.

    Args:
        data: Iterable of document strings.
        max_features: Value passed to ``TfidfVectorizer`` as ``max_features``
            (cap on the vocabulary size).

    Returns:
        A ``(matrix, terms)`` pair: the TF-IDF matrix obtained by fitting the
        vectorizer on ``data``, and the feature names reported by the fitted
        vectorizer.
    """
    weighter = TfidfVectorizer(max_features=max_features)
    weighted_matrix = weighter.fit_transform(data)
    terms = weighter.get_feature_names_out()
    return weighted_matrix, terms

In [307]:
# Word2vec Vectorization
def word2vec_vectorization(data, embedding_dim=100, model=None):
    """Embed each document as the mean of its Word2Vec word vectors.

    Args:
        data: Iterable of document strings; each is tokenised with
            ``str.split()``.
        embedding_dim: Size of the word vectors when a new model is trained.
            Ignored when ``model`` is given (the model's own vector size is
            used instead).
        model: Optional already-trained ``Word2Vec`` model. Pass the model
            trained on the training split to embed held-out documents in the
            same vector space; words unknown to that model are skipped.
            When ``None`` (the default) a new model is trained on ``data``.

    Returns:
        ``np.ndarray`` with one row per document. A document with no known
        words (including an empty document) gets an all-zero row.
    """
    sentences = [doc.split() for doc in data]

    if model is None:
        # A corpus without a single token (no documents, or only blank ones)
        # gives Word2Vec nothing to build a vocabulary from; every document
        # vector would be zero anyway, so return that directly.
        if not any(sentences):
            return np.zeros((len(sentences), embedding_dim))
        model = Word2Vec(sentences=sentences, vector_size=embedding_dim, min_count=1)
    else:
        # Rows must match the dimensionality of the supplied embeddings.
        embedding_dim = model.wv.vector_size

    keyed_vectors = model.wv
    doc_vectors = np.zeros((len(sentences), embedding_dim))
    for row, tokens in zip(doc_vectors, sentences):
        # `row` is a view into `doc_vectors`, so in-place updates fill the result.
        known = [token for token in tokens if token in keyed_vectors]
        for token in known:
            row += keyed_vectors[token]
        if known:
            row /= len(known)

    return doc_vectors

In [308]:
# Vectorise the training documents with each of the three schemes
X_counts_train, feature_names_counts_train = count_vectorization(
    text_data_training, max_features=1000
)
X_tfidf_train, feature_names_tfidf_train = tfidf_vectorization(
    text_data_training, max_features=1000
)
X_word2vec_train = word2vec_vectorization(text_data_training, embedding_dim=100)

In [309]:
# Vectorise the test documents in the feature space learned from the TRAINING split.
# Fitting a separate vectorizer on the test split gives it its own vocabulary (and,
# for TF-IDF, its own IDF weights), so column j of the test matrix would not mean the
# same thing as column j of the training matrix. max_features=1000 mirrors the default
# used by count_vectorization / tfidf_vectorization for the training features.
count_vectorizer_train = CountVectorizer(max_features=1000).fit(text_data_training)
X_counts_test = count_vectorizer_train.transform(text_data_testing)
feature_names_counts_test = count_vectorizer_train.get_feature_names_out()

tfidf_vectorizer_train = TfidfVectorizer(max_features=1000).fit(text_data_training)
X_tfidf_test = tfidf_vectorizer_train.transform(text_data_testing)
feature_names_tfidf_test = tfidf_vectorizer_train.get_feature_names_out()

# NOTE(review): this still trains a separate Word2Vec model on the test split, so these
# document vectors do not share a vector space with X_word2vec_train. TODO: embed the
# test documents with the model trained on the training split.
X_word2vec_test = word2vec_vectorization(text_data_testing)

In [310]:
# Persist the three training feature matrices as .npy files
# NOTE(review): the count / TF-IDF matrices come straight from scikit-learn vectorizers
# and are presumably SciPy sparse; np.save would then store them as pickled objects —
# confirm the loaders use allow_pickle=True.
for npy_path, matrix in (
    ("./X_counts_train.npy", X_counts_train),
    ("./X_tfidf_train.npy", X_tfidf_train),
    ("./X_word2vec_train.npy", X_word2vec_train),
):
    np.save(npy_path, matrix)

In [311]:
# Persist the three testing feature matrices as .npy files
# NOTE(review): the count / TF-IDF matrices come straight from scikit-learn vectorizers
# and are presumably SciPy sparse; np.save would then store them as pickled objects —
# confirm the loaders use allow_pickle=True.
for npy_path, matrix in (
    ("./X_counts_test.npy", X_counts_test),
    ("./X_tfidf_test.npy", X_tfidf_test),
    ("./X_word2vec_test.npy", X_word2vec_test),
):
    np.save(npy_path, matrix)

In [312]:
# Persist the training feature-name arrays as .npy files
for npy_path, names in (
    ("./feature_names_counts_train.npy", feature_names_counts_train),
    ("./feature_names_tfidf_train.npy", feature_names_tfidf_train),
):
    np.save(npy_path, names)

In [313]:
# Persist the testing feature-name arrays as .npy files
for npy_path, names in (
    ("./feature_names_counts_test.npy", feature_names_counts_test),
    ("./feature_names_tfidf_test.npy", feature_names_tfidf_test),
):
    np.save(npy_path, names)

In [314]:
# Training labels go to a .npy file; the (Text, Category) columns go to a CSV without the index
y_train = df_training["Category"].to_list()
np.save("./y_train.npy", y_train)
df_training.loc[:, ["Text", "Category"]].to_csv(
    "./text_and_categories_train.csv", index=False
)

In [315]:
# Testing labels go to a .npy file; the (Text, Category) columns go to a CSV without the index
y_test = df_testing["Category"].to_list()
np.save("./y_test.npy", y_test)
df_testing.loc[:, ["Text", "Category"]].to_csv(
    "./text_and_categories_test.csv", index=False
)