## Import necessary modules

In [6]:
!pip install --upgrade pip
!pip install gensim
!pip install pandas==2.2.2
!pip install tqdm
!pip install scipy==1.10

Collecting scipy==1.10
  Downloading scipy-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.9/58.9 kB[0m [31m971.9 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Downloading scipy-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.1/34.1 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.11.1
    Uninstalling scipy-1.11.1:
      Successfully uninstalled scipy-1.11.1
Successfully installed scipy-1.10.0


In [1]:
from pathlib import Path

import numpy as np

import gensim.downloader as api
from gensim.models import KeyedVectors

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC

from sklearn.metrics import classification_report

from tqdm.notebook import tqdm

from utils import DatasetReader

## Load embeddings

In [3]:
# Keep a local copy of the embeddings in gensim's native KeyedVectors format so
# that re-running the notebook loads it from disk instead of going through the
# downloader and re-saving the model every time.
EMBEDDINGS_PATH = Path('weights/word2vec.kv')

if EMBEDDINGS_PATH.exists():
    embeddings = KeyedVectors.load(str(EMBEDDINGS_PATH))
else:
    embeddings = api.load('word2vec-google-news-300')
    # Make sure the target directory exists before saving (fresh checkouts may lack it).
    EMBEDDINGS_PATH.parent.mkdir(parents=True, exist_ok=True)
    embeddings.save(str(EMBEDDINGS_PATH))

In this notebook, we utilize Word2Vec embeddings from Gensim's `word2vec-google-news-300`.

## Load dataset

In [6]:
# Load the question/topic dataset from ../data.
# NOTE(review): DatasetReader lives in the project-local `utils` module (not shown here);
# read_dirs presumably merges every dataset file found under the directory — confirm.
reader = DatasetReader()
data = reader.read_dirs('../data')
# Bare last expression so the notebook renders the loaded table.
data

Unnamed: 0,Questions,Topic
0,define the term brand,marketing mix and strategy
1,explain one risk jack ma may have taken when s...,entreprenuers and leaders
2,analyse two factors that may have increased de...,market
3,discuss if profit maximisation is the main bus...,entreprenuers and leaders
4,assess the advantages of a paternalistic style...,managing people
...,...,...
2810,true or false: do entrepreneurial motivations ...,entreprenuers and leaders
2811,how do the approaches of a long-term planner v...,entreprenuers and leaders
2812,what distinguishes an economist's long-term ap...,entreprenuers and leaders
2813,what are the key distinctions between individu...,entreprenuers and leaders


## Data preprocessing

In [7]:
# Convert the table to a NumPy array: column 0 holds the question text, column 1 the topic.
vectors = data.to_numpy()
# Display (n_samples, n_columns) as a quick sanity check.
vectors.shape

(2815, 2)

In [8]:
# Split the raw array into its two columns: question text and topic name.
question_col, topic_col = vectors[:, 0], vectors[:, 1]

# Learn the topic-name -> integer-id mapping from the topic column.
encoder = LabelEncoder().fit(topic_col)

# Pair every question with its integer topic id (object array of shape (n, 2)).
encoded = np.stack([question_col, encoder.transform(topic_col)], axis=1)
encoded[0]

array(['define the term brand', 3], dtype=object)

In [9]:
def process_sentence(sentence, embedder):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Args:
        sentence: Text to embed; it is tokenized on whitespace.
        embedder: Word-vector lookup supporting ``word in embedder``,
            ``embedder.get_vector(word)`` and ``embedder.vector_size``
            (e.g. a gensim ``KeyedVectors`` instance).

    Returns:
        A 1-D array of length ``embedder.vector_size``. If none of the words
        are in the vocabulary (including an empty sentence), a zero vector is
        returned instead of NaN so that the results can always be stacked
        into a homogeneous feature matrix.
    """
    # split() with no argument handles tabs/newlines and runs of spaces.
    found = [embedder.get_vector(w) for w in sentence.split() if w in embedder]
    if not found:
        # Averaging an empty list would yield a scalar NaN (plus a RuntimeWarning).
        # float32 matches the dtype gensim uses for its stored vectors.
        return np.zeros(embedder.vector_size, dtype=np.float32)
    return np.array(found).mean(axis=0)

In [10]:
# Each row of `encoded` is (question text, integer topic id):
# embed the question for the feature matrix and keep the id as the target.
X = np.array([process_sentence(question, embeddings) for question, _ in encoded])
y = np.array([label for _, label in encoded])

X.shape, y.shape

((2815, 300), (2815,))

## Splitting the data

In [11]:
# Hold out 20% of the samples for testing. Stratifying on the labels keeps the
# topic proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2252, 300), (563, 300), (2252,), (563,))

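The augmented dataset is split 80/20 into training and test sets, stratified by topic so that both sets keep the same class balance.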

## Training the model

In [12]:
# Fit a support vector classifier with scikit-learn's default hyperparameters
# on the training split of sentence embeddings.
svc = SVC()
svc.fit(X_train, y_train)

As the base model, we use a Support Vector Classifier (scikit-learn's `SVC`) with its default hyperparameters.

## Evaluating the model

In [13]:
# Predict topics for the held-out split, then print per-class precision/recall/F1.
y_pred = svc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.83      0.81       103
           1       0.83      0.87      0.85        98
           2       0.88      0.85      0.87       103
           3       0.77      0.83      0.80       133
           4       0.83      0.74      0.78       126

    accuracy                           0.82       563
   macro avg       0.82      0.82      0.82       563
weighted avg       0.82      0.82      0.82       563



The support vector classifier achieved macro- and micro-averaged F1 scores of 82% on Word2Vec embeddings (for single-label classification, micro-averaged F1 equals the reported accuracy). This is a good result, but it falls short of the 87% achieved by the multinomial Naive Bayes model.