## Import necessary modules

In [6]:
!pip install --upgrade pip
!pip install gensim
!pip install pandas==2.2.2
!pip install tqdm
!pip install scipy==1.10

Collecting scipy==1.10
  Downloading scipy-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.9/58.9 kB[0m [31m971.9 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Downloading scipy-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.1/34.1 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.11.1
    Uninstalling scipy-1.11.1:
      Successfully uninstalled scipy-1.11.1
Successfully installed scipy-1.10.0


In [1]:
from pathlib import Path

import numpy as np

import gensim.downloader as api
from gensim.models import KeyedVectors

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC

from sklearn.metrics import classification_report

from tqdm.notebook import tqdm

from utils import DatasetReader

## Load embeddings

In [2]:
# Load the word2vec vectors from the local cache. On a fresh checkout the cache
# does not exist yet, so the vectors are downloaded once through gensim's
# downloader (a large download) and saved for subsequent runs.
weights_path = Path('weights/word2vec.kv')

embeddings: KeyedVectors
if weights_path.exists():
    embeddings = KeyedVectors.load(str(weights_path))
else:
    embeddings = api.load('word2vec-google-news-300')
    # The target directory may be missing on a clean clone.
    weights_path.parent.mkdir(parents=True, exist_ok=True)
    embeddings.save(str(weights_path))

## Load dataset

In [3]:
# Read the cleaned questions/topics table through the project's DatasetReader
# and show it as the cell output.
dataset_path = '../data/csv/clean_data.csv'
reader = DatasetReader()
data = reader.read_from_file(dataset_path)
data

Unnamed: 0,Questions,Topic
0,define the term brand,marketing_mix_and_strategy
1,explain one risk jack ma may have taken when s...,entreprenuers_and_leaders
2,analyse two factors that may have increased de...,market
3,discuss if profit maximisation is the main bus...,entreprenuers_and_leaders
4,assess the advantages of a paternalistic style...,managing_people
...,...,...
304,evaluate the likely value of each of the follo...,meeting_customer_needs
307,evaluate the likely value to mike watson of us...,meeting_customer_needs
310,briefly explain two reasons why levi roots con...,meeting_customer_needs
313,assess the likely implications for reggae regg...,meeting_customer_needs


## Data preprocessing

In [4]:
# Convert the table to a 2-column array: column 0 holds the question text and
# column 1 the topic label. The columns are selected by name because later
# cells index `vectors` positionally, so the order must not depend on how the
# file happened to be laid out.
vectors = data[['Questions', 'Topic']].to_numpy()
vectors.shape

(264, 2)

In [7]:
# Fit a LabelEncoder on the topic column, then pair each question with the
# integer id of its topic. The first pair is displayed as a sanity check.
questions, topics = vectors[:, 0], vectors[:, 1]

encoder = LabelEncoder()
encoder.fit(topics)
topic_ids = encoder.transform(topics)

encoded = np.stack([questions, topic_ids], axis=1)
encoded[0]

array(['define the term brand', 3], dtype=object)

In [8]:
def process_sentence(sentence, embedder):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    sentence : str
        Text to embed. It is split on any run of whitespace.
    embedder
        Word-vector lookup supporting ``word in embedder``,
        ``embedder.get_vector(word)`` and ``embedder.vector_size``
        (for example a gensim ``KeyedVectors``).

    Returns
    -------
    numpy.ndarray
        1-D array holding the element-wise mean of the vectors of all words
        found in ``embedder``. If no word is found (including an empty
        sentence) a zero vector of length ``embedder.vector_size`` is returned.
    """
    known_vectors = [
        embedder.get_vector(word)
        for word in sentence.split()
        if word in embedder
    ]

    if not known_vectors:
        # Averaging an empty array yields a scalar NaN (and a RuntimeWarning),
        # which would make the stacked feature matrix ragged and poison
        # training. Fall back to a neutral all-zero embedding instead.
        return np.zeros(embedder.vector_size, dtype=np.float32)

    return np.array(known_vectors).mean(axis=0)

In [9]:
# Build the feature matrix (one sentence embedding per question) and the
# matching vector of encoded topic ids, then display both shapes.
X = np.array([process_sentence(question, embeddings) for question, _ in encoded])
y = np.array([label for _, label in encoded])

X.shape, y.shape

((264, 300), (264,))

## Splitting the data

In [10]:
# Hold out 20% of the samples for evaluation. The split is stratified on y and
# uses a fixed random_state so it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((211, 300), (53, 300), (211,), (53,))

## Training the model

In [11]:
# Fit a support-vector classifier with scikit-learn's default hyperparameters
# on the training partition; the fitted estimator is shown as the cell output.
svc = SVC().fit(X_train, y_train)
svc

## Evaluating the model

In [12]:
# Predict on the held-out partition and print the per-class
# precision / recall / F1 report.
y_pred = svc.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.50      0.67         8
           1       0.86      0.86      0.86         7
           2       1.00      0.75      0.86         8
           3       0.67      0.86      0.75        14
           4       0.61      0.69      0.65        16

    accuracy                           0.74        53
   macro avg       0.83      0.73      0.76        53
weighted avg       0.78      0.74      0.74        53

