## Import necessary modules

In [1]:
!pip install --upgrade pip
!pip install gensim
!pip install pandas==2.2.2
!pip install tqdm
!pip install scipy==1.10



In [2]:
from pathlib import Path
from time import perf_counter
from time import time

import numpy as np

import gensim.downloader as api
from gensim.models import KeyedVectors

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

from utils import DatasetReader

## Load embeddings

In [3]:
# The pretrained word2vec vectors are cached on disk so that the download
# only happens the first time the notebook runs on a machine.
EMBEDDINGS_PATH = Path('weights/word2vec.kv')

embeddings: KeyedVectors
if EMBEDDINGS_PATH.exists():
    embeddings = KeyedVectors.load(str(EMBEDDINGS_PATH))
else:
    # First run: fetch the vectors through the gensim downloader and store
    # them next to the notebook for later sessions.
    embeddings = api.load('word2vec-google-news-300')
    EMBEDDINGS_PATH.parent.mkdir(parents=True, exist_ok=True)
    embeddings.save(str(EMBEDDINGS_PATH))

## Load and split dataset

In [4]:
# Training questions are read from the augmentation directory, evaluation
# questions from the csv directory.
TRAIN_DIR = '../data/augmentation'
TEST_DIR = '../data/csv'

# encode_labels=True: the `Topic` column is displayed as integer ids below.
# NOTE(review): assumes both read_dirs calls map topics to the same integer
# ids — confirm in utils.DatasetReader.
reader = DatasetReader(encode_labels=True)
train, test = reader.read_dirs(TRAIN_DIR), reader.read_dirs(TEST_DIR)

train

Unnamed: 0,Questions,Topic
0,"what is the essence of a brand, and how does i...",3
1,what is the concept of a brand that distinguis...,3
2,"what is meant by a brand, and how does it dist...",3
3,"what is a brand, and how does it differ from a...",3
4,what was a crucial gamble jack ma took in esta...,0
...,...,...
2247,true or false: do entrepreneurial motivations ...,0
2248,how do the approaches of a long-term planner v...,0
2249,what distinguishes an economist's long-term ap...,0
2250,what are the key distinctions between individu...,0


In [5]:
# Pull the question text (inputs) and the topic ids (targets) out of each
# frame as NumPy arrays. No splitting happens here: the train/test separation
# already comes from the two source directories.
X_train = np.array(train['Questions'])
y_train = np.array(train['Topic'])
X_test = np.array(test['Questions'])
y_test = np.array(test['Topic'])

## Data preprocessing

In [6]:
def process_sentence(sentence, embedder):
    """Embed a sentence as the mean of the vectors of its known words.

    The sentence is split on single spaces; tokens that are not present in
    ``embedder`` are skipped.

    Args:
        sentence: Text to embed.
        embedder: Object supporting ``token in embedder``,
            ``embedder.get_vector(token)`` and ``embedder.vector_size``
            (e.g. a gensim ``KeyedVectors``).

    Returns:
        A 1-D array of length ``embedder.vector_size``. If none of the tokens
        is known to the embedder, a zero vector is returned, so the result
        always has the same shape and can be stacked into a matrix.
    """
    vectors = [embedder.get_vector(w) for w in sentence.split(' ') if w in embedder]
    if not vectors:
        # Averaging an empty list would produce a scalar NaN (plus a
        # RuntimeWarning) and break stacking into an (n, dim) matrix.
        return np.zeros(embedder.vector_size, dtype=np.float32)
    return np.array(vectors).mean(axis=0)

In [7]:
# Turn every question into one fixed-size vector (mean of its word vectors),
# then stack the vectors into one matrix per dataset: one row per question.
train_vectors = [process_sentence(question, embeddings) for question in X_train]
test_vectors = [process_sentence(question, embeddings) for question in X_test]

X_train_preprocessed = np.array(train_vectors)
X_test_preprocessed = np.array(test_vectors)

X_train_preprocessed.shape, X_test_preprocessed.shape

((2252, 300), (563, 300))

## Training the model

In [8]:
# Fit a support vector classifier, constructed with its default
# hyperparameters, on the averaged sentence embeddings.
svc = SVC()
svc.fit(X=X_train_preprocessed, y=y_train)

## Evaluating the model

In [9]:
# Predict topics for the test questions and print the per-class report.
y_pred = svc.predict(X_test_preprocessed)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.84      0.63       103
           1       0.98      0.55      0.71        98
           2       0.97      0.70      0.81       103
           3       0.77      0.62      0.69       133
           4       0.63      0.79      0.70       126

    accuracy                           0.70       563
   macro avg       0.77      0.70      0.71       563
weighted avg       0.77      0.70      0.71       563



## Summary

### Inference time

In [10]:
# Build a larger benchmark set by repeatedly doubling the embedded train+test
# samples: N_DOUBLINGS doublings give 2**N_DOUBLINGS copies of every row.
N_DOUBLINGS = 4

all_data = np.concatenate((X_train_preprocessed, X_test_preprocessed), axis=0)
for _ in range(N_DOUBLINGS):
    all_data = np.concatenate((all_data, all_data), axis=0)

print('N samples:\t\t', all_data.shape)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time() follows the system clock, which can be adjusted mid-run.
start = perf_counter()
svc.predict(all_data)
end = perf_counter()
elapsed = end - start

print(f'Inference time: {elapsed / all_data.shape[0] * 1000:0.9f} [ms/sample]')
print(f'Inference time: {all_data.shape[0] / elapsed:9.4f} [samples/sec]')

N samples:		 (45040, 300)
Inference time: 0.268102391 [ms/sample]
Inference time: 3729.9182 [samples/sec]


### Metrics

In [11]:
# Repeat the test-set classification report for the summary section.
summary_predictions = svc.predict(X_test_preprocessed)
summary_report = classification_report(y_true=y_test, y_pred=summary_predictions)
print("Classification Report:\n", summary_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.84      0.63       103
           1       0.98      0.55      0.71        98
           2       0.97      0.70      0.81       103
           3       0.77      0.62      0.69       133
           4       0.63      0.79      0.70       126

    accuracy                           0.70       563
   macro avg       0.77      0.70      0.71       563
weighted avg       0.77      0.70      0.71       563

