In [None]:
import os
import re
import pandas as pd
import numpy as np
import sklearn.model_selection
from gensim.models import Word2Vec

# Seed shared by the stochastic steps of this notebook: it is passed to
# DataFrame.sample (row shuffle) and to train_test_split.
random_state = 42

In [None]:
# Location of the training JSON, resolved relative to the current working directory.
train_path = os.path.join(os.getcwd(), '../input/nlp-itmo-exercise-1/archive/train_5.json')

# Load it into a frame and show the frame as the cell output.
df = pd.read_json(train_path)
df

In [None]:
# Iterating a DataFrame yields its column names; each column name is used as a
# class label and gets its positional index as the integer id.
labels_to_id = {label: idx for idx, label in enumerate(df.columns)}

# Reverse lookup (integer id -> label), used to turn predictions back into names.
id_to_labels = {idx: label for label, idx in labels_to_id.items()}

In [None]:
def clean_text(text):
    """Normalise a raw document so it can be tokenised by splitting on spaces.

    Steps: lower-case the text, replace every character that is not a
    Cyrillic letter, a Latin letter or a digit with a space, collapse runs of
    whitespace into a single space, and strip leading/trailing whitespace.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    s = text.lower()
    # The range а-я does not contain "ё"/"Ё", so they are listed explicitly;
    # otherwise words such as "ещё" would be cut at that letter.
    s = re.sub(r"[^а-яА-ЯёЁa-zA-Z0-9]", " ", s)
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    s = re.sub(r"\s+", " ", s)
    return s.strip()

In [None]:
# Flatten the per-label text lists into two parallel lists:
# texts[i] is a cleaned document and y[i] is the integer id of its label.
y = []
texts = []
for label, label_id in labels_to_id.items():
    for raw_text in df[label]['texts']:
        texts.append(clean_text(raw_text))
        y.append(label_id)

In [None]:
# One row per document (cleaned text + integer label id), with the rows
# shuffled using the notebook-wide seed.
ddf = pd.DataFrame({'texts': texts, 'labels': y}).sample(frac=1, random_state=random_state)

In [None]:
# Word2Vec expects a list of token lists; tokens are the space-separated words
# of each cleaned document.
tokenized_texts = [r.split() for r in ddf['texts']]

# min_count=1 keeps every word seen in the corpus in the vocabulary.
# NOTE(review): per the gensim documentation, training with more than one
# worker thread is not exactly reproducible between runs - confirm whether
# that matters here and use workers=1 if it does.
w2v_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=100,
    window=5,
    min_count=1,
    workers=os.cpu_count(),
)

In [None]:
features = []

for r in texts:
    vectors = []
    
    for w in r:
        if w in w2v_model.wv:
            v = w2v_model.wv[w]
            vectors.append(v)
            
    vectors = np.array(vectors)
    feature = np.average(vectors, axis=0)
    features.append(feature)

features = np.array(features)

In [None]:
labels = list(ddf['labels'])

In [None]:
train_features, val_features, train_labels, val_labels = sklearn.model_selection.train_test_split(features, labels, test_size=0.2, stratify=labels, random_state=random_state)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
svm_cls = SVC(kernel='rbf', gamma=0.5, C=0.1)
svm_cls.fit(train_features, train_labels)

In [None]:
preds = svm_cls.predict(val_features)
accuracy_score(val_labels, preds)

**Submission**

In [None]:
test_df = pd.read_csv(os.path.join(os.getcwd(), '../input/nlp-itmo-exercise-1/archive/test.csv'))

In [None]:
test_texts = [clean_text(text) for text in test_df['text']]

test_features = []
for r in test_texts:
    vectors = []
    
    for w in r:
        if w in w2v_model.wv:
            v = w2v_model.wv[w]
            vectors.append(v)
            
    vectors = np.array(vectors)
    feature = np.average(vectors, axis=0)
    test_features.append(feature)

test_features = np.array(test_features)

In [None]:
# Predicted integer label id for every row of the test feature matrix.
test_preds = svm_cls.predict(X=test_features)

In [None]:
# Submission ids are the 0-based positions of the predictions.
# NOTE(review): assumes the expected Id equals the row position in test.csv - confirm.
ids = list(range(len(test_preds)))

# Map each predicted id back to its label name with all whitespace removed.
test_labels = [''.join(id_to_labels[pred].split()) for pred in test_preds]

In [None]:
# Two-column submission table, written to the working directory without the index.
submission = pd.DataFrame({'Id': ids, 'Category': test_labels})
submission.to_csv('submission.csv', index=False)