In [None]:
import numpy as np
import pandas as pd

### Train, test and validation datasets

In [None]:
# Test split: features and labels are read from two separate CSV files.
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')

In [None]:
# Training split: features and labels are read from two separate CSV files.
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')

In [None]:
# Validation split: features and labels are read from two separate CSV files.
X_val = pd.read_csv('X_val.csv')
y_val = pd.read_csv('y_val.csv')

In [None]:
# Stack the three splits so the text preprocessing runs once over every row.
# Row order matters: the later re-split slices by position and expects
# test rows first, then train, then validation.
# ignore_index=True gives each row a unique label; without it the labels
# read from each CSV repeat across the three frames.
X = pd.concat([X_test, X_train, X_val], ignore_index=True)

### Text preprocessing

In [None]:
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim.models import Word2Vec
import string

In [None]:
from pymystem3 import Mystem

In [None]:
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer, PorterStemmer

In [None]:
# Download the NLTK data packages named below.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
def convert_to_lowercase(text):
    """Return ``text`` with all cased characters converted to lower case."""
    lowered = text.lower()
    return lowered

# function to remove punctuations from the text
def remove_punctuations(text):
    """Delete punctuation characters and turn line breaks into spaces.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without the listed punctuation characters (ASCII punctuation
        plus the « » quotes). Each line break becomes a single space so the
        words on either side of it stay separate tokens.
    """
    # '!' is part of the class and the hyphen is escaped so it is read as a
    # literal character rather than as a range.
    text = re.sub(r"""[\[\]«»!#$%&\"'()*+,\-./:;<=>?@\\^_`{|}~]""", '', text)
    return text.replace('\n', ' ')
    
# Stop-word sets already loaded in this kernel, keyed by language name.
_STOPWORD_SETS = {}


def _load_stopwords(language):
    """Return the NLTK stop-word set for ``language``, building it only once."""
    if language not in _STOPWORD_SETS:
        from nltk.corpus import stopwords
        _STOPWORD_SETS[language] = set(stopwords.words(language))
    return _STOPWORD_SETS[language]


# function to remove stopwords from the text
def remove_stopwords(text, language='russian'):
    """Drop stop words from ``text``.

    Args:
        text: Input value; it is converted with ``str`` and split on whitespace.
        language: Name of the NLTK stop-word list to use. Defaults to
            ``'russian'``, which is what the function always used before.

    Returns:
        The remaining words joined by single spaces.
    """
    stop_set = _load_stopwords(language)
    return " ".join(word for word in str(text).split() if word not in stop_set)
    
# function to remove repeating characters
def remove_repeating_characters(text):
    """Collapse each run of one repeated character into a single occurrence.

    Args:
        text: The string to clean.

    Returns:
        ``text`` in which any character repeated back-to-back appears once.
        Note that this also shortens legitimate double letters.
    """
    # \1 is a back-reference to the captured character; without the backslash
    # the pattern would match a literal digit "1" instead.
    return re.sub(r'(.)\1+', r'\1', text)

# function to remove numeric text
def remove_numeric(text):
    """Return ``text`` with every run of ASCII digits deleted."""
    digit_run = re.compile('[0-9]+')
    return digit_run.sub('', text)

def remove_non_russian_words(text):
    """Keep only whole words written entirely in Russian Cyrillic letters.

    The kept words are returned joined by single spaces, in their original order.
    """
    return ' '.join(re.findall(r'\b[а-яА-ЯёЁ]+\b', text))

# Shared Mystem analyzer, created on first use.
_MYSTEM = None


# lemmatizing the text. i.e, Converting some of the words to their root form.
def text_lematization(text):
    """Lemmatize ``text`` with Mystem and return the lemmas as a list.

    One Mystem instance is created on the first call and reused afterwards,
    instead of constructing a new analyzer for every text.

    Args:
        text: The string to lemmatize.

    Returns:
        List of tokens produced by ``Mystem.lemmatize``, with empty and
        whitespace-only tokens (word separators, line breaks) removed.
    """
    global _MYSTEM
    if _MYSTEM is None:
        _MYSTEM = Mystem()
    # str.strip() is falsy for '', ' ', '\n' and similar separator tokens.
    return [token for token in _MYSTEM.lemmatize(text) if token.strip()]

In [None]:
# X is a DataFrame, so X.apply(...) would hand each whole column to the helper,
# which expects one string. Map the helper over the text column instead.
X = X.assign(message_txt=X['message_txt'].apply(convert_to_lowercase))

In [None]:
# X is a DataFrame, so X.apply(...) would hand each whole column to the helper,
# which expects one string. Map the helper over the text column instead.
X = X.assign(message_txt=X['message_txt'].apply(remove_punctuations))

In [None]:
# X is a DataFrame, so X.apply(...) would hand each whole column to the helper,
# which expects one text. Map the helper over the text column instead.
X = X.assign(message_txt=X['message_txt'].apply(remove_stopwords))

In [None]:
# X is a DataFrame, so X.apply(...) would hand each whole column to the helper,
# which expects one string. Map the helper over the text column instead.
X = X.assign(message_txt=X['message_txt'].apply(remove_repeating_characters))

In [None]:
# X is a DataFrame, so X.apply(...) would hand each whole column to the helper,
# which expects one string. Map the helper over the text column instead.
X = X.assign(message_txt=X['message_txt'].apply(remove_numeric))

In [None]:
# X is a DataFrame, so X.apply(...) would hand each whole column to the helper,
# which expects one string. Map the helper over the text column instead.
X = X.assign(message_txt=X['message_txt'].apply(remove_non_russian_words))

In [None]:
# X is a DataFrame, so X.apply(...) would hand each whole column to the helper,
# which expects one string. Map the helper over the text column instead.
# After this step every cell of message_txt holds a list of lemmas.
X = X.assign(message_txt=X['message_txt'].apply(text_lematization))

In [None]:
# Undo the concat by position: rows were stacked as test, train, validation.
# The sizes are read before the names are rebound.
n_test, n_train = len(X_test), len(X_train)

X_test = X.iloc[:n_test]
X_train = X.iloc[n_test:n_test + n_train]
# Validation rows start after BOTH earlier splits, not at offset len(X_val).
X_val = X.iloc[n_test + n_train:]

### TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer created with its default settings.
vectorizer = TfidfVectorizer()

In [None]:
def _as_documents(frame):
    """Return one plain string per row of the ``message_txt`` column.

    Cells that hold a list of tokens are joined with single spaces; cells that
    are already strings are passed through unchanged.
    """
    return [doc if isinstance(doc, str) else ' '.join(doc) for doc in frame['message_txt']]


# The vectorizer iterates over what it is given; iterating a DataFrame yields
# its column labels, so pass the documents themselves. The vocabulary is
# learned from the training split only and reused for validation and test.
train = vectorizer.fit_transform(_as_documents(X_train))
val = vectorizer.transform(_as_documents(X_val))
test = vectorizer.transform(_as_documents(X_test))

In [None]:
# Terms learned by the fitted TF-IDF vectorizer.
feature_names = vectorizer.get_feature_names_out()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Training the model
# Logistic-regression baseline with default settings, fitted on the TF-IDF training matrix.
model = LogisticRegression()
model.fit(train, y_train)

In [None]:
from sklearn.metrics import classification_report

# Score the logistic-regression baseline on the TF-IDF test matrix.
y_pred = model.predict(test)
print(classification_report(y_test, y_pred))

### Simple XGBoost classifier

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

In [None]:
# XGBoost baseline with fixed hyper-parameters, fitted on the TF-IDF training matrix.
xgb_model = XGBClassifier(n_estimators = 500, max_depth = 5, learning_rate = 0.05, eval_metric='logloss')
xgb_model.fit(train, y_train)

**XGBoost prediction**

In [None]:
# Predict on the TF-IDF test matrix with the XGBoost baseline.
y_pred = xgb_model.predict(test)

# Overall accuracy, printed with two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1.
print(classification_report(y_test, y_pred))

### XGBoost optimization

In [None]:
import optuna

In [None]:
from sklearn.metrics import f1_score

In [None]:
def objective(trial):
    """Optuna objective for XGBoost on the TF-IDF features.

    Samples a learning rate, a tree depth and a number of estimators from
    ``trial``, fits an ``XGBClassifier`` on ``train`` / ``y_train`` and returns
    the F1 score of its predictions on ``val`` against ``y_val``.
    """
    # Sampled in the same order as before: learning_rate, max_depth, n_estimators.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1),
        max_depth=trial.suggest_int('max_depth', 3, 7),
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
    )

    # Fixed settings shared by every trial.
    candidate = XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        num_class=1,
        **search_space,
    )
    candidate.fit(train, y_train)

    # Score on the validation split.
    return f1_score(y_val, candidate.predict(val))


In [None]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials=100)  # the number of trials can be changed

print("Best hyperparameters: ", study.best_params)
# The objective returns an F1 score, so the best value is an F1 score, not an accuracy.
print("Best F1 score: ", study.best_value)

In [None]:
# Refit XGBoost with the best hyper-parameters found by the study.
model_optimized = XGBClassifier(**study.best_params)
model_optimized.fit(train, y_train)

# Predict and evaluate
y_pred = model_optimized.predict(test)
# The F1 score is stored in f1score; this cell does not print it.
f1score = f1_score(y_test, y_pred)

In [None]:
# Per-class report for the most recent y_pred.
print(classification_report(y_test, y_pred))

### Simple CatBoost classifier

In [None]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost baseline with fixed hyper-parameters.
# This rebinds `model`, which previously held the logistic-regression baseline.
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=3,
    eval_metric='Logloss',
    verbose=100
)

# Fit on the TF-IDF training matrix.
model.fit(train, y_train)

**CatBoost Prediction**

In [None]:
# Predict on the TF-IDF test matrix with the CatBoost baseline.
y_pred = model.predict(test)

# Overall accuracy, printed with two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1.
print(classification_report(y_test, y_pred))

### Dimensionality reduction of feature space

In [None]:
# Terms learned by the fitted TF-IDF vectorizer (same call as in the TF-IDF section).
feature_names = vectorizer.get_feature_names_out()

In [None]:
# Feature importances reported by the tuned XGBoost model, paired with the TF-IDF terms.
importance = model_optimized.feature_importances_
feature_importance = dict(zip(feature_names, importance))

# (term, importance) pairs ordered from most to least important.
sorted_features = sorted(
    feature_importance.items(), 
    key=lambda x: x[1], 
    reverse=True
)

In [None]:
# Terms ordered from most to least important.
top_features = [x[0] for x in sorted_features]
# NOTE(review): hard-coded cut-off; consider deriving it from the importances.
n_features = 2154

# `train` is the matrix returned by the vectorizer and is addressed by column
# position, not by term, so translate the chosen terms to their column indices.
column_of = {name: idx for idx, name in enumerate(feature_names)}
top_columns = [column_of[name] for name in top_features[:n_features]]

X_top = train[:, top_columns]

In [None]:
# Imported here because the notebook does not import it anywhere else.
from sklearn.model_selection import train_test_split

# X_top is a column subset of `train`, so its rows are labelled by y_train.
X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(
    X_top, y_train, test_size=0.2, random_state=42
)

# XGBClassifier is the name this notebook imports from xgboost.
optimized_model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42
)
optimized_model.fit(X_train_new, y_train_new)

In [None]:
# Same XGBoost configuration as the baseline earlier in the notebook.
# NOTE(review): this fits on the full `train` matrix, not on the reduced X_top — confirm that is intended.
xgb_model = XGBClassifier(n_estimators = 500, max_depth = 5, learning_rate = 0.05, eval_metric='logloss')

xgb_model.fit(train, y_train)

In [None]:
# Predict on the TF-IDF test matrix with the refitted XGBoost model.
y_pred = xgb_model.predict(test)

# Overall accuracy, printed with two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1.
print(classification_report(y_test, y_pred))

### Word2Vec

In [None]:
from gensim.models import Word2Vec

# Model parameters:
vector_size = 100  # dimensionality of a word vector
window = 5         # context window size
min_count = 20      # ignore words with frequency < 20

# Word2Vec is trained on the message_txt column of the full frame X.
# This rebinds `model`, which previously held the CatBoost baseline.
model = Word2Vec(
    sentences=X.message_txt,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    workers=4       # number of CPU cores
)

# Save the model
model.save("word2vec.model")

In [None]:
def text_to_vector(tokens):
    """Average the Word2Vec vectors of the tokens known to ``model``.

    Tokens missing from ``model.wv`` are skipped. When no token is known, a
    zero vector of length ``model.vector_size`` is returned instead.
    """
    known = [model.wv[word] for word in tokens if word in model.wv]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# One averaged embedding per row of X.
tmp = np.array([text_to_vector(tokens) for tokens in X.message_txt])

# The cells below fit and predict on X_train / X_val / X_test, so those names
# must hold the embeddings. X was stacked as test, train, validation, and the
# current X_test / X_train still have the original split sizes.
n_test, n_train = len(X_test), len(X_train)
X_test = tmp[:n_test]
X_train = tmp[n_test:n_test + n_train]
X_val = tmp[n_test + n_train:]

In [None]:
# Fit the existing xgb_model again, this time on X_train instead of the TF-IDF matrix.
xgb_model.fit(X_train, y_train)

In [None]:
# Predict on X_test with the model fitted in the previous cell.
y_pred = xgb_model.predict(X_test)

# Overall accuracy, printed with two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1.
print(classification_report(y_test, y_pred))

### Word2Vec x XGBoost optimization

In [None]:
import optuna

In [None]:
from sklearn.metrics import f1_score

In [None]:
def objective(trial):
    """Optuna objective for XGBoost on the features held in ``X_train`` / ``X_val``.

    Samples a learning rate, a tree depth and a number of estimators from
    ``trial``, fits an ``XGBClassifier`` on ``X_train`` / ``y_train`` and
    returns the F1 score of its predictions on ``X_val`` against ``y_val``.
    """
    # Sampled in the same order as before: learning_rate, max_depth, n_estimators.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.1),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        n_estimators=trial.suggest_int('n_estimators', 100, 5000),
    )

    # Fixed settings shared by every trial.
    candidate = XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        num_class=1,
        **search_space,
    )
    candidate.fit(X_train, y_train)

    # Score on the validation split.
    return f1_score(y_val, candidate.predict(X_val))


In [None]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials=100)  # the number of trials can be changed

print("Best hyperparameters: ", study.best_params)
# The objective returns an F1 score, so the best value is an F1 score, not an accuracy.
print("Best F1 score: ", study.best_value)

### Optimized Word2Vec x XGBoost 

In [None]:
xgb_opt = XGBClassifier(**study.best_params)
xgb_opt.fit(train, y_train)

In [None]:
y_pred = xgb_opt.predict(test)

print(classification_report(y_test, y_pred))

### Simple Catboost Classifier x TF-IDF

In [None]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost with the same fixed hyper-parameters as the earlier CatBoost baseline,
# kept under its own name.
model_ctb = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=3,
    eval_metric='Logloss',
    verbose=100
)

# Fit on the TF-IDF training matrix.
model_ctb.fit(train, y_train)

In [None]:
# Predict on the TF-IDF test matrix.
y_pred = model_ctb.predict(test)

# Overall accuracy, printed with two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1.
print(classification_report(y_test, y_pred))

### Catboost optimization x TF-IDF

In [None]:
def objective(trial):
    """Optuna objective for CatBoost on the TF-IDF features.

    Samples the number of iterations, the tree depth and a log-scaled learning
    rate from ``trial``, fits a ``CatBoostClassifier`` on ``train`` /
    ``y_train`` and returns the F1 score of its predictions on ``val``
    against ``y_val``.
    """
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        # suggest_loguniform is deprecated; suggest_float(..., log=True) samples
        # the same log-uniform range and matches the other objectives here.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'verbose': False
    }

    # Build and train the model
    model = CatBoostClassifier(**params)
    model.fit(train, y_train)

    # Predict and evaluate
    y_pred = model.predict(val)
    return f1_score(y_val, y_pred)


In [None]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials=100)  # the number of trials can be changed

print("Best hyperparameters: ", study.best_params)
# The objective returns an F1 score, so the best value is an F1 score, not an accuracy.
print("Best F1 score: ", study.best_value)

In [None]:
# Refit CatBoost on the TF-IDF training matrix with the best hyper-parameters from the study.
ctb_opt = CatBoostClassifier(**study.best_params)
ctb_opt.fit(train, y_train)

In [None]:
# Predict on the TF-IDF test matrix and print the per-class report.
y_pred = ctb_opt.predict(test)

print(classification_report(y_test, y_pred))

### CatBoost x Word2Vec

In [None]:
# CatBoost with the same fixed hyper-parameters as the TF-IDF variant above.
model_ctb_w2v = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=3,
    eval_metric='Logloss',
    verbose=100
)

# Fit on X_train instead of the TF-IDF matrix.
model_ctb_w2v.fit(X_train, y_train)

In [None]:
# Predict on X_test and print the per-class report.
y_pred = model_ctb_w2v.predict(X_test)

print(classification_report(y_test, y_pred))

### CatBoost x Word2Vec optimized

In [None]:
def objective(trial):
    """Optuna objective for CatBoost on the features held in ``X_train`` / ``X_val``.

    Samples the number of iterations, the tree depth and a log-scaled learning
    rate from ``trial``, fits a ``CatBoostClassifier`` on ``X_train`` /
    ``y_train`` and returns the F1 score of its predictions on ``X_val``
    against ``y_val``.
    """
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        # suggest_loguniform is deprecated; suggest_float(..., log=True) samples
        # the same log-uniform range and matches the other objectives here.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        # Silent, like the TF-IDF CatBoost objective: per-iteration logs from
        # every trial would bury the study output.
        'verbose': False
    }

    # Build and train the model
    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_val)
    return f1_score(y_val, y_pred)

In [None]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials=100)  # the number of trials can be changed

print("Best hyperparameters: ", study.best_params)
# The objective returns an F1 score, so the best value is an F1 score, not an accuracy.
print("Best F1 score: ", study.best_value)

In [None]:
# Refit CatBoost on X_train with the best hyper-parameters from the study.
ctb_opt_w2v = CatBoostClassifier(**study.best_params)
ctb_opt_w2v.fit(X_train, y_train)

In [None]:
# Predict on X_test and print the per-class report.
y_pred = ctb_opt_w2v.predict(X_test)

print(classification_report(y_test, y_pred))