In [None]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score


import nltk
nltk.download('punkt')
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Shared text-preprocessing tools used by the cleaning function below.
stemmer = SnowballStemmer(language="english")  # reduces each word to its stem
stop_words = set(stopwords.words("english"))  # filler words such as "and", "a", ... to discard
reg_tok = RegexpTokenizer(pattern=r'[A-Za-z#@\d]+')  # a token is a run of letters, digits, '#' or '@'

In [None]:
# Load the labelled data; missing values in every column are replaced by empty strings.
train_and_val = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train.csv').fillna('')

Разделю выборки

In [None]:
# 80/20 split stratified on the target column, with a fixed seed so the split is repeatable.
# Each part gets a fresh 0-based index so it lines up with the feature frames built later.
train, val = (
    part.reset_index(drop=True)
    for part in train_test_split(
        train_and_val,
        test_size=0.2,
        stratify=train_and_val["target"],
        random_state=42,
    )
)

In [None]:
# Load the unlabelled test data; missing values in every column are replaced by empty strings.
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/test.csv').fillna('')

Приведу текст к удобному виду

In [None]:
def drop_bed_words(text):
  """Clean one text: strip noise characters, drop stop words, stem the rest.

  Args:
    text: raw text string.

  Returns:
    The stemmed non-stop-word tokens joined by single spaces
    (an empty string when nothing survives).
  """
  # Remove everything except letters, '#', '@' and whitespace. Whitespace of
  # any kind (tabs, newlines) is kept so that words on neighbouring lines are
  # not glued together. Punctuation is deleted, not replaced by a space, so
  # "don't" becomes "dont". Digits are removed here, so the tokenizer never
  # sees any.
  cleaned = re.sub(r'[^A-Za-z#@\s]+', '', text)
  tokens = reg_tok.tokenize(cleaned)  # split into tokens
  # The stop-word list is lowercase, so compare case-insensitively; otherwise
  # capitalised stop words ("The", "And") slip through the filter.
  stemmed_words = [stemmer.stem(word) for word in tokens if word.lower() not in stop_words]
  return ' '.join(stemmed_words)

Частота для каждого ключевого слова (локации) относительно таргета

In [None]:
# Despite its name, this is a list of stemmed keyword strings, one per distinct
# keyword in the training split; the per-keyword target means are computed only
# to obtain the group keys and are then discarded.
# NOTE(review): the stemmer receives the whole keyword string, not its
# individual '%20'-separated parts - confirm that is intended.
mean_encoding_keyword = [
    stemmer.stem(keyword)
    for keyword in train.groupby('keyword')['target'].mean().index
]
# Mean target per location; only referenced by disabled code in the cells shown here.
mean_encoding_location = train.groupby('location')['target'].mean().to_dict()

Дальше я подбирал алгоритм обработки и параметры на трейне и валидации

In [None]:
# Fit TF-IDF on the cleaned training texts (min_df=5) and expose the matrix as
# a dense frame with one column per vocabulary term.
vectorizer = TfidfVectorizer(min_df=5)
X_train_words = vectorizer.fit_transform(train['text'].apply(drop_bed_words))
X_train = pd.DataFrame(
    X_train_words.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
# (Mean-encoded KEYWORD / LOCATION feature columns were tried here and are disabled.)

# Raise the weight of every vocabulary term that also occurs in a keyword:
# for weights between 0 and 1, the power 0.61 increases them.
# NOTE(review): a term that occurs in several keyword entries is re-raised once
# per occurrence - confirm that the compounding is intended.
keyword_terms = (term for keyword in mean_encoding_keyword for term in keyword.split('%20'))
for term in keyword_terms:
  if term not in X_train.columns:
    continue
  X_train[term] = X_train[term]**0.61

In [None]:
# Train a ridge classifier with default hyperparameters on the training features.
clf = RidgeClassifier().fit(X_train, train["target"])
clf

In [None]:
# Transform the validation texts with the vectorizer fitted on the training
# split, then apply the same keyword-term power transform as for X_train.
X_val_words = vectorizer.transform(val['text'].apply(drop_bed_words))
X_val = pd.DataFrame(
    X_val_words.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
# (Mean-encoded KEYWORD / LOCATION feature columns were tried here and are disabled.)

keyword_terms = (term for keyword in mean_encoding_keyword for term in keyword.split('%20'))
for term in keyword_terms:
  if term not in X_val.columns:
    continue
  X_val[term] = X_val[term]**0.61

In [None]:
# F1 score of the classifier on the held-out validation split.
f1_score(y_true=val['target'], y_pred=clf.predict(X_val))

0.7670364500792393

Дообучу на всей выборке уже с подобранными параметрами

In [None]:
# Refit the whole pipeline on train + validation with the settings chosen above.
#
# The keywords are stemmed exactly as mean_encoding_keyword was for the
# train/validation run. The TF-IDF column names are stems (the texts go through
# drop_bed_words), so unstemmed keywords would match a different set of columns
# and the final model would not be the one that was validated.
# Like mean_encoding_keyword, this is a list of stemmed keyword strings; the
# per-keyword target means are computed only to obtain the group keys.
full_mean_encoding_keyword = [
    stemmer.stem(word)
    for word in train_and_val.groupby('keyword')['target'].mean().to_dict()
]
full_vectorizer = TfidfVectorizer(min_df=5)

X_words = full_vectorizer.fit_transform(train_and_val['text'].apply(drop_bed_words))
X = pd.DataFrame(X_words.toarray(), columns=full_vectorizer.get_feature_names_out())

# Raise the weight of every vocabulary term that also occurs in a keyword
# (same power transform as in the train/validation run).
for i in full_mean_encoding_keyword:
  for j in i.split('%20'):
    if j in X.columns:
      X[j] = X[j]**0.61
full_clf = RidgeClassifier()
full_clf.fit(X, train_and_val["target"])


Делаю предсказание на тесте

In [None]:
# Transform the test texts with the vectorizer fitted on the full labelled set,
# then apply the same keyword-term power transform as for X.
X_test_words = full_vectorizer.transform(test['text'].apply(drop_bed_words))
X_test = pd.DataFrame(
    X_test_words.toarray(),
    columns=full_vectorizer.get_feature_names_out(),
)
# (Mean-encoded KEYWORD / LOCATION feature columns were tried here and are disabled.)

keyword_terms = (term for keyword in full_mean_encoding_keyword for term in keyword.split('%20'))
for term in keyword_terms:
  if term not in X_test.columns:
    continue
  X_test[term] = X_test[term]**0.61

In [None]:
# Build the submission frame (id + predicted target) and write it without the index column.
test_final = test[['id']].assign(target=full_clf.predict(X_test))
test_final.to_csv("/content/drive/MyDrive/Colab Notebooks/submission.csv", index=False)