In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read while downloading.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<url-encoded download URL>" entries consumed by
# the download loop below. The URL is a signed link (see the X-Goog-Date,
# X-Goog-Expires and X-Goog-Signature query parameters), so it only works for
# a limited time and has to be regenerated once it has expired.
DATA_SOURCE_MAPPING = 'dl-nlp-nes-2024:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F75047%2F8224096%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240522%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240522T120452Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D38d0bba523dada8cd267d29a8fd251c54d6a28171cf85a7d60fa456e6bd5a62784a40e66aa5870e83c65e4783732e3f843030ab6ead47ee81ad1d90aa12dbe8f7bf2d15f9487ea29c927a038dfa461856551eca3b5b9c9c259f244ee384864075f313168226ecd2d80a925755ca5a81b23837988f9cacf703228f902bc605963f3ed976c5de523cd7ac4809d97eb2f4cf36672c4192e4014bcb817d53b8a7f2b41cd94e31d262448d1db3eb509b82d95965c3eab89bb847217b1092a2e166245736babcfe667678e66db6e9f1fd2eef4f8e1e1d5519bc47b0588d48eee94cb8cea18f7d335246fadf795f511e51a1028588a9da8dd724b0cc9cca594303ce4b5'

# Local directories that are created below and filled with the downloaded data.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible cells.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING
# into a temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
# A source that cannot be fetched is reported and skipped so that the
# remaining sources are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only; everything after it is the encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without a usable value the progress
            # bar degrades to a plain byte counter instead of raising.
            content_length = fileres.headers['content-length']
            try:
                total_bytes = int(content_length)
            except (TypeError, ValueError):
                total_bytes = 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch re-opens the file by name, so buffered bytes must
            # be on disk first; rewinding keeps the zip branch at offset 0.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name: binding the archive to `tarfile` would
                # replace the module and break the next tar source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading dl-nlp-nes-2024, 16046 bytes compressed
Downloaded and uncompressed: dl-nlp-nes-2024
Data source import complete.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dl-nlp-nes-2024/sample_submission_random.csv
/kaggle/input/dl-nlp-nes-2024/names_test.csv
/kaggle/input/dl-nlp-nes-2024/names_train.csv


In [None]:
pip install Levenshtein

Collecting Levenshtein
  Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.8.0 (from Levenshtein)
  Downloading rapidfuzz-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.25.1 rapidfuzz-3.9.1


In [None]:
pip install pyphen

Collecting pyphen
  Downloading pyphen-0.15.0-py3-none-any.whl (2.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/2.1 MB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.1/2.1 MB[0m [31m30.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen
Successfully installed pyphen-0.15.0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
import Levenshtein
import random
import pyphen

# Competition data: the training pairs (used below with their 'Category'
# label) and the test pairs for which 'Category' is predicted at the end.
train_df = pd.read_csv('/kaggle/input/dl-nlp-nes-2024/names_train.csv')
test_df = pd.read_csv('/kaggle/input/dl-nlp-nes-2024/names_test.csv')

def add_typo(name):
    """Return `name` with two adjacent characters swapped at a random position.

    Names shorter than two characters are returned unchanged. The position is
    drawn from the global `random` generator.
    """
    length = len(name)
    if length >= 2:
        idx = random.randint(0, length - 2)
        head, left, right, tail = name[:idx], name[idx], name[idx + 1], name[idx + 2:]
        return head + right + left + tail
    return name

def substitute_character(name):
    """Return `name` with every a/o, e/i and s/z character swapped for its pair.

    Only the lowercase letters listed in the table are replaced; all other
    characters are copied through unchanged.
    """
    swaps = {'a': 'o', 'o': 'a', 'e': 'i', 'i': 'e', 's': 'z', 'z': 's'}
    return ''.join(swaps.get(char, char) for char in name)

def generate_synthetic_data(df):
    """Build an augmented frame with seven rows per input pair.

    For every row of `df` (columns 'first_name', 'first_name_draft',
    'Category') the result contains the unchanged pair followed by six
    variants in which exactly one of the two names is altered; the
    'Category' value is copied to every variant.
    """
    out_columns = ['first_name', 'first_name_draft', 'Category']
    rows = []
    for first, draft, label in zip(df['first_name'], df['first_name_draft'], df['Category']):
        rows.extend([
            [first, draft, label],                        # unchanged pair
            [add_typo(first), draft, label],              # adjacent swap in first_name
            [first, add_typo(draft), label],              # adjacent swap in the draft
            [substitute_character(first), draft, label],  # letter substitutions in first_name
            [first, substitute_character(draft), label],  # letter substitutions in the draft
            [first[::-1], draft, label],                  # reversed first_name
            [first, draft[::-1], label],                  # reversed draft
        ])
    return pd.DataFrame(rows, columns=out_columns)

# Seed Python's RNG so that add_typo() yields the same augmented rows on
# every run of the notebook.
random.seed(42)

# Hold out 20% of the ORIGINAL pairs before augmenting. Splitting after
# augmentation puts variants of the same pair on both sides of the split and
# makes the validation score look better than it is.
train_part_df, val_part_df = train_test_split(train_df, test_size=0.2, random_state=42)

# generate_synthetic_data() already emits the unchanged pair next to its
# variants, so its output is used as-is; concatenating it with the source
# frame again would count every original row twice.
synthetic_train_df = generate_synthetic_data(train_part_df)
train_df_extended = synthetic_train_df.reset_index(drop=True)

# Built once and reused: constructing the hyphenator on every call is
# loop-invariant work.
_HYPHENATOR = pyphen.Pyphen(lang='en')

# Single source of truth for the model inputs (train, validation and test).
FEATURE_COLUMNS = [
    'levenshtein_distance', 'jaro_distance', 'ratio_distance', 'length_difference',
    'common_characters', 'different_characters', 'starts_with_same_letter',
    'ends_with_same_letter', 'n_grams_overlap', 'syllables_first_name',
    'syllables_first_name_draft', 'syllable_difference',
]


def count_syllables(word):
    """Approximate the syllable count of `word`.

    Returns the number of pieces obtained by splitting pyphen's English
    hyphenation of `word` on '-'.
    """
    return len(_HYPHENATOR.inserted(word).split('-'))


def _pair_features(name, draft):
    """Return a dict with every FEATURE_COLUMNS value for one name pair."""
    name_chars, draft_chars = set(name), set(draft)
    name_bigrams = {name[i:i + 2] for i in range(len(name) - 1)}
    draft_bigrams = {draft[i:i + 2] for i in range(len(draft) - 1)}
    name_syllables = count_syllables(name)
    draft_syllables = count_syllables(draft)
    return {
        'levenshtein_distance': Levenshtein.distance(name, draft),
        'jaro_distance': Levenshtein.jaro(name, draft),
        'ratio_distance': Levenshtein.ratio(name, draft),
        'length_difference': abs(len(name) - len(draft)),
        'common_characters': len(name_chars & draft_chars),
        # One-directional: characters of `name` that are absent from `draft`.
        'different_characters': len(name_chars - draft_chars),
        # Slices instead of [0] / [-1] so an empty name cannot raise IndexError.
        'starts_with_same_letter': name[:1] == draft[:1],
        'ends_with_same_letter': name[-1:] == draft[-1:],
        'n_grams_overlap': len(name_bigrams & draft_bigrams),
        'syllables_first_name': name_syllables,
        'syllables_first_name_draft': draft_syllables,
        'syllable_difference': abs(name_syllables - draft_syllables),
    }


def create_features(df):
    """Return a copy of `df` with the FEATURE_COLUMNS columns added.

    Args:
        df: frame with 'first_name' and 'first_name_draft' columns.

    Returns:
        A new DataFrame; the argument itself is not modified. Missing names
        are treated as empty strings for the feature computation only.
    """
    names = df['first_name'].fillna('').astype(str)
    drafts = df['first_name_draft'].fillna('').astype(str)
    # One pass over the pairs instead of one row-wise apply per feature.
    features = pd.DataFrame(
        [_pair_features(name, draft) for name, draft in zip(names, drafts)],
        columns=FEATURE_COLUMNS,
        index=df.index,
    )
    return df.assign(**{column: features[column] for column in FEATURE_COLUMNS})


train_df_extended = create_features(train_df_extended)
val_df = create_features(val_part_df)
test_df = create_features(test_df)

X_train = train_df_extended[FEATURE_COLUMNS]
y_train = train_df_extended['Category']
X_val = val_df[FEATURE_COLUMNS]
y_val = val_df['Category']
X_test = test_df[FEATURE_COLUMNS]


def _tune(estimator, param_grid):
    """Grid-search `param_grid` with 3-fold CV on the training split.

    Returns the fitted GridSearchCV object.
    NOTE(review): the folds are built on augmented rows, so variants of one
    pair can still fall into different folds; this only affects
    hyper-parameter selection, not the hold-out validation score.
    """
    search = GridSearchCV(estimator, param_grid, cv=3, scoring='accuracy')
    search.fit(X_train, y_train)
    return search


param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
grid_rf = _tune(RandomForestClassifier(random_state=42), param_grid_rf)
best_rf = grid_rf.best_estimator_

param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.05],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
grid_gb = _tune(GradientBoostingClassifier(random_state=42), param_grid_gb)
best_gb = grid_gb.best_estimator_

param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.05],
    'max_depth': [3, 5],
    'min_child_weight': [1, 3],
    'subsample': [0.8, 1.0]
}
grid_xgb = _tune(XGBClassifier(random_state=42), param_grid_xgb)
best_xgb = grid_xgb.best_estimator_

param_grid_lgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.05],
    'max_depth': [3, 5],
    'num_leaves': [31, 63],
    'min_child_samples': [20, 30],
    'subsample': [0.8, 1.0]
}
# verbose=-1 silences LightGBM's per-fit info lines, which otherwise flood
# the cell output with thousands of lines.
grid_lgb = _tune(LGBMClassifier(random_state=42, verbose=-1), param_grid_lgb)
best_lgb = grid_lgb.best_estimator_

# Stack the tuned models (plus a logistic regression) under a linear SVC.
estimators = [
    ('lr', LogisticRegression(random_state=42)),
    ('rf', best_rf),
    ('gb', best_gb),
    ('xgb', best_xgb),
    ('lgb', best_lgb)
]
stacking_model = StackingClassifier(estimators=estimators, final_estimator=SVC(kernel='linear', probability=True, random_state=42))

cross_val_scores = cross_val_score(stacking_model, X_train, y_train, cv=5, scoring='accuracy')
print(f'Cross-Validation Accuracy: {cross_val_scores.mean():.4f}')

stacking_model.fit(X_train, y_train)

# Scored on original, un-augmented pairs the model has never seen.
y_pred = stacking_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.4f}')

test_df['Category'] = stacking_model.predict(X_test)
submission = test_df[['Id', 'Category']]
submission.to_csv('submission.csv', index=False)

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
[LightGBM] [Info] Number of positive: 2648, number of negative: 3172
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000264 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 292
[LightGBM] [Info] Number of data points in the train set: 5820, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.454983 -> initscore=-0.180558
[LightGBM] [Info] Start training from score -0.180558
[LightGBM] [Info] Number of positive: 2648, number of negative: 3172
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000294 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 285
[LightGBM] [Info] Number of data poi

Расширяем тренировочный датасет с помощью синтетических данных. Они генерируются с помощью следующих функций:
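*   add_typo: имитирует опечатку — меняет местами два соседних символа в случайной позиции имени.
*   substitute_character: заменяет символы в имени по фиксированной таблице (a↔o, e↔i, s↔z).
*   generate_synthetic_data: для каждой пары имён создаёт вариации — исходную пару, пары с опечаткой, с заменой символов и с перевёрнутым именем; метка Category при этом сохраняется.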

Создаем много разных признаков для улучшения качества модели:

*   Расстояние Левенштейна
*   Сходство Джаро (функция Levenshtein.jaro, без поправки Винклера)
*   Пропорция Левенштейна (Levenshtein ratio)
*   Модуль разницы длин
*   Количество общих символов (в данном случае букв)
*   Количество разных букв
*   Начинается с одной буквы (бинарный признак)
*   Заканчивается одной буквой (бинарный признак)
*   Количество общих биграмм, количество слогов в каждом имени и модуль разности числа слогов

Делим данные на тренировочную и валидационную выборки в соотношении 80/20 для оценки модели.

Настраиваем гиперпараметры для RandomForestClassifier с использованием GridSearchCV и кросс-валидации (3-кратной), аналогично для GradientBoostingClassifier, XGBoostClassifier и LightGBMClassifier.

Создаем комбинацию лучших моделей с использованием StackingClassifier. Логистическая регрессия, RandomForest, GradientBoosting, XGBoost и LightGBM используются как базовые модели, а SVC с линейным ядром используется как финальный классификатор.

Обучаем стекинг-модель на всей тренировочной части данных, оцениваем её на валидационной части и строим предсказания для тестовой выборки.

Конец!























