### Антипов Д. С. - KNN

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [None]:
# Columns of the raw dataset that this notebook works with.
cols = [
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'required_car_parking_spaces',
]

# Load the dataset and keep an independent copy restricted to those columns.
_data = pd.read_csv('../train_final.csv')[cols].copy()

In [None]:
# For every selected column print its name, the number of distinct values,
# and the distinct values themselves (in order of first appearance).
for column in cols:
    distinct_values = _data[column].unique().tolist()
    print(column, len(distinct_values), distinct_values)

In [None]:
# Count the missing values in each selected column.
_data.isna().sum()

In [None]:
# Work on a copy so the raw column selection in `_data` stays untouched.
data = _data.copy()

# Feature: 1 if at least one parking space was requested, else 0.
# A missing value compares as False and therefore yields 0.
data['need_parking'] = (data['required_car_parking_spaces'] > 0).astype(int)

# Feature: meal plan as an ordinal code; 'Undefined' shares code 0 with 'SC'.
# Meal values that are not keys of the mapping become NaN.
meal_map = {
    'SC': 0,
    'BB': 1,
    'HB': 2,
    'FB': 3,
    'Undefined': 0,
}
data['need_meal'] = data['meal'].map(meal_map)

# Feature: relative frequency of the booking's country within the dataset.
# value_counts() skips missing values, so a missing country maps to NaN.
country_freq = data['country'].value_counts(normalize=True)
data['country_frequency'] = data['country'].map(country_freq)

# Feature: combination of distribution channel and market segment.
# Only the 15 most frequent combinations keep their own label; every other
# combination (including missing ones) is collapsed into 'Other'.
channel_segment = data['distribution_channel'] + '_' + data['market_segment']
top_combinations = channel_segment.value_counts().head(15).index
channel_segment = channel_segment.where(
    channel_segment.isin(top_combinations), 'Other'
)

# One-hot encode the combined feature. The encoded frame reuses data's index
# explicitly: pd.concat(axis=1) aligns on index labels, so a fresh RangeIndex
# would silently misalign rows if `data` ever had a non-default index.
encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(channel_segment.to_frame('channel_segment'))
encoded_cols = encoder.get_feature_names_out(["channel_segment"])
encoded_df = pd.DataFrame(encoded, columns=encoded_cols, index=data.index)
data = pd.concat([data, encoded_df], axis=1)

# Target: 'market_segment' with everything outside the 3 most frequent
# segments generalised into 'Other'. It must stay the LAST column, because
# the modelling cell selects the target by position.
top_market_segment = data['market_segment'].value_counts().head(3).index
data['market_segment_knn'] = data['market_segment'].where(
    data['market_segment'].isin(top_market_segment), 'Other'
)

data.head(10)

In [None]:
# Predict the generalised market segment ('market_segment_knn').

target_col = 'market_segment_knn'

# Select the target and the features by name instead of by position, so the
# split does not depend on the exact column order of `data`.
# Features are all engineered columns: everything except the raw input
# columns listed in `cols` and the target itself.
y = data[target_col]
X = data.drop(columns=[*cols, target_col])

# NOTE(review): the one-hot 'channel_segment_*' features are built from
# 'market_segment', which is also the source of the target, so the scores
# below are likely inflated by target leakage -- confirm this is intended.

# Split into training and test sets (70% / 30%, fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3003
)

# Build and fit a KNN classifier; closer neighbours get larger vote weights.
knn = KNeighborsClassifier(n_neighbors=12, weights='distance')
knn.fit(X_train, y_train)

# Predict on the held-out test set.
y_pred = knn.predict(X_test)

# Evaluate the model.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))