In [11]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import os

# Pipeline: load the train/test samples, choose the XGBoost round count by
# cross-validation, score the test set, find the Random Forest's most
# important feature, and store both results in 'submission.npz'.

# Load the data
train = pd.read_csv('train_sample.csv')
test = pd.read_csv('test_sample.csv')

# Prepare the data: every column except 'class' is a feature, 'class' is the
# target; a copy of the test frame serves as the test feature matrix
X_train = train.drop(columns=['class'])
y_train = train['class']
X_test = test.copy()

# 1. XGBoost with cross-validation
dtrain = xgb.DMatrix(X_train, label=y_train)

params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'seed': 0,
}

cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=100,
    nfold=5,
    stratified=True,
    early_stopping_rounds=10,
    seed=0,
)

# The CV history has one row per boosting round that was kept, so its length
# is the number of rounds to use for the final model
best_rounds = len(cv_results)

# Train the final XGBoost model on the full training set and score the test set
model_xgb = xgb.train(params, dtrain, num_boost_round=best_rounds)
dtest = xgb.DMatrix(X_test)
prediction = model_xgb.predict(dtest)

# 2. Random Forest
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

# Find the most important feature; "+ 1" turns the zero-based argmax position
# into a 1-based feature number
importances = rf.feature_importances_
rf_most_important = np.argmax(importances) + 1

# Save the results
np.savez(
    'submission.npz',
    rf_most_important=rf_most_important,
    prediction=prediction,
)

# Check the contents. np.load keeps the .npz archive open until it is closed,
# so read it inside a context manager to release the file handle promptly.
with np.load('submission.npz') as npzfile:
    print(f"Самая важная фича Random Forest: {npzfile['rf_most_important']}")
    print(f"Форма предсказаний XGBoost: {npzfile['prediction'].shape}")
print("\nФайл 'submission.npz' готов для загрузки")

Самая важная фича Random Forest: 7
Форма предсказаний XGBoost: (500,)

Файл 'submission.npz' готов для загрузки
