In [99]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Загрузка данных

In [100]:
# Считываем тренировочные данные
train = pd.read_csv("train.csv")
formula_train = pd.read_csv("formula_train.csv")

print(f"Train dataset shape: {train.shape}")
print(f"Train_formula dataset shape: {formula_train.shape}")

# Считываем тестовые данные
test = pd.read_csv("test.csv")
formula_test = pd.read_csv("formula_test.csv")

print(f"Test dataset shape: {test.shape}")
print(f"Test_formula dataset shape: {formula_test.shape}")

Train dataset shape: (17010, 82)
Train_formula dataset shape: (17010, 88)
Test dataset shape: (4253, 81)
Test_formula dataset shape: (4253, 87)


# Формирование датасета для обучения

In [101]:
formula_train = formula_train.drop(columns=['critical_temp'])

train_full = pd.concat([train, formula_train], axis=1)

# Удалим из данных ненужную колонку 'material'
train_full.drop(columns=['material'], inplace=True)

print(f"Full Train dataset shape: {train_full.shape}")

Full Train dataset shape: (17010, 168)


In [102]:
X = train_full.drop(columns=['critical_temp'])
y = train_full['critical_temp']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Features shape: (17010, 167)
Target shape: (17010,)


In [103]:
X = train_full.drop(columns=['critical_temp'])
y = train_full['critical_temp']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Features shape: (17010, 167)
Target shape: (17010,)


In [104]:
test_full = pd.concat([test, formula_test], axis=1)

# Удалим из данных ненужную колонку 'material'
test_full.drop(columns=['material'], inplace=True)

print(f"Full Test dataset shape: {test_full.shape}")

Full Test dataset shape: (4253, 167)


In [105]:
# Разделение данных на dataframe для обучения и тестирования
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print(f'Train dataset size: {X_train.shape}, {y_train.shape}')
print(f'Train dataset size: {X_test.shape}, {y_test.shape}')

Train dataset size: (11907, 167), (11907,)
Train dataset size: (5103, 167), (5103,)


# Обучение модели

In [106]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.utils.class_weight import compute_sample_weight

sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Обучение модели
model = GradientBoostingRegressor(n_estimators=100, max_depth=5)

# Поиск межквартильного размаха
Q1 = train_full['critical_temp'].quantile(0.25)
Q3 = train_full['critical_temp'].quantile(0.75)
IQR = Q3 - Q1

# Определение границ выбросов
lower_bound = Q1 - 0.25 * IQR
upper_bound = Q3 + 0.25 * IQR

# Удаление выбросов из обучающего датасета
train_full = train_full[(train_full['critical_temp'] >= lower_bound) & (train_full['critical_temp'] <= upper_bound)]

# Обновление X и y после удаления выбросов
X = train_full.drop(columns=['critical_temp'])
y = train_full['critical_temp']



model.fit(X_train, y_train, sample_weight=sample_weights)


In [108]:
# массив названия признаков
features = test_full.columns
coeff_df = pd.DataFrame(model.coef_, columns=['Coefficient'])  
coeff_df['features'] = features

coeff_df.sort_values(by='Coefficient')

AttributeError: 'GradientBoostingRegressor' object has no attribute 'coef_'

In [109]:
# Предсказание

y_pred = model.predict(X_test)

print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('R2 score:', r2_score(y_test, y_pred))

Mean Absolute Error: 6.977398691573442
Mean Squared Error: 125.81100918836856
R2 score: 0.8936167701487475


In [110]:
y_pred = pd.DataFrame(y_pred, columns=['critical_temp'])

y_pred = y_pred.reset_index()

y_pred.to_csv('predict.csv', index=False)