# Соревнование Kaggle по линейной регрессии

https://www.kaggle.com/t/06c22d6d07604685974b8bf285ce5466


In [1]:
import os
from dotenv import load_dotenv
import pandas as pd

# Populate the process environment from a .env file (python-dotenv) before reading settings.
load_dotenv()

# Root directory of the local datasets. Fail fast with a readable message:
# os.getenv returns None for an unset variable, which would otherwise only
# surface later as a TypeError inside os.path.join.
data_path = os.getenv('LOCAL_DATA')
if data_path is None:
    raise RuntimeError(
        "Environment variable LOCAL_DATA is not set; "
        "define it in the shell or in a .env file."
    )

In [2]:
# Both competition CSVs live under <data_path>/competition_1 and differ only by split name.
train_data_path, test_data_path = [
    os.path.join(data_path, "competition_1", f"prices_{split}.csv")
    for split in ("train", "test")
]

In [3]:
# Read both splits the same way; the first CSV column is used as the row index.
train_data, test_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_data_path, test_data_path)
)

# Report the (rows, columns) shape of each split.
print(f"Объем тренировочной выборки: {train_data.shape}, \nОбъем тестовой выборки: {test_data.shape}")

# Last expression of the cell: rich preview of the training frame.
train_data

Объем тренировочной выборки: (331, 7), 
Объем тестовой выборки: (83, 6)


Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,2013.083,34.0,157.6052,7.0,24.96628,121.54196,39.1
1,2013.500,13.3,561.9845,5.0,24.98746,121.54391,54.8
2,2012.917,13.7,1236.5640,1.0,24.97694,121.55391,30.6
3,2013.500,8.5,104.8101,5.0,24.96674,121.54067,55.5
4,2013.500,13.2,1164.8380,,24.99156,121.53406,34.3
...,...,...,...,...,...,...,...
326,2013.417,38.5,216.8329,7.0,24.98086,121.54162,41.0
327,2013.417,9.7,421.4790,5.0,24.98246,121.54477,49.3
328,2012.750,0.0,208.3905,6.0,24.95618,121.53844,44.0
329,2013.167,16.2,2288.0110,3.0,24.95885,121.51359,24.4


In [7]:
from ydata_profiling import ProfileReport

# Build a profiling report for the training frame; it is displayed and exported in the following cells.
profile = ProfileReport(
    train_data,
    title="Profiling Report",
)

In [8]:
# Bare expression as the cell's last line: shows the profiling report as the cell output.
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 7/7 [00:00<?, ?it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [9]:
# Export the profiling report to "your_report.html" (relative path, so it lands in the working directory).
profile.to_file("your_report.html")

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Impute missing house ages with the most frequent observed value.
# `mode()` returns a Series (there may be several modes); the first one is used.
mode = train_data['X2 house age'].mode(dropna=True)
# Assign the result back to the column: `df[col].fillna(..., inplace=True)` operates on a
# temporary object (FutureWarning in pandas 2.x, silently does nothing under Copy-on-Write).
train_data['X2 house age'] = train_data['X2 house age'].fillna(mode[0])

In [None]:
# Impute missing MRT distances with the training-set mean.
# `mean` is kept as a notebook-level variable because the test set is imputed with it later.
mean = train_data['X3 distance to the nearest MRT station'].mean()
# Assign the result back to the column: `df[col].fillna(..., inplace=True)` operates on a
# temporary object (FutureWarning in pandas 2.x, silently does nothing under Copy-on-Write).
train_data['X3 distance to the nearest MRT station'] = (
    train_data['X3 distance to the nearest MRT station'].fillna(mean)
)

In [None]:
# Treat a missing convenience-store count as zero stores.
# Assign the result back to the column: `df[col].fillna(..., inplace=True)` operates on a
# temporary object (FutureWarning in pandas 2.x, silently does nothing under Copy-on-Write).
train_data['X4 number of convenience stores'] = (
    train_data['X4 number of convenience stores'].fillna(0)
)

In [8]:
import statsmodels.api as sm

def stepwise_selection(X, y, initial_list=None, threshold_in=0.01, threshold_out=0.05):
    """
    Bidirectional stepwise feature selection for regression, driven by OLS p-values.

    Every iteration performs a forward step (add the not-yet-included feature with
    the smallest p-value, if that p-value is below ``threshold_in``) followed by a
    backward step (drop the included feature with the largest p-value, if that
    p-value is above ``threshold_out``). The loop ends after the first iteration
    that neither adds nor removes a feature. Each change is reported via ``print``.

    ``threshold_in`` should not exceed ``threshold_out``: otherwise a feature whose
    p-value lies between the two can be added and removed again on every
    iteration and the loop never terminates. A warning is emitted in that case.

    :param X: pd.DataFrame - feature matrix
    :param y: pd.Series - continuous target variable
    :param initial_list: list or None - features to start with (None means empty)
    :param threshold_in: float - p-value threshold for adding a feature
    :param threshold_out: float - p-value threshold for removing a feature
    :return: list of selected feature names, in the order they were added
    """
    import warnings

    if threshold_in > threshold_out:
        warnings.warn(
            "threshold_in > threshold_out: the same feature may be added and "
            "removed repeatedly, so the selection may never terminate.",
            stacklevel=2,
        )

    # Copy the caller's list so it is never mutated; None replaces the mutable default.
    included = [] if initial_list is None else list(initial_list)

    while True:
        changed = False

        # Forward step. Candidates keep the column order of X (duplicates collapsed),
        # so ties between equal p-values are resolved the same way on every run.
        excluded = [col for col in dict.fromkeys(X.columns) if col not in included]
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_col in excluded:
            try:
                model = sm.OLS(y, sm.add_constant(X[included + [new_col]])).fit()
                new_pval[new_col] = model.pvalues[new_col]
            except Exception as exc:
                # Best effort: a candidate whose model cannot be fitted keeps a NaN
                # p-value and is skipped, but the failure is surfaced, not hidden.
                warnings.warn(
                    f"Skipping feature {new_col!r}: OLS fit failed "
                    f"({type(exc).__name__}: {exc})",
                    stacklevel=2,
                )
                continue

        if not new_pval.empty:
            # min() ignores NaN; if every candidate failed the result is NaN and
            # the comparison below is False, so nothing is added.
            best_pval = new_pval.min()
            if best_pval < threshold_in:
                best_feature = new_pval.idxmin()
                included.append(best_feature)
                changed = True
                print(f"Добавлен признак: {best_feature} с p-value {best_pval:.4f}")

        # Backward step: refit on the current set and drop the least significant feature.
        if included:
            model = sm.OLS(y, sm.add_constant(X[included])).fit()
            # The intercept added by add_constant is not a candidate for removal.
            pvalues = model.pvalues.drop('const', errors='ignore')
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                changed = True
                print(f"Удален признак: {worst_feature} с p-value {worst_pval:.4f}")

        if not changed:
            break

    return included

In [9]:
# Target column first, then every remaining column becomes a feature.
y = train_data['Y house price of unit area']
X = train_data.drop(columns=[y.name])

In [10]:
# Run stepwise selection with its default thresholds; additions/removals are printed as it goes.
selected_features = stepwise_selection(X=X, y=y)

Добавлен признак: X3 distance to the nearest MRT station с p-value 0.0000
Добавлен признак: X2 house age с p-value 0.0000
Добавлен признак: X5 latitude с p-value 0.0000
Добавлен признак: X4 number of convenience stores с p-value 0.0000
Добавлен признак: X1 transaction date с p-value 0.0012


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import statsmodels.api as sm
import numpy as np


In [12]:
# Number of missing values per column in the competition test set.
test_data.isnull().sum()

X1 transaction date                       0
X2 house age                              0
X3 distance to the nearest MRT station    5
X4 number of convenience stores           7
X5 latitude                               0
X6 longitude                              0
dtype: int64

In [None]:
# Impute the competition test set with the same values used for training:
# the training-set mean (`mean`, computed in an earlier cell) for MRT distance
# and zero for a missing store count.
# Assign the results back to the columns: `df[col].fillna(..., inplace=True)` operates on a
# temporary object (FutureWarning in pandas 2.x, silently does nothing under Copy-on-Write).
test_data['X3 distance to the nearest MRT station'] = (
    test_data['X3 distance to the nearest MRT station'].fillna(mean)
)
test_data['X4 number of convenience stores'] = (
    test_data['X4 number of convenience stores'].fillna(0)
)

In [14]:
# Keep only the columns chosen by stepwise selection (all rows).
X_selected = X.loc[:, selected_features]

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Hold out 20% of the labelled data for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply the same transform to the hold-out split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train and evaluate on the *scaled* features. The submission cell feeds
# scaler.transform(...) output into lr.predict, so the model must be fitted in
# the same feature space; fitting on the raw frames made those predictions wrong.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

y_pred = lr.predict(X_test_scaled)

# Hold-out metrics.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nМетрики на тесте:")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"R^2: {r2:.4f}")


Метрики на тесте:
MAE: 6.8599
MSE: 82.3134
R^2: 0.5879


In [21]:
# Apply the already-fitted scaler to the selected columns of the competition test set.
# Note: this rebinds X_test_scaled, which previously held the scaled hold-out split.
X_test_scaled = scaler.transform(test_data.loc[:, selected_features])

In [None]:
# Predict prices for the scaled competition features (rebinds y_pred from the evaluation cell).
y_pred = lr.predict(X_test_scaled)

# Two-column submission table: the test-set row index and the predicted price.
submission_columns = {'index': test_data.index, 'Y house price of unit area': y_pred}
submission = pd.DataFrame(submission_columns)
del submission_columns  # keep the notebook namespace identical to the original cell

# Write without the DataFrame's own index; the 'index' column already carries the row ids.
submission.to_csv("submission.csv", index=False)