In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

### Подготовка данных

In [2]:
# Load the test set; the relative path means 'test.csv' must be in the
# kernel's working directory, otherwise pandas raises FileNotFoundError.
test_data = pd.read_csv('test.csv')

FileNotFoundError: [Errno 2] File b'test.csv' does not exist: b'test.csv'

In [None]:
# Summary of the raw test frame (column dtypes and non-null counts),
# used to see which columns contain missing values.
test_data.info()

In [None]:
# Load the training set; the relative path means 'train.csv' must be in the
# kernel's working directory.
train_data = pd.read_csv('train.csv')

In [None]:
# Summary of the raw training frame (column dtypes and non-null counts),
# used to see which columns contain missing values.
train_data.info()

### Ищем на графиках ошибочные данные

In [None]:
# Rooms vs. total area on the raw training data, to spot outliers by eye.
plt.scatter(train_data.Rooms, train_data.Square)
plt.xlabel('Rooms')
plt.ylabel('Square')
plt.title('Rooms vs. Square (raw train data)');

In [None]:
# Rooms vs. price on the raw training data, to spot outliers by eye.
plt.scatter(train_data.Rooms, train_data.Price)
plt.xlabel('Rooms')
plt.ylabel('Price')
plt.title('Rooms vs. Price (raw train data)');

In [None]:
# Total area vs. living area on the raw training data, to spot outliers by eye.
plt.scatter(train_data.Square, train_data.LifeSquare)
plt.xlabel('Square')
plt.ylabel('LifeSquare')
plt.title('Square vs. LifeSquare (raw train data)');

In [None]:
# Убираем все экстремальные значения

### Функции для очистки и подготовки данных

In [None]:
# Replacement values used by the cleaning functions. All of them are computed
# from train_data only, so the same constants are applied to every frame that
# is cleaned later.

# Rounded mean construction year, ignoring rows with HouseYear above 2020.
mean_year = np.round(train_data['HouseYear'][train_data['HouseYear'] <= 2020].mean())

# Rounded mean of Healthcare_1 over the training set.
mean_healthcare = np.round(train_data['Healthcare_1'].mean())

# Mean Square over rows whose Rooms does not exceed the mean Rooms of the
# flats with Square above 300.
mean_square_for_max = train_data['Square'][
    train_data['Rooms'] <= train_data['Rooms'][train_data['Square'] > 300].mean()
].mean()

# Mean Square of the rows whose LifeSquare is above 250 ...
mean_square_for_big_ls = train_data['Square'][train_data['LifeSquare'] > 250].mean()

# ... and the mean LifeSquare of rows that have at least that Square.
# (The spelling "squae" is kept: other cells reference this exact name.)
mean_life_squae_for_max = train_data['LifeSquare'][
    train_data['Square'] >= mean_square_for_big_ls
].mean()

In [None]:
def clean_year(df, mean_year):
    """Overwrite implausible construction years, modifying ``df`` in place.

    Args:
        df: DataFrame with a ``HouseYear`` column.
        mean_year: value written into every row whose ``HouseYear`` is
            greater than 2020.

    Returns:
        None.
    """
    beyond_cutoff = df['HouseYear'].gt(2020)
    df.loc[beyond_cutoff, 'HouseYear'] = mean_year

In [None]:
def clean_life_square(df, mean_life_squae_for_max):
    """Repair missing, tiny and huge ``LifeSquare`` values in place.

    Two passes are applied in this order:

    1. Rows where ``LifeSquare`` is NaN or below 5 get ``Square * 0.85``
       from the same row.
    2. Rows where ``LifeSquare`` is (now) above 250 get
       ``mean_life_squae_for_max``.

    Args:
        df: DataFrame with ``Square`` and ``LifeSquare`` columns.
        mean_life_squae_for_max: replacement for values above 250.

    Returns:
        None.
    """
    living = df['LifeSquare']
    needs_estimate = living.isnull() | (living < 5)
    # The right-hand side is a full Series; pandas aligns it on the index and
    # only the selected rows are written.
    df.loc[needs_estimate, 'LifeSquare'] = df['Square'] * 0.85

    # Re-read the column: the estimate above may itself exceed the upper bound.
    too_large = df['LifeSquare'] > 250
    df.loc[too_large, 'LifeSquare'] = mean_life_squae_for_max

In [None]:
def clean_square(df, mean_square_for_max):
    """Overwrite oversized ``Square`` values, modifying ``df`` in place.

    Args:
        df: DataFrame with a ``Square`` column.
        mean_square_for_max: value written into every row whose ``Square``
            is greater than 300.

    Returns:
        None.
    """
    oversized = df['Square'].gt(300)
    df.loc[oversized, 'Square'] = mean_square_for_max

In [None]:
def clean_healthcare_1(df, mean_healthcare):
    """Fill missing ``Healthcare_1`` values, modifying ``df`` in place.

    Args:
        df: DataFrame with a ``Healthcare_1`` column.
        mean_healthcare: value substituted for every NaN in that column.

    Returns:
        None.
    """
    df['Healthcare_1'] = df['Healthcare_1'].fillna(mean_healthcare)

In [None]:
def clean_rooms(df):
    """Repair implausible ``Rooms`` values, modifying ``df`` in place.

    Rows with ``Rooms < 1`` get a room count derived from ``LifeSquare``
    using contiguous half-open bands:

        LifeSquare < 30        -> 1
        30 <= LifeSquare < 45  -> 2
        45 <= LifeSquare < 60  -> 3
        60 <= LifeSquare < 75  -> 4
        LifeSquare >= 75       -> 6

    The earlier version compared with strict ``<`` / ``>`` on both sides, so
    a ``LifeSquare`` of exactly 30, 45 or 60 matched no band and the row kept
    its invalid room count. Every value that was handled before maps to the
    same result here.

    Rows whose ``LifeSquare`` is NaN match no band and are left untouched.
    Afterwards every row with ``Rooms > 10`` is set to 2.

    NOTE(review): the top band assigns 6 rather than 5. That value is kept
    from the original code; confirm it is intended.

    Args:
        df: DataFrame with ``Rooms`` and ``LifeSquare`` columns.

    Returns:
        None.
    """
    living = df['LifeSquare']
    # Evaluated once: the bands are disjoint, so a row repaired by one band
    # cannot be picked up again by a later one.
    no_rooms = df['Rooms'] < 1

    bands = (
        (living < 30, 1),
        ((living >= 30) & (living < 45), 2),
        ((living >= 45) & (living < 60), 3),
        ((living >= 60) & (living < 75), 4),
        (living >= 75, 6),
    )
    for in_band, rooms in bands:
        df.loc[no_rooms & in_band, 'Rooms'] = rooms

    df.loc[df['Rooms'] > 10, 'Rooms'] = 2

In [None]:
def prepare_data(
    df,
    mean_year=mean_year,
    mean_healthcare=mean_healthcare,
    mean_square_for_max=mean_square_for_max,
    mean_life_squae_for_max=mean_life_squae_for_max,
):
    """Run every cleaning step on ``df`` in place.

    The defaults are bound when this cell is executed, so they hold the
    values of the module-level constants as they were at that moment.

    The step order is significant: ``clean_life_square`` runs before
    ``clean_square``, so the living-area estimate is derived from the
    not-yet-cleaned ``Square`` column, and ``clean_rooms`` sees the already
    repaired ``LifeSquare``.

    Args:
        df: DataFrame to clean; it is modified in place.
        mean_year: replacement passed to ``clean_year``.
        mean_healthcare: replacement passed to ``clean_healthcare_1``.
        mean_square_for_max: replacement passed to ``clean_square``.
        mean_life_squae_for_max: replacement passed to ``clean_life_square``.

    Returns:
        None.
    """
    pipeline = (
        (clean_year, (mean_year,)),
        (clean_life_square, (mean_life_squae_for_max,)),
        (clean_healthcare_1, (mean_healthcare,)),
        (clean_rooms, ()),
        (clean_square, (mean_square_for_max,)),
    )
    for step, extra_args in pipeline:
        step(df, *extra_args)

In [None]:
# Clean both frames in place. No replacement values are passed, so both calls
# use prepare_data's defaults, i.e. the same constants for train and test.
prepare_data(train_data)
prepare_data(test_data)

In [None]:
# Summary of the training frame after prepare_data, to check the non-null
# counts of the columns that were filled.
train_data.info()

### Проверяем на графиках, что избавились от ошибочных данных

In [None]:
# Rooms vs. total area after prepare_data has been applied to train_data.
plt.scatter(train_data.Rooms, train_data.Square)
plt.xlabel('Rooms')
plt.ylabel('Square')
plt.title('Rooms vs. Square (cleaned train data)');

In [None]:
# Rooms vs. price after prepare_data has been applied to train_data.
plt.scatter(train_data.Rooms, train_data.Price)
plt.xlabel('Rooms')
plt.ylabel('Price')
plt.title('Rooms vs. Price (cleaned train data)');

In [None]:
# Total area vs. living area after prepare_data has been applied to train_data.
plt.scatter(train_data.Square, train_data.LifeSquare)
plt.xlabel('Square')
plt.ylabel('LifeSquare')
plt.title('Square vs. LifeSquare (cleaned train data)');

### Убираем лишние поля и делим на train и valid

In [None]:
# Feature matrix: encode train_data with get_dummies, then remove the target
# ("Price") and the row identifier ("Id") so neither is used as a feature.
X = pd.get_dummies(train_data).drop(columns=["Price", "Id"])
X.info()

In [None]:
# Regression target; the count of non-null values is shown as the cell output.
y = train_data["Price"]
y.count()

In [None]:
# Hold out 30% of the rows for validation; the fixed random_state makes the
# split the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Создаем и обучаем модель

In [None]:
# Random forest with fixed hyper-parameters; random_state is set so that
# repeated fits on the same data build the same forest.
model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=18,
    max_features=7,
    random_state=42,
)

In [None]:
# Fit on the training split only; the validation split stays unseen.
model.fit(X_train, y_train)

In [None]:
# Predict on both splits so the training and validation scores can be compared.
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_valid)

In [None]:
# Предсказываем на valid и train данных и проверяем метрики

In [None]:
# R^2 on the training split
r2_score(y_true=y_train, y_pred=y_pred_train)

In [None]:
# R^2 on the validation split
r2_score(y_true=y_valid, y_pred=y_pred)

In [None]:
# Per-feature importances of the fitted forest, in the column order of X_train.
model.feature_importances_

### Предсказываем цены для тестовых данных и выгружаем в файл

In [None]:
# Encode the test set the same way as the training set.
X_test = pd.get_dummies(test_data)
# get_dummies was applied to train and test independently, so the resulting
# columns may differ in set or order from the ones the model was fitted on.
# Reindexing to the training feature columns:
#   * drops columns the model never saw ("Id", and "Price" when this cell is
#     re-run after the assignment below),
#   * adds any dummy column missing from the test set, filled with 0,
#   * puts the columns in the training order.
X_test = X_test.reindex(columns=X.columns, fill_value=0)
test_data["Price"] = model.predict(X_test)

In [None]:
# Write the submission file: one row per test Id with its predicted Price,
# without the DataFrame index.
test_data[['Id', 'Price']].to_csv('Kolch_predictions.csv', index=False)