In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score as r2
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import LinearRegression

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
matplotlib.rcParams.update({'font.size': 14})

In [2]:
def transform(data):
    clean_data = data.loc[
        (data['Rooms'] > 0) & data['Rooms'].notnull() &
        (data['Square'] > 0) & data['Square'].notnull() &
        (data['LifeSquare'] > 0) & data['LifeSquare'].notnull() &
        (data['KitchenSquare'] > 0) & data['KitchenSquare'].notnull() &
        (data['Floor'] > 0) & data['Floor'].notnull() &
        (data['HouseFloor'] > 0) & data['HouseFloor'].notnull() &
        (data['Square'] > data['LifeSquare'] + data['KitchenSquare']) &
        (data['Floor'] <= data['HouseFloor']) &
        (data['Healthcare_1'] > 0) & data['Healthcare_1'].notnull() &
        (data['Helthcare_2'] > 0) & data['Helthcare_2'].notnull()
    ]
    data.loc[(data['Rooms'] == 0) | (data['Rooms'].isnull()), 'Rooms'] = clean_data['Rooms'].quantile(q=0.5)
    data.loc[(data['Square'] == 0) | data['Square'].isnull(), 'Square'] = clean_data['Square'].quantile(q=0.5)
    data.loc[(data['LifeSquare'] == 0) | data['LifeSquare'].isnull(), 'LifeSquare'] = clean_data['LifeSquare'].quantile(q=0.5)
    data.loc[(data['KitchenSquare'] == 0) | data['KitchenSquare'].isnull(), 'KitchenSquare'] = clean_data['KitchenSquare'].quantile(q=0.5)
    data.loc[(data['Floor'] > data['HouseFloor']), 'Floor'] = data['HouseFloor']
    data.loc[(data['Floor'] == 0) | data['Floor'].isnull(), 'Floor'] = clean_data['Floor'].quantile(q=0.5)
    data.loc[(data['HouseFloor'] == 0) | data['HouseFloor'].isnull(), 'HouseFloor'] = data['Floor']
    data.loc[(data['Square'] < data['LifeSquare'] + data['KitchenSquare']), 'Square'] = (data['LifeSquare'] + data['KitchenSquare']) 
    data.loc[(data['Healthcare_1'] == 0) | (data['Healthcare_1'].isnull()), 'Healthcare_1'] = clean_data['Healthcare_1'].quantile(q=0.5)
    data.loc[(data['Helthcare_2'] == 0) | (data['Helthcare_2'].isnull()), 'Helthcare_2'] = clean_data['Helthcare_2'].quantile(q=0.5)
       
    return data

def fit(data):
    proposal = data['DistrictId'].value_counts()
    data['proposal_district'] = proposal[data['DistrictId']].values
    data['square_one_room'] = data['Square'] / data['Rooms']
    data['social'] = data['Social_1'] * data['Social_2'] * data['Social_3']
    
    return data
        
        
TRAIN_DATASET_PATH = 'data/train.csv' # y_train, y_valid
TEST_DATASET_PATH = 'data/test.csv'
TEST_TARGET_PATH = 'data/sample_submission.csv'
x = pd.read_csv(TRAIN_DATASET_PATH)
y = pd.DataFrame(x['Price'], columns=['Price'])
x_test = pd.read_csv(TEST_DATASET_PATH)
y_test = pd.read_csv(TEST_TARGET_PATH)
y_test = pd.DataFrame(y_test['Price'], columns=['Price'])
x_transform = transform(x)
x_fit = fit(x_transform)
x_data = x_fit[[
    'Rooms',
    'Square',
    'proposal_district',
    'square_one_room',
    'social',
    'Ecology_1',
    'Healthcare_1',
    'Helthcare_2',
]]
x_train, x_valid, y_train, y_valid = train_test_split(x_data, y, test_size=0.2, random_state=42)
x_test_transform = transform(x_test)
x_test_fit = fit(x_test_transform)
x_test_data = x_test_fit[[
    'Rooms',
    'Square',
    'proposal_district',
    'square_one_room',
    'social',
    'Ecology_1',
    'Healthcare_1',
    'Helthcare_2',
]]

In [3]:
def evaluate_preds(train_true_values, train_pred_values, valid_true_values, valid_pred_values):
    """
    Выводить R2
    """
    print("train R2:\t" + str(round(r2(train_true_values, train_pred_values), 3)))
    print("valid R2:\t" + str(round(r2(valid_true_values, valid_pred_values), 3)))

rf_model = RandomForestRegressor(max_depth=8, min_samples_split=100, n_estimators=500, n_jobs=-1, random_state=39)
rf_model.fit(x_train, y_train)
y_train_preds = rf_model.predict(x_train)
y_valid_preds = rf_model.predict(x_valid)
y_test_preds = rf_model.predict(x_test_data)
evaluate_preds(y_train, y_train_preds, y_valid, y_valid_preds)
feature_importances = pd.DataFrame(zip(x_train.columns, rf_model.feature_importances_), 
                                   columns=['feature_name', 'importance'])

feature_importances.sort_values(by='importance', ascending=False)

train R2:	0.709
valid R2:	0.703


Unnamed: 0,feature_name,importance
1,Square,0.529076
2,proposal_district,0.333292
4,social,0.041211
0,Rooms,0.03527
5,Ecology_1,0.020666
3,square_one_room,0.018868
6,Healthcare_1,0.013859
7,Helthcare_2,0.007758


In [4]:
test_id = x_test["Id"]
pred_df = pd.DataFrame()
pred_df["Id"] = test_id
pred_df["Price"] = np.random.randint(0, 1e6, size=test_id.shape[0])
assert pred_df.shape[0] == 5000, f"Real pred-shape = {pred_df.shape[0]}, Expected pred-shape = 5000"

pred_df.to_csv("data/predictions_.csv", index=False)
pred_df = pd.read_csv("data/predictions_.csv")
pred_df.head(n=2)

Unnamed: 0,Id,Price
0,4567,359715
1,5925,24763


Ссылка на файл прогнозов - https://github.com/Mikhail-gb/ds/blob/40eaac0cc9e03869bbbcc87c5ea163b565f67e34/predictions.csv
Не успел до 9 числа загрузить на кагл.