In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, Normalizer

In [2]:
# Load the training table; the path is relative to the notebook's working directory.
data = pd.read_csv("train.csv")

In [3]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
def is_numeric_column(column):
    """Report whether every value in ``column`` can be parsed as a number.

    The check delegates to ``pd.to_numeric(..., errors='raise')``, so a column
    of numeric-looking strings (for example ``'1'`` or ``'2.5'``) also counts
    as numeric.

    Parameters
    ----------
    column : list, tuple, 1-d array or pandas.Series
        Values to probe.

    Returns
    -------
    bool
        ``True`` when the conversion succeeds, ``False`` when pandas rejects
        at least one value.
    """
    try:
        pd.to_numeric(column, errors='raise')
    except (ValueError, TypeError):
        # ValueError: a string that cannot be parsed, such as 'RL'.
        # TypeError: a value pandas cannot interpret at all, such as a list.
        return False
    return True

def prepare_data_new(data):
    """Clean, encode and scale ``data`` in place for model training.

    Steps, all applied directly to the passed frame:

    1. Missing ``LotFrontage`` values are replaced by the column mean.
    2. The ``Alley``, ``FireplaceQu``, ``PoolQC``, ``Fence`` and
       ``MiscFeature`` columns are dropped.
    3. For every remaining column, missing values are replaced by the column
       mode; the column is then label-encoded when it is not numeric, or
       standardised with ``StandardScaler`` when it is numeric.
       ``SalePrice`` is never scaled.

    A column that contains no observed value at all has no mode; it is left
    untouched instead of failing with an ``IndexError``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to transform. It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``LotFrontage`` or one of the dropped columns is absent. The drop
        is a single call, so the frame is not left partially dropped.
    """
    data['LotFrontage'] = data['LotFrontage'].fillna(data['LotFrontage'].mean())
    data.drop(
        columns=['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'],
        inplace=True,
    )
    for column in data.columns:
        if data[column].isnull().any():
            modes = data[column].mode()
            if modes.empty:
                # Nothing observed in this column, so there is no value to
                # impute with; skip it rather than crash on modes[0].
                continue
            data[column] = data[column].fillna(modes.iloc[0])
        if not is_numeric_column(data[column]):
            data[column] = LabelEncoder().fit_transform(data[column])
        elif column != 'SalePrice':
            # NOTE: the scaler is fitted on every row of the frame passed in,
            # including rows that may later become a validation split.
            scaled = StandardScaler().fit_transform(data[[column]].to_numpy())
            # fit_transform returns an (n, 1) array; store it as a flat column.
            data[column] = scaled.ravel()
            
def plot_standart(real, pred):
    """Draw the actual and the predicted values as two line series.

    Both inputs are plotted as given, so the x axis is whatever matplotlib
    derives from each input (positions for plain sequences).
    """
    for series, legend_name in ((real, 'Real'), (pred, 'Prediction')):
        plt.plot(series, label=legend_name)
    plt.title('Real vs Prediction')
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.legend()
    plt.show()
    
def plot_sorted(real, pred):
    """Plot actual and predicted values, each sorted in ascending order.

    Sorting is done independently for the two inputs, so the figure compares
    the two value distributions rather than individual samples.
    """
    curves = {'Real': sorted(real), 'Prediction': sorted(pred)}
    for legend_name, ordered_values in curves.items():
        plt.plot(ordered_values, label=legend_name)
    plt.title('Real vs Prediction')
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.legend()
    plt.show()
    
def hist_prediction(real, pred):
    """Overlay histograms of the predicted and the actual values.

    Both histograms use the same bin edges, are drawn semi-transparent and
    are labelled in a legend, so the two distributions can be compared
    bar by bar.

    Parameters
    ----------
    real : array-like
        Actual target values.
    pred : array-like
        Predicted target values.
    """
    bin_count = 100
    real_values = np.asarray(real, dtype=float).ravel()
    pred_values = np.asarray(pred, dtype=float).ravel()
    # Shared edges computed over both series; separate calls with an integer
    # bin count would give each series its own, non-comparable bins.
    bin_edges = np.histogram_bin_edges(
        np.concatenate([real_values, pred_values]), bins=bin_count
    )
    # alpha < 1 keeps the histogram drawn first visible under the second one.
    plt.hist(pred_values, bins=bin_edges, edgecolor='black', alpha=0.5, label='Prediction')
    plt.hist(real_values, bins=bin_edges, edgecolor='black', alpha=0.5, label='Real')
    plt.legend()
    # The horizontal axis of a histogram shows values, not sample indices.
    plt.xlabel('Value')
    plt.ylabel('Count')
    plt.title('Распределение')
    plt.show()

In [5]:
# Per-column count of missing values.
data.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [None]:
# For each listed feature, plot the mean SalePrice at every distinct feature value.
params = ['1stFlrSF','2ndFlrSF','MasVnrArea']
for param in params:
    class_price = data[[param, 'SalePrice']].groupby(param).mean()
    param_values = class_price.index.to_numpy()
    saleprice_values = class_price['SalePrice'].to_numpy()
    plt.plot(param_values, saleprice_values, marker='o', linestyle='-')

    # Label the axes
    plt.xlabel(param)
    plt.ylabel('SalePrice')

    # Render the figure
    plt.show()

In [None]:
# Earlier variant kept for reference; prepare_data_auto is not defined in the cells shown here.
#data_map = prepare_data_auto(data)
# Transform `data` in place, then re-count the missing values per column.
prepare_data_new(data)
data.isnull().sum()

In [None]:
# Show the first 20 rows of `data`.
data.head(20)

In [None]:
# Histogram of the SalePrice column, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.hist(data['SalePrice'], bins=100, edgecolor='black')

# Axis labels
ax.set_xlabel('SalePrice')
ax.set_ylabel('Count')

# Figure title
ax.set_title('Распределение SalePrice')

In [None]:
# Target: the SalePrice column.
train_target = data['SalePrice']
# Columns removed from the feature matrix in addition to the target.
drop_params = ['Id']
# Earlier experiment kept for reference (note it would bind a DataFrame, not a list of names):
#drop_params = data.drop(['OverallQual','OverallCond', 'SalePrice','ExterQual', 'ExterCond','SaleCondition','SaleType','BsmtQual','BsmtCond','1stFlrSF','2ndFlrSF','MasVnrArea','LotArea'], axis=1)
# Feature matrix: everything except the listed columns and the target.
train_data = data.drop(columns=drop_params + ['SalePrice'])

In [None]:
# Alternative gradient-boosting model kept for reference:
#model = LGBMRegressor(random_state=10, objective='mae', learning_rate=0.247332, n_estimators=300, num_leaves=30, max_depth=12, feature_fraction = 0.89952, min_data_in_leaf = 10)
# random_state pins the forest's randomness so the reported metric is repeatable
# across runs; 42 matches the seed used for train_test_split.
model=RandomForestRegressor(max_depth=None,min_samples_split=2, n_estimators=200, random_state=42)

In [None]:
# Candidate hyper-parameter values for the random-forest search.
param_grid = dict(
    n_estimators=[50, 100, 200],   # number of trees
    max_depth=[None, 5, 10],       # maximum tree depth
    min_samples_split=[2, 5, 10],  # minimum number of samples needed to split a node
)

# 5-fold cross-validated search scored by negative MSE; it is only constructed here, not fitted.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [None]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_target,
    test_size=0.2,
    random_state=42,
)
# Fit on the training part, then predict the held-out part.
test_pred = model.fit(X_train, y_train).predict(X_test)
# Mean absolute percentage error on the held-out rows, shown as the cell output.
metric = metrics.mean_absolute_percentage_error(y_true=y_test, y_pred=test_pred)
metric

In [None]:
# Visual comparison of held-out targets and predictions.
# to_list() hands the values over without the Series index, so they are plotted by position
# like test_pred (presumably the reason for the conversion).
plot_standart(y_test.to_list(),test_pred)
# Independently sorted curves of actual and predicted values.
plot_sorted(y_test,test_pred)
# Histograms of predicted and actual values.
hist_prediction(y_test,test_pred)

In [None]:
# Disabled report of the grid-search result ("Best parameters" / "Best MSE").
# `grid_search` is never fitted in the cells shown here, so best_params_ and
# best_score_ would not be available if these lines were re-enabled as they are.
#print("Лучшие параметры: ", grid_search.best_params_)
#print("Лучшее MSE: ", -grid_search.best_score_)