In [68]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold
import scipy
import sklearn
import lightgbm as lgb

from sklearn.metrics import mean_squared_error

In [69]:
pd.options.display.max_columns = 100

In [70]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [71]:
target_name = 'SalePrice'
target = train_df[target_name]

После EDA избавимся от всех переменных которые имели малый вес в использованной модели LGBM

In [72]:
float_columns = [
    'LotFrontage', 'LotArea', 'OverallQual', 'MasVnrArea',
     'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
      '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'TotRmsAbvGrd',
       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
        'ScreenPorch', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'MoSold', 'YrSold'
]

categorical_columns = ['MSSubClass',
 'LotShape',
 'Neighborhood',
 'Condition1',
 'OverallCond',
 'Exterior1st',
 'Exterior2nd',
 'ExterCond',
 'BsmtQual',
 'BsmtExposure',
 'HeatingQC',
 'KitchenQual',
 'FireplaceQu',
 'GarageFinish',
 'PavedDrive',
 'SaleCondition', 
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'Fireplaces',
 'GarageType',
 'CentralAir']

In [73]:
df = pd.concat((train_df, test_df), axis = 0)
df = df[float_columns+categorical_columns +['SalePrice', 'Id']]

In [74]:
def target_encoding(n = 9):
    categ_nunique = df[categorical_columns].nunique()
    target_encoding = categ_nunique[categ_nunique>n].index
    for col in target_encoding:
        feature_name = col + "Rank"
        df.loc[:, feature_name] = df[col].map(df.groupby(col).SalePrice.median())
        df.loc[:, feature_name] = df.loc[:, feature_name].rank(method = "dense")
        df.drop(columns = col, inplace = True)
        categorical_columns.remove(col)
    return [col + 'Rank' for col in target_encoding]
target_encoding_cols = target_encoding(9)

In [75]:
df[categorical_columns] = df[categorical_columns].apply(LabelEncoder().fit_transform)

scaler = RobustScaler()

df[float_columns] = scaler.fit_transform(df[float_columns])
df[float_columns] = scaler.transform(df[float_columns])

In [76]:
train_df = df[~df['SalePrice'].isna()]
test_df = df[df['SalePrice'].isna()].drop(columns = 'SalePrice')

In [77]:
def model(features, test_features, n_folds = 4):
    
    """
    
    Return
    --------
        submission (pd.DataFrame): 
            dataframe with `SK_ID_CURR` and `TARGET` probabilities
            predicted by the model.
        feature_importances (pd.DataFrame): 
            dataframe with the feature importances from the model.
        valid_metrics (pd.DataFrame): 
            dataframe with training and validation metrics (ROC AUC) for each fold and overall.
        
    """
    
    labels = np.array(features[target_name])
    features = features.drop(columns = [target_name])
        
    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    
    feature_names = list(features.columns)
    
    features = np.array(features)
    test_features = np.array(test_features)
    
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 50)
    
    feature_importance_values = np.zeros(len(feature_names))
    
    test_predictions = np.zeros(test_features.shape[0])
    
    out_of_fold = np.zeros(features.shape[0])
    
    valid_scores = []
    train_scores = []
        
    for train_indices, valid_indices in k_fold.split(features):
        
        train_features, train_labels = features[train_indices], labels[train_indices]
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]
        
        model = lgb.LGBMRegressor(n_estimators=10000, objective = 'regression', learning_rate = 0.05, 
                                   reg_alpha = 0.1, reg_lambda = 0.1, 
                                   subsample = 0.8, n_jobs = -1, random_state = 41)
        
        model.fit(train_features, train_labels, eval_metric = 'rmse',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'],
                  early_stopping_rounds = 1000, verbose = 200)
        
        best_iteration = model.best_iteration_
        
        feature_importance_values += model.feature_importances_ / k_fold.n_splits
        
        test_predictions += model.predict(test_features, num_iteration = best_iteration)/ k_fold.n_splits
        
        out_of_fold[valid_indices] = model.predict(valid_features, num_iteration = best_iteration)
    
        valid_score = model.best_score_['valid']['rmse']
        train_score = model.best_score_['train']['rmse']
        
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        
    submission = pd.DataFrame({target_name: test_predictions})
    
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    
    valid_rmse = mean_squared_error(labels, out_of_fold)
    
    valid_scores.append(valid_rmse)
    train_scores.append(np.mean(train_scores))
    
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores}) 
    
    return submission, feature_importances, metrics

In [78]:
feature_columns = categorical_columns   +target_encoding_cols + float_columns
submission, fi, metrics = model(train_df[feature_columns + ['SalePrice']], test_df[feature_columns])

Training Data Shape:  (1460, 46)
Testing Data Shape:  (1459, 46)
Training until validation scores don't improve for 1000 rounds
[200]	train's rmse: 12508.9	train's l2: 1.56472e+08	valid's rmse: 23824.7	valid's l2: 5.67614e+08
[400]	train's rmse: 7588.89	train's l2: 5.75912e+07	valid's rmse: 24692.7	valid's l2: 6.09728e+08
[600]	train's rmse: 4891.82	train's l2: 2.39299e+07	valid's rmse: 25306.8	valid's l2: 6.40434e+08
[800]	train's rmse: 3172.96	train's l2: 1.00677e+07	valid's rmse: 25507.5	valid's l2: 6.50634e+08
[1000]	train's rmse: 2006.49	train's l2: 4.02599e+06	valid's rmse: 25582.1	valid's l2: 6.54442e+08
Early stopping, best iteration is:
[105]	train's rmse: 17529.7	train's l2: 3.0729e+08	valid's rmse: 23083.3	valid's l2: 5.32838e+08
Training until validation scores don't improve for 1000 rounds
[200]	train's rmse: 10999.6	train's l2: 1.20991e+08	valid's rmse: 30926.4	valid's l2: 9.56445e+08
[400]	train's rmse: 6475.07	train's l2: 4.19265e+07	valid's rmse: 31070.6	valid's l2: 9.

In [79]:
fi, metrics 

(             feature  importance
 0           LotShape      319.75
 1         Condition1      653.25
 2        OverallCond     1043.25
 3          ExterCond      290.25
 4           BsmtQual      306.75
 5       BsmtExposure      467.25
 6          HeatingQC      470.50
 7        KitchenQual      411.00
 8        FireplaceQu      749.00
 9       GarageFinish      431.25
 10        PavedDrive      175.75
 11     SaleCondition      437.00
 12      BsmtFullBath      189.25
 13      BsmtHalfBath      141.00
 14          FullBath      168.00
 15          HalfBath       91.50
 16      BedroomAbvGr      398.75
 17        Fireplaces       93.75
 18        GarageType      299.50
 19        CentralAir       41.00
 20    MSSubClassRank      544.25
 21  NeighborhoodRank     1885.50
 22   Exterior1stRank      904.75
 23   Exterior2ndRank      798.75
 24       LotFrontage     6010.75
 25           LotArea     5509.75
 26       OverallQual     1417.00
 27        MasVnrArea     4929.50
 28        Bsm

In [80]:
test_df

Unnamed: 0,LotFrontage,LotArea,OverallQual,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,TotRmsAbvGrd,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,YearBuilt,YearRemodAdd,GarageYrBlt,MoSold,YrSold,LotShape,Condition1,OverallCond,ExterCond,BsmtQual,BsmtExposure,HeatingQC,KitchenQual,FireplaceQu,GarageFinish,PavedDrive,SaleCondition,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,Fireplaces,GarageType,CentralAir,Id,MSSubClassRank,NeighborhoodRank,Exterior1stRank,Exterior2ndRank
0,-3.210884,-2.309988,-3.25,0.000000,-0.502543,144.0,-0.798184,-1.944423,-2.116058,0.000000,-2.339899,-3.25,-1.871185,0.004960,-0.376735,0.0,120.0,-41.542161,-51.123603,-47.129252,-1.5000,-1003.5,3,1,5,4,3,3,4,3,5,2,2,4,0,0,1,0,2,0,1,1,1461,11.0,10.0,12.0,14.0
1,-3.208617,-2.309830,-3.00,0.004015,-0.501696,0.0,-0.797787,-1.942697,-2.114403,0.000000,-2.338763,-3.00,-1.877563,0.013924,-0.369388,0.0,0.0,-41.543490,-51.125575,-47.130952,-1.5000,-1003.5,0,2,5,4,3,3,4,2,5,2,2,4,0,0,1,1,3,0,1,1,1462,11.0,10.0,6.0,3.0
2,-3.224490,-2.309856,-3.25,0.000000,-0.501942,0.0,-0.798572,-1.944245,-2.115936,0.001414,-2.337976,-3.00,-1.874969,0.007511,-0.369796,0.0,0.0,-41.526205,-51.099277,-47.108844,-1.6875,-1003.5,0,2,4,4,2,3,2,3,4,0,2,4,0,0,2,1,3,1,1,1,1463,15.0,14.0,12.0,14.0
3,-3.215420,-2.310086,-3.00,0.000744,-0.502294,0.0,-0.798026,-1.944253,-2.115943,0.001368,-2.338042,-2.75,-1.875153,0.012755,-0.369388,0.0,0.0,-41.525762,-51.099277,-47.108277,-1.5000,-1003.5,0,2,5,4,3,3,0,2,2,0,2,4,0,0,2,1,3,1,1,1,1464,15.0,14.0,12.0,14.0
4,-3.294785,-2.310383,-2.50,0.000000,-0.502925,0.0,-0.796004,-1.942887,-2.114590,0.000000,-2.338892,-3.25,-1.874603,0.000000,-0.360000,0.0,144.0,-41.528421,-51.103222,-47.111678,-1.8125,-1003.5,0,2,4,4,2,3,0,2,5,1,2,4,0,0,2,0,2,0,1,1,1465,14.0,23.0,9.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,-3.344671,-2.310566,-3.50,0.000000,-0.503414,0.0,-0.797378,-1.945720,-2.117396,0.001102,-2.339385,-3.25,-1.882324,0.000000,-0.376735,0.0,0.0,-41.538172,-51.117686,,-1.5000,-1004.5,3,2,6,4,3,3,2,3,5,3,2,4,0,0,1,1,3,0,6,1,2915,9.0,1.0,13.0,15.0
1455,-3.344671,-2.310569,-3.50,0.000000,-0.502945,0.0,-0.798114,-1.945720,-2.117396,0.001102,-2.339385,-3.00,-1.877960,0.000000,-0.371837,0.0,0.0,-41.538172,-51.117686,-47.124150,-1.6250,-1004.5,3,2,4,4,3,3,4,3,5,2,2,0,0,0,1,1,3,0,4,1,2916,9.0,1.0,13.0,15.0
1456,-3.029478,-2.309487,-3.25,0.000000,-0.501136,0.0,-0.798971,-1.943103,-2.114804,0.000000,-2.339039,-2.75,-1.873535,0.016794,-0.376735,0.0,0.0,-41.542604,-51.100592,-47.129819,-1.3125,-1004.5,3,2,6,4,3,3,0,3,4,2,2,0,1,0,1,0,4,1,5,1,2917,11.0,12.0,12.0,14.0
1457,-3.251701,-2.310058,-3.25,0.000000,-0.502787,0.0,-0.797294,-1.944307,-2.115775,0.000000,-2.339705,-3.00,-1.882324,0.002834,-0.370204,0.0,0.0,-41.528421,-51.103222,,-1.4375,-1004.5,3,2,4,4,2,0,4,3,5,3,2,4,0,1,1,0,3,0,6,1,2918,7.0,12.0,9.0,4.0


In [81]:
test_df['SalePrice'] =submission

In [82]:
test_df.to_csv("submission.csv", columns=["Id", "SalePrice"], index=False)