In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVR
from sklearn.svm import SVR

from sklearn.preprocessing import StandardScaler

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train['MSSubClass'] = train['MSSubClass'].astype('category').cat.codes
test['MSSubClass'] = test['MSSubClass'].astype('category').cat.codes

train['OverallQual'] = train['OverallQual'].astype('category').cat.codes
test['OverallQual'] = test['OverallQual'].astype('category').cat.codes

train['OverallCond'] = train['OverallCond'].astype('category').cat.codes
test['OverallCond'] = test['OverallCond'].astype('category').cat.codes

train.drop(['Id'], axis=1, inplace=True)

In [4]:
train['LotFrontage'] = train['LotFrontage'].fillna(train['LotFrontage'].mean())
train['Alley'] = train['Alley'].fillna('None')
train['MasVnrType'] = train['MasVnrType'].fillna('None')
train['MasVnrArea'] = train['MasVnrArea'].fillna(train['MasVnrArea'].mean())
train['BsmtQual'] = train['BsmtQual'].fillna('None')
train['BsmtCond'] = train['BsmtCond'].fillna('None')
train['BsmtExposure'] = train['BsmtExposure'].fillna('None')
train['BsmtFinType1'] = train['BsmtFinType1'].fillna('None')
train['BsmtFinType2'] = train['BsmtFinType2'].fillna('None')
train['Electrical'] = train['Electrical'].fillna('SBrkr')
train['FireplaceQu'] = train['FireplaceQu'].fillna('None')
train['GarageType'] = train['GarageType'].fillna('None')
train['GarageFinish'] = train['GarageFinish'].fillna('None')
train['GarageQual'] = train['GarageQual'].fillna('None')
train['GarageCond'] = train['GarageCond'].fillna('None')
train['PoolQC'] = train['PoolQC'].fillna('None')
train['Fence'] = train['Fence'].fillna('None')
train['MiscFeature'] = train['MiscFeature'].fillna('None')
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(train['GarageYrBlt'].mean())

In [5]:
test['MSZoning'] = test['MSZoning'].fillna('RL')
test['LotFrontage'] = test['LotFrontage'].fillna(test['LotFrontage'].mean())
test['Alley'] = test['Alley'].fillna('None')
test['Utilities'] = test['Utilities'].fillna('AllPub')
test['Exterior1st'] = test['Exterior1st'].fillna('VinylSd')
test['Exterior2nd'] = test['Exterior2nd'].fillna('VinylSd')
test['MasVnrType'] = test['MasVnrType'].fillna('None')
test['MasVnrArea'] = test['MasVnrArea'].fillna(test['MasVnrArea'].mean())
test['BsmtQual'] = test['BsmtQual'].fillna('None')
test['BsmtCond'] = test['BsmtCond'].fillna('None')
test['BsmtExposure'] = test['BsmtExposure'].fillna('None')
test['BsmtFinType1'] = test['BsmtFinType1'].fillna('None')
test['BsmtFinType2'] = test['BsmtFinType2'].fillna('None')
test['BsmtFinSF1'] = test['BsmtFinSF1'].fillna(test['BsmtFinSF1'].mean())
test['BsmtFinSF2'] = test['BsmtFinSF2'].fillna(test['BsmtFinSF2'].mean())
test['BsmtUnfSF'] = test['BsmtUnfSF'].fillna(test['BsmtUnfSF'].mean())
test['TotalBsmtSF'] = test['TotalBsmtSF'].fillna(test['TotalBsmtSF'].mean())
test['BsmtFullBath'] = test['BsmtFullBath'].fillna(test['BsmtFullBath'].mean())
test['BsmtHalfBath'] = test['BsmtHalfBath'].fillna(test['BsmtHalfBath'].mean())
test['KitchenQual'] = test['KitchenQual'].fillna('TA')
test['Functional'] = test['Functional'].fillna('Typ')
test['FireplaceQu'] = test['FireplaceQu'].fillna('None')
test['GarageType'] = test['GarageType'].fillna('None')
test['GarageYrBlt'] = test['GarageYrBlt'].fillna(test['GarageYrBlt'].mean())
test['GarageFinish'] = test['GarageFinish'].fillna('None')
test['GarageCars'] = test['GarageCars'].fillna(test['GarageCars'].mean())
test['GarageArea'] = test['GarageArea'].fillna(test['GarageArea'].mean())
test['GarageQual'] = test['GarageQual'].fillna('None')
test['GarageCond'] = test['GarageCond'].fillna('None')
test['PoolQC'] = test['PoolQC'].fillna('None')
test['Fence'] = test['Fence'].fillna('None')
test['MiscFeature'] = test['MiscFeature'].fillna('None')
test['SaleType'] = test['SaleType'].fillna('Oth')

In [6]:
columns = list(train.columns.values)

objcols = []
for column in columns:
    if train[column].dtype == 'int64' or train[column].dtype == 'float64':
        objcols.append(column)
        
objcols.remove('SalePrice')

In [7]:
for objcol in objcols:
    train[objcol] = (train[objcol] - train[objcol].mean())/train[objcol].std()
    test[objcol] = (test[objcol] - test[objcol].mean())/test[objcol].std()

In [8]:
columns = list(train.columns.values)

objcols = []
for i in columns:
    if train[i].dtype == 'object':
        objcols.append(i)

In [9]:
for objcol in objcols:
    train[objcol] = train[objcol].astype('category').cat.codes
    test[objcol] = test[objcol].astype('category').cat.codes

In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int8   
 1   MSZoning       1460 non-null   int8   
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   float64
 4   Street         1460 non-null   int8   
 5   Alley          1460 non-null   int8   
 6   LotShape       1460 non-null   int8   
 7   LandContour    1460 non-null   int8   
 8   Utilities      1460 non-null   int8   
 9   LotConfig      1460 non-null   int8   
 10  LandSlope      1460 non-null   int8   
 11  Neighborhood   1460 non-null   int8   
 12  Condition1     1460 non-null   int8   
 13  Condition2     1460 non-null   int8   
 14  BldgType       1460 non-null   int8   
 15  HouseStyle     1460 non-null   int8   
 16  OverallQual    1460 non-null   int8   
 17  OverallCond    1460 non-null   int8   
 18  YearBuil

In [12]:
sorted(train.corr())

['1stFlrSF',
 '2ndFlrSF',
 '3SsnPorch',
 'Alley',
 'BedroomAbvGr',
 'BldgType',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtFullBath',
 'BsmtHalfBath',
 'BsmtQual',
 'BsmtUnfSF',
 'CentralAir',
 'Condition1',
 'Condition2',
 'Electrical',
 'EnclosedPorch',
 'ExterCond',
 'ExterQual',
 'Exterior1st',
 'Exterior2nd',
 'Fence',
 'FireplaceQu',
 'Fireplaces',
 'Foundation',
 'FullBath',
 'Functional',
 'GarageArea',
 'GarageCars',
 'GarageCond',
 'GarageFinish',
 'GarageQual',
 'GarageType',
 'GarageYrBlt',
 'GrLivArea',
 'HalfBath',
 'Heating',
 'HeatingQC',
 'HouseStyle',
 'KitchenAbvGr',
 'KitchenQual',
 'LandContour',
 'LandSlope',
 'LotArea',
 'LotConfig',
 'LotFrontage',
 'LotShape',
 'LowQualFinSF',
 'MSSubClass',
 'MSZoning',
 'MasVnrArea',
 'MasVnrType',
 'MiscFeature',
 'MiscVal',
 'MoSold',
 'Neighborhood',
 'OpenPorchSF',
 'OverallCond',
 'OverallQual',
 'PavedDrive',
 'PoolArea',
 'PoolQC',
 'RoofMatl',
 'RoofStyle',
 'Sa

In [13]:
train.corr()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
MSSubClass,1.000000,0.037785,-0.308758,-0.132505,-0.025737,0.073545,0.090920,-0.007658,-2.621737e-02,0.060625,...,0.020837,-0.015024,0.074567,-0.045944,-0.006062,0.007052,-0.022100,0.020195,-0.033097,-0.054751
MSZoning,0.037785,1.000000,-0.106363,-0.034452,0.087654,-0.368086,0.061887,-0.017854,-1.192034e-03,-0.009895,...,-0.003128,0.002882,-0.061289,-0.005553,0.009293,-0.031496,-0.020628,0.097437,0.009494,-0.166872
LotFrontage,-0.308758,-0.106363,1.000000,0.306795,-0.037323,-0.065017,-0.144931,-0.075647,7.085092e-18,-0.181253,...,0.180868,-0.191630,-0.036751,0.009213,0.001168,0.010158,0.006768,-0.030846,0.058464,0.334901
LotArea,-0.132505,-0.034452,0.306795,1.000000,-0.197131,-0.029676,-0.165315,-0.149083,1.012318e-02,-0.121161,...,0.077672,-0.065167,0.036031,0.106135,0.038068,0.001205,-0.014261,0.012292,0.034169,0.263843
Street,-0.025737,0.087654,-0.037323,-0.197131,1.000000,-0.001587,-0.010224,0.115995,1.681767e-03,0.013960,...,0.004413,-0.004067,-0.029386,-0.160748,-0.022733,0.003690,-0.025043,0.014339,0.006064,0.041036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MoSold,0.007052,-0.031496,0.010158,0.001205,0.003690,-0.011274,-0.033455,-0.011599,-5.155166e-02,0.018902,...,-0.033737,0.036368,-0.009892,0.013957,-0.006495,1.000000,-0.145721,-0.047386,0.013320,0.046432
YrSold,-0.022100,-0.020628,0.006768,-0.014261,-0.025043,0.015176,0.036449,0.020507,2.335254e-02,-0.005992,...,-0.059689,0.060904,-0.035366,0.057708,0.004906,-0.145721,1.000000,-0.002327,0.003880,-0.028923
SaleType,0.020195,0.097437,-0.030846,0.012292,0.014339,0.001092,-0.000911,-0.025754,-1.267697e-01,0.014325,...,0.011009,-0.013592,-0.006154,-0.003340,0.015773,-0.047386,-0.002327,1.000000,0.184067,-0.054911
SaleCondition,-0.033097,0.009494,0.058464,0.034169,0.006064,-0.000162,-0.038118,0.033809,-8.970068e-02,0.051579,...,-0.067251,0.097425,0.100675,-0.008074,0.013027,0.013320,0.003880,0.184067,1.000000,0.213092


In [10]:
X_train = train.drop(['SalePrice'], axis=1).copy()
y_train = train['SalePrice']
X_test = test.drop(['Id'], axis=1).copy()

X_train.shape, y_train.shape, X_test.shape

((1460, 79), (1460,), (1459, 79))

In [509]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logreg_preds = logreg.predict(X_test)
 
acc_log = round(logreg.score(X_train, y_train) * 100, 2)
acc_log

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


95.62

In [11]:
random_forest = RandomForestClassifier(n_estimators=300)
random_forest.fit(X_train, y_train)
randomforest_pred = random_forest.predict(X_test)

acc_random_forest = round(random_forest.score(X_train, y_train) * 100, 2)
acc_random_forest

100.0

In [512]:
submission_1 = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": randomforest_pred
    })
submission_1.to_csv("./submission_1.csv",index=False)

In [515]:
submission_2 = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": logreg_preds
    })
submission_2.to_csv("./submission_2.csv",index=False)

In [521]:
n_folds = 10
def rmse_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

rmse_cv(logreg).mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

51641.78154924248

In [15]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)
tree_reg_pred = tree_reg.predict(X_test)

acc_tree_reg = round(tree_reg.score(X_train ,y_train)*100, 2)
acc_tree_reg

100.0

In [17]:
submission_3 = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": tree_reg_pred
    })
submission_3.to_csv("./submission_3.csv",index=False) # 현재까지는 베스트 0.20468

In [18]:
scores = cross_val_score(tree_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [20]:
print(scores)
print(scores.mean())
print(scores.std())

[-1.43690217e+09 -1.42743386e+09 -1.07878094e+09 -2.65682519e+09
 -1.95520945e+09 -1.20307455e+09 -9.33691732e+08 -1.54843917e+09
 -4.09523806e+09 -1.43782974e+09]
-1777342485.3924656
900737816.6723251


In [22]:
random_forest_reg = RandomForestRegressor()
random_forest_reg.fit(X_train, y_train)
randomforest_reg_pred = random_forest_reg.predict(X_test)

acc_random_forest_reg = round(random_forest_reg.score(X_train, y_train) * 100, 2)
acc_random_forest_reg

98.17

In [27]:
'''RandomForestRegressor(
    n_estimators=100,
    *,
    criterion='mse',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
    ccp_alpha=0.0,
    max_samples=None,
)
'''

"RandomForestRegressor(\n    n_estimators=100,\n    *,\n    criterion='mse',\n    max_depth=None,\n    min_samples_split=2,\n    min_samples_leaf=1,\n    min_weight_fraction_leaf=0.0,\n    max_features='auto',\n    max_leaf_nodes=None,\n    min_impurity_decrease=0.0,\n    min_impurity_split=None,\n    bootstrap=True,\n    oob_score=False,\n    n_jobs=None,\n    random_state=None,\n    verbose=0,\n    warm_start=False,\n    ccp_alpha=0.0,\n    max_samples=None,\n)\n"

In [24]:
submission_4 = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": randomforest_reg_pred
    }) 
submission_4.to_csv("./submission_4.csv",index=False) # 현재까지는 베스트 0.14782

In [32]:
param_grid = [
    {'n_estimators':[500,1000,4000], 'max_features':[8,10,12,14,16]},
    {'bootstrap' :[False], 'n_estimators':[3, 10], 'max_features':[2,3,4]}, ]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [8, 10, 12, 14, 16],
                          'n_estimators': [500, 1000, 4000]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [33]:
grid_search.best_params_

{'max_features': 12, 'n_estimators': 1000}

In [34]:
grid_search.best_estimator_

RandomForestRegressor(max_features=12, n_estimators=1000)

In [35]:
random_forest_reg1 = RandomForestRegressor(n_estimators=1000, max_features=12)
random_forest_reg1.fit(X_train, y_train)
randomforest_reg_pred1 = random_forest_reg1.predict(X_test)

acc_random_forest_reg1 = round(random_forest_reg1.score(X_train, y_train) * 100, 2)
acc_random_forest_reg1

98.27

In [36]:
submission_5 = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": randomforest_reg_pred1
    }) 
submission_5.to_csv("./submission_5.csv",index=False) # 더 낮다. 0.15002

In [37]:
feature_importances = grid_search.best_estimator_.feature_importances_

In [38]:
feature_importances

array([4.00406390e-03, 2.36045730e-03, 1.05989297e-02, 2.10141519e-02,
       8.15372309e-05, 3.99681364e-04, 1.67549561e-03, 2.28847231e-03,
       4.84383387e-06, 1.57804080e-03, 1.38254958e-03, 8.90318618e-03,
       1.09339724e-03, 3.42381502e-04, 1.32781350e-03, 2.80615355e-03,
       1.37270257e-01, 4.38744432e-03, 4.72389684e-02, 1.91655866e-02,
       2.49700674e-03, 1.42247391e-03, 3.33934956e-03, 3.20845355e-03,
       1.65962449e-03, 1.32909860e-02, 5.28666664e-02, 8.01245277e-04,
       5.53978263e-03, 3.43718307e-02, 7.07665872e-04, 3.00258492e-03,
       2.92412666e-03, 2.79684218e-02, 8.38567293e-04, 1.03185476e-03,
       6.90086579e-03, 6.25060429e-02, 2.39319725e-04, 3.44177543e-03,
       1.92515107e-03, 3.54879935e-04, 5.35300629e-02, 2.85882547e-02,
       3.06563478e-04, 1.02212738e-01, 2.79251848e-03, 8.79132726e-04,
       2.72277284e-02, 4.09865908e-03, 4.72887766e-03, 1.13914905e-03,
       2.49479674e-02, 1.79483950e-02, 8.02858476e-04, 1.66753711e-02,
      

In [49]:
X_train.columns.values

array(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu',
       'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'GarageArea', 'GarageQual', 'GarageCond', 'PavedDriv

In [50]:
sorted(zip(feature_importances, X_train.columns.values)) '''Utilities, Street을 제거하고 다시해보자'''

[(4.843833871028809e-06, 'Utilities'),
 (8.153723089578332e-05, 'Street'),
 (0.00015343325906248104, 'MiscFeature'),
 (0.00016529002742901716, 'MiscVal'),
 (0.0002393197249154982, 'Heating'),
 (0.00025704981189632163, '3SsnPorch'),
 (0.00030656347751180764, 'LowQualFinSF'),
 (0.0003423815023374908, 'Condition2'),
 (0.000354879934783689, 'Electrical'),
 (0.00039968136447492776, 'Alley'),
 (0.0007076658717561177, 'BsmtCond'),
 (0.0007401499365546386, 'PavedDrive'),
 (0.0008012452767877638, 'ExterCond'),
 (0.0008028584761381572, 'Functional'),
 (0.000838567292961371, 'BsmtFinType2'),
 (0.0008791327264762778, 'BsmtHalfBath'),
 (0.0010318547568995032, 'BsmtFinSF2'),
 (0.0010491746899716952, 'Fence'),
 (0.0010586998920382724, 'EnclosedPorch'),
 (0.001093397241793793, 'Condition1'),
 (0.0011391490549772264, 'KitchenAbvGr'),
 (0.0012358927996078428, 'PoolArea'),
 (0.0012726727079994222, 'GarageCond'),
 (0.0013278134958872624, 'BldgType'),
 (0.0013735612901789851, 'GarageQual'),
 (0.00138254958

In [67]:
X_train_new = X_train.drop(['Utilities', 'Street'], axis=1)
X_test_new = X_test.drop(['Utilities', 'Street'], axis=1)

In [69]:
X_test_new

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Alley,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0,2,0.555397,0.363804,1,3,3,4,0,12,...,1.818336,-0.057207,2,2,1,-0.092212,-0.038268,1.713318,8,4
1,0,3,0.604032,0.897553,1,0,3,0,0,12,...,-0.301440,-0.057207,2,4,0,19.723675,-0.038268,1.713318,8,4
2,5,3,0.263586,0.809368,1,0,3,4,0,8,...,-0.301440,-0.057207,2,2,1,-0.092212,-1.140224,1.713318,8,4
3,5,3,0.458126,0.032053,1,0,3,4,0,8,...,-0.301440,-0.057207,2,4,1,-0.092212,-0.038268,1.713318,8,4
4,11,3,-1.244106,-0.971475,1,0,1,4,0,22,...,2.242291,-0.057207,2,4,1,-0.092212,-1.874861,1.713318,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,13,4,-2.314082,-1.590785,1,3,3,4,0,10,...,-0.301440,-0.057207,2,4,1,-0.092212,-0.038268,-1.359492,8,4
1455,13,4,-2.314082,-1.599260,1,3,3,4,0,10,...,-0.301440,-0.057207,2,4,1,-0.092212,-0.772905,-1.359492,8,0
1456,0,3,4.446215,2.054445,1,3,3,4,0,11,...,-0.301440,-0.057207,2,4,1,-0.092212,1.063688,-1.359492,8,0
1457,9,3,-0.320037,0.125484,1,3,3,4,0,11,...,-0.301440,-0.057207,2,2,3,1.017478,0.329051,-1.359492,8,4


In [71]:
X_train_new.shape, y_train.shape, X_test_new.shape

((1460, 77), (1460,), (1459, 77))

In [73]:
random_forest_reg_2 = RandomForestRegressor()
random_forest_reg_2.fit(X_train_new, y_train)
randomforest_reg_2_pred = random_forest_reg_2.predict(X_test_new)

acc_random_forest_reg_2 = round(random_forest_reg_2.score(X_train_new, y_train) * 100, 2)
acc_random_forest_reg_2

98.13

In [74]:
submission_6 = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": randomforest_reg_2_pred
    }) 
submission_6.to_csv("./submission_6.csv",index=False) # 향상되지 않음 0.14805

In [75]:
param_grid = [
    {'n_estimators':[10,100,1000], 'max_features':[8,10,12,14,16]},
    {'bootstrap' :[False], 'n_estimators':[3, 10], 'max_features':[2,3,4]}, ]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train_new, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [8, 10, 12, 14, 16],
                          'n_estimators': [10, 100, 1000]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [76]:
grid_search.best_params_

{'max_features': 14, 'n_estimators': 100}

In [77]:
random_forest_reg_2 = RandomForestRegressor(max_features=14, n_estimators=100)
random_forest_reg_2.fit(X_train_new, y_train)
randomforest_reg_2_pred = random_forest_reg_2.predict(X_test_new)

acc_random_forest_reg_2 = round(random_forest_reg_2.score(X_train_new, y_train) * 100, 2)
acc_random_forest_reg_2

98.1

In [79]:
submission_7 = pd.DataFrame({
        "Id": test["Id"],
        "SalePrice": randomforest_reg_2_pred
    }) 
submission_7.to_csv("./submission_7.csv",index=False) # 향상되지 않음 0.15046