In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found below the Kaggle input
# directory, then echo the paths one per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
# Import the training and testing data

train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
itest = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

# Split the training frame by position: the last column is treated as the
# target and every column before it as a feature.
# NOTE(review): assumes the target is the final column of train.csv - confirm.
X_train = train.iloc[:, :-1]
# to_frame() keeps the target as a one-column DataFrame rather than a Series.
y_train = train.iloc[:, -1].to_frame()

In [3]:
def get_numerical(df):
    """Return a copy of `df` restricted to its int64 and float64 columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the int64/float64 columns, in their original
        order and with the original row index. If no column qualifies the
        result has zero columns but still carries every row of `df`.
    """
    # One vectorised selection instead of inserting columns one by one;
    # .copy() guarantees later in-place edits never touch the caller's frame.
    return df.select_dtypes(include=['int64', 'float64']).copy()

# Keep only the numeric columns of the training features and of the raw test set.
X_train, test = map(get_numerical, (X_train, itest))

In [4]:
# Summarise the numeric training features: column names, non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 37 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   OverallQual    1460 non-null   int64  
 5   OverallCond    1460 non-null   int64  
 6   YearBuilt      1460 non-null   int64  
 7   YearRemodAdd   1460 non-null   int64  
 8   MasVnrArea     1452 non-null   float64
 9   BsmtFinSF1     1460 non-null   int64  
 10  BsmtFinSF2     1460 non-null   int64  
 11  BsmtUnfSF      1460 non-null   int64  
 12  TotalBsmtSF    1460 non-null   int64  
 13  1stFlrSF       1460 non-null   int64  
 14  2ndFlrSF       1460 non-null   int64  
 15  LowQualFinSF   1460 non-null   int64  
 16  GrLivArea      1460 non-null   int64  
 17  BsmtFullBath   1460 non-null   int64  
 18  BsmtHalf

In [5]:
# Numeric columns that are excluded from the model inputs.
drop_list = ['MSSubClass', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'MoSold']

# Remove the same columns from both frames so their layouts stay aligned.
X_train = X_train.drop(columns=drop_list)
test = test.drop(columns=drop_list)

In [6]:
# Handling 'missing' data

# Impute every missing value with 0 in both frames.
# DataFrame.fillna returns a new frame, so the result is assigned back.
# The per-column form `frame[col].fillna(0, inplace=True)` acts on a temporary
# selection: it is deprecated in pandas 2.x and leaves the frame unchanged
# when Copy-on-Write is enabled, which would let NaNs reach the scaler.
X_train = X_train.fillna(0)
test = test.fillna(0)

In [7]:
# Inspect the test features after imputation: non-null counts and dtypes per column.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 32 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   LotFrontage    1459 non-null   float64
 2   LotArea        1459 non-null   int64  
 3   OverallQual    1459 non-null   int64  
 4   OverallCond    1459 non-null   int64  
 5   YearBuilt      1459 non-null   int64  
 6   YearRemodAdd   1459 non-null   int64  
 7   MasVnrArea     1459 non-null   float64
 8   TotalBsmtSF    1459 non-null   float64
 9   1stFlrSF       1459 non-null   int64  
 10  2ndFlrSF       1459 non-null   int64  
 11  LowQualFinSF   1459 non-null   int64  
 12  GrLivArea      1459 non-null   int64  
 13  BsmtFullBath   1459 non-null   float64
 14  BsmtHalfBath   1459 non-null   float64
 15  FullBath       1459 non-null   int64  
 16  HalfBath       1459 non-null   int64  
 17  BedroomAbvGr   1459 non-null   int64  
 18  KitchenA

In [8]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and standard deviation from the training features
# only, then apply that same scaling to both the training and the test features.
# NOTE(review): 'Id' has not been dropped at this point, so the row identifier
# is scaled and passed on as a feature - confirm this is intended.
scaler = StandardScaler().fit(X_train)
X_train, test = scaler.transform(X_train), scaler.transform(test)

In [9]:
from sklearn.decomposition import KernelPCA

# Project the scaled features onto 6 RBF kernel principal components.
# The projection is fitted on the training features and reused for the test set.
kpca = KernelPCA(kernel='rbf', n_components=6)
X_train, test = kpca.fit_transform(X_train), kpca.transform(test)

In [10]:
# Shape of the transformed test matrix: (rows, kernel-PCA components).
test.shape

(1459, 6)

In [11]:
# Variance of each retained kernel-PCA component across the training rows.
explained_variance = X_train.var(axis=0)
# Each share is relative to the retained components only, so the cumulative
# sum printed below always ends at 1 regardless of how much variance was discarded.
explained_variance_ratio = explained_variance / explained_variance.sum()
print(explained_variance_ratio)
print(explained_variance_ratio.cumsum())

[0.33112059 0.22310399 0.18493216 0.09655764 0.08528745 0.07899817]
[0.33112059 0.55422457 0.73915673 0.83571438 0.92100183 1.        ]


In [12]:
# Disabled hold-out split, kept as a bare string literal: nothing here is
# executed, and the cell simply echoes the text as its output. With it disabled,
# no X_test / y_test exist and the model is later fitted on all training rows.
"""from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.2, random_state = 0)"""

'from sklearn.model_selection import train_test_split\nX_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.2, random_state = 0)'

In [13]:
from xgboost import XGBRegressor

# XGBoost regressor created with the library's default hyperparameters.
regressor = XGBRegressor()
# Fit on every training row; no validation data is held out because the
# train_test_split snippet above is disabled.
regressor.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [14]:
from sklearn.metrics import accuracy_score


# In-sample predictions: the model is applied to the same X_train it was fitted on.
# NOTE(review): these are not predictions for the competition test features (`test`).
y_pred = regressor.predict(X_train)


# Disabled evaluation snippet kept as a bare string literal; it is not executed
# and, as the last expression of the cell, its text is echoed as the output.
# NOTE(review): accuracy_score is a classification metric and y_test is never
# defined, so this snippet would need rework before being re-enabled.
"""predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test.values[:, 0], predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))"""

'predictions = [round(value) for value in y_pred]\n# evaluate predictions\naccuracy = accuracy_score(y_test.values[:, 0], predictions)\nprint("Accuracy: %.2f%%" % (accuracy * 100.0))'

In [15]:
# Echo the transformed test array (6 kernel-PCA components per row).
test

array([[ 0.3134728 ,  0.04080175, -0.0722223 ,  0.00134294,  0.06674517,
        -0.03932772],
       [-0.02488274, -0.38763212, -0.09070261,  0.12765423, -0.05706915,
         0.02538209],
       [-0.1638187 ,  0.17134803,  0.28777561,  0.04831689,  0.09189091,
         0.00864172],
       ...,
       [-0.01714351, -0.24947814, -0.11287854,  0.01841256,  0.01530754,
         0.14958643],
       [ 0.04036815, -0.35529018, -0.07922728,  0.13976762, -0.13326523,
         0.08449599],
       [-0.21661201, -0.18240067,  0.10968573, -0.00501448, -0.09574609,
         0.22765088]])

In [16]:
# Wrap the prediction array in a single-column DataFrame; the column gets the default label 0.
y_pred = pd.DataFrame(y_pred)

In [17]:
# Display the prediction frame.
y_pred

Unnamed: 0,0
0,215432.906250
1,177154.281250
2,219153.171875
3,137605.953125
4,251079.921875
...,...
1455,174324.937500
1456,210581.609375
1457,253208.437500
1458,140974.968750


In [18]:
# Build the submission from predictions on the competition test features.
# The y_pred computed earlier holds in-sample predictions for X_train
# (1460 rows); labelling those with ids 1461-2920 would write training-set
# predictions into the submission, one row more than the 1459-row test set.
y_pred = pd.DataFrame(regressor.predict(test))
# Take the ids from the raw test file rather than a hard-coded range, so each
# prediction is paired with the row it was computed from.
y_pred['Id'] = itest['Id'].to_numpy()

In [19]:
# Use the house Id as the row index so it is written as the first CSV column.
y_pred = y_pred.set_index('Id')

In [20]:
# Give the prediction column (default label 0) its submission name.
y_pred = y_pred.rename(columns={0: 'SalePrice'})

In [21]:
# Write the submission file to the working directory; the Id index is written as the first column.
y_pred.to_csv('predictions.csv')