In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from mlxtend.regressor import StackingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.linear_model import BayesianRidge
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, name) for name in filenames):
        print(path)



In [None]:
# Read the train and test CSV files into DataFrames.
train_dataset = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/train.csv'
)
test_dataset = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/test.csv'
)

In [None]:
# Preview the first rows of the training frame (shown as the cell's output).
train_dataset.head()

In [None]:
# Preview the first rows of the test frame (shown as the cell's output).
test_dataset.head()

In [None]:

# Total number of missing cells in each frame. Each total is taken from its
# own frame: test_nans must read test_dataset, otherwise both printed numbers
# are just the training total.
train_nans = train_dataset.isna().sum().sum()
test_nans = test_dataset.isna().sum().sum()
print(f'Train NaNs={train_nans}')
print(f'test NaNs={test_nans}')

In [None]:
# Separate the SalePrice target from the remaining training columns.
x_train = train_dataset.drop(columns='SalePrice')
y_train = train_dataset['SalePrice']




In [None]:
# Partition the training columns by dtype: "object" columns on one side,
# every other dtype on the other.
train_dtypes = x_train.dtypes
numeric_feats = train_dtypes[train_dtypes != "object"].index
categoric_feats = train_dtypes[train_dtypes == "object"].index

# Within each group, keep the names of the columns that have at least one null.
numerical_na_cols = [
    col
    for col, n_missing in x_train[numeric_feats].isnull().sum().items()
    if n_missing != 0
]
categorical_na_cols = [
    col
    for col, n_missing in x_train[categoric_feats].isnull().sum().items()
    if n_missing != 0
]

print(numerical_na_cols)
print(categorical_na_cols)

In [None]:
# Same dtype partition as for the training frame, applied to the test frame.
test_dtypes = test_dataset.dtypes
numeric_feats2 = test_dtypes[test_dtypes != "object"].index
categoric_feats2 = test_dtypes[test_dtypes == "object"].index

# Names of the test columns in each group that have at least one null.
numerical_na_cols2 = [
    col
    for col, n_missing in test_dataset[numeric_feats2].isnull().sum().items()
    if n_missing != 0
]
categorical_na_cols2 = [
    col
    for col, n_missing in test_dataset[categoric_feats2].isnull().sum().items()
    if n_missing != 0
]


In [None]:
# Mean-impute the numeric columns that contain missing values.
#
# The fill values are always learned from the training frame and only applied
# to the test frame. Fitting on the test frame would fill its gaps with
# test-set means, so the same feature would be imputed with different values
# at training time and at prediction time.
my_imputer = SimpleImputer(strategy='mean')
x_train_imput = my_imputer.fit_transform(x_train[numerical_na_cols])
x_train[numerical_na_cols] = x_train_imput

# The test frame has its own list of numeric columns with gaps, which need not
# match the training list, so a second imputer is fitted on the training
# values of exactly those columns and then applied to the test frame.
test_imputer = SimpleImputer(strategy='mean')
test_imputer.fit(x_train[numerical_na_cols2])
x_test_imput = test_imputer.transform(test_dataset[numerical_na_cols2])
test_dataset[numerical_na_cols2] = x_test_imput

In [None]:
# Apply the same two clean-ups to both frames.
for frame in (x_train, test_dataset):
    # Round GarageYrBlt to zero decimal places.
    frame['GarageYrBlt'] = frame['GarageYrBlt'].round()
    # Rename the 'Brk Cmn' label in Exterior2nd to 'BrkComm'.
    frame['Exterior2nd'] = frame['Exterior2nd'].replace({'Brk Cmn': 'BrkComm'})



In [None]:
# Replacement label for nulls in each of four columns.
values = {
    "Alley": 'No Alley Access',
    "PoolQC": "No Pool",
    "Fence": "No Fence",
    "MiscFeature": "None",
}

# Fill both frames in place using the per-column mapping.
for frame in (x_train, test_dataset):
    frame.fillna(value=values, inplace=True)

In [None]:
# Columns that still contain nulls in the training frame are removed from
# both frames.
drop_cols = x_train.columns[x_train.isnull().any()].tolist()

x_train = x_train.drop(columns=drop_cols)
test_dataset = test_dataset.drop(columns=drop_cols)

# Report the resulting shapes, test frame first.
print(test_dataset.shape, x_train.shape, sep='\n')


In [None]:

# Min-max scale the target as a single-column 2-D array. The scaled array is
# only the cell's displayed value; it is not stored in a variable, and
# y_train_norm itself holds the unscaled, reshaped target.
scaler = MinMaxScaler()
y_train_norm = y_train.to_numpy().reshape(-1, 1)
scaler.fit_transform(y_train_norm)

In [None]:
# List, per frame, every column that still has nulls together with its count.
print('Train columns with missing data:')
train_null_counts = x_train.isnull().sum().loc[lambda counts: counts != 0]
for column, missing in train_null_counts.to_dict().items():
    print(f"{column}:{missing}")

print('---------------------------------')
print('Test columns with missing data')
test_null_counts = test_dataset.isnull().sum().loc[lambda counts: counts != 0]
for column, missing in test_null_counts.to_dict().items():
    print(f"{column}:{missing}")


In [None]:

# Print and collect the names of the test columns that still contain nulls.
test_missing_cols = []
for column, missing in test_dataset.isnull().sum().to_dict().items():
    if missing == 0:
        continue
    print(f"{column}:{missing}")
    test_missing_cols.append(column)

In [None]:
# Object-dtype test columns that still contain nulls.
cat_test_missing = [col for col in test_missing_cols if test_dataset[col].dtype == 'object']
my_imputer_cat = SimpleImputer(strategy='most_frequent')

# SimpleImputer rejects an empty column selection, so skip when nothing is missing.
if cat_test_missing:
    # Learn each column's most frequent label from the training frame and only
    # apply it to the test frame. Fitting on the test frame would fill gaps
    # with test-set modes, which may differ from what the model saw in training.
    my_imputer_cat.fit(x_train[cat_test_missing])
    test_imputed_values = my_imputer_cat.transform(test_dataset[cat_test_missing])
    test_dataset[cat_test_missing] = test_imputed_values

In [None]:
# Names of the object-dtype columns in each frame, in column order.
categorical_cols = x_train.columns[x_train.dtypes == 'object'].tolist()
categorical_cols2 = test_dataset.columns[test_dataset.dtypes == 'object'].tolist()



In [None]:
# Shapes of both frames, test frame first, one per line.
print(test_dataset.shape, x_train.shape, sep='\n')

In [None]:
#ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), categorical_cols)], remainder='passthrough')
#x_train=ct.fit_transform(x_train)
#test_dataset=ct.fit_transform(test_dataset)

# Show the categorical column names of the training and test frames, one list per line.
print(categorical_cols, categorical_cols2, sep='\n')

In [None]:

# Integer-encode the categorical columns.
#
# Each column gets ONE encoder, fitted on the labels seen in both frames, and
# that same encoder transforms both. LabelEncoder assigns codes by the sorted
# order of the labels it was fitted on, so fitting train and test separately
# gives the same label different codes whenever one frame lacks a category the
# other has. The model would then read test rows with shifted category codes.
for col in categorical_cols:
    le = LabelEncoder()
    if col in categorical_cols2:
        combined_labels = pd.concat([x_train[col], test_dataset[col]]).unique().tolist()
        le.fit(combined_labels)
        test_dataset[col] = le.transform(test_dataset[col])
    else:
        # Not categorical in the test frame: encode from the training labels only.
        le.fit(x_train[col].unique().tolist())
    x_train[col] = le.transform(x_train[col])

# Columns that are categorical only in the test frame have no training
# counterpart to share codes with, so they get a fresh encoder of their own.
for col in categorical_cols2:
    if col in categorical_cols:
        continue
    le = LabelEncoder()
    le.fit(test_dataset[col].unique().tolist())
    test_dataset[col] = le.transform(test_dataset[col])

In [None]:
# Shapes of both frames after encoding, test frame first, one per line.
print(test_dataset.shape, x_train.shape, sep='\n')

In [None]:
#regressors=[('RF',RandomForestRegressor(n_estimators=1000)),('SVR',SVR(kernel='rbf')),('LR',LinearRegression()),]
#regressor =XGBRegressor(n_estimators=1500, learning_rate=0.02, max_depth=6, subsample=0.7)


In [None]:
# Stacked ensemble: four base regressors combined by a CatBoost meta-regressor.
# use_features_in_secondary=True is passed so the meta-regressor is given the
# original features as well as the base predictions (per mlxtend's docs).
base_regressors = (
    CatBoostRegressor(),
    LinearRegression(),
    BayesianRidge(),
    GradientBoostingRegressor(),
)
regressor = StackingRegressor(
    regressors=base_regressors,
    meta_regressor=CatBoostRegressor(),
    use_features_in_secondary=True,
)

In [None]:
# Fit the stacked regressor on the prepared training features and the SalePrice target.
regressor.fit(x_train,y_train)

In [None]:
# Predict with the fitted regressor on the prepared test frame.
y_pred=regressor.predict(test_dataset)

In [None]:
# Pair each test Id (as text) with its predicted SalePrice, index the result
# by Id, and write it to submission.csv.
y_pred = pd.DataFrame(y_pred, columns=['SalePrice'])
sub = pd.concat([test_dataset['Id'].astype(str), y_pred], axis=1).set_index('Id')
sub.to_csv("submission.csv")
