<a href="https://colab.research.google.com/github/JoelWekesa/Data-Analysis/blob/main/housingpricespca.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor, BaggingRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from google.colab import drive

In [2]:
pd.set_option("display.max_rows", 200)

In [3]:
drive.mount("/content/datasets")

Drive already mounted at /content/datasets; to attempt to forcibly remount, call drive.mount("/content/datasets", force_remount=True).


In [4]:
path = "/content/datasets/MyDrive/datasets/house-prices-advanced-regression-techniques/train.csv"

In [5]:
df = pd.read_csv(path)

In [6]:
to_drop = []
for i in df.columns:
  if df[i].isna().sum() > 500:
    to_drop.append(i)

In [7]:
df = df.drop(columns = to_drop)

In [8]:
encoder = LabelEncoder()
imputer = SimpleImputer()
scaler = StandardScaler()
pca = PCA(n_components=6)

In [9]:
for i in df.columns:
  if df[i].dtype != "int" and df[i].dtype != "float":
    df[i] = df[i].fillna(df[i].value_counts().idxmax())
    df[i] = encoder.fit_transform(df[i])

In [10]:
imputer = SimpleImputer()

In [11]:
X = df.drop(columns = ["SalePrice"])
X = imputer.fit_transform(X)

In [12]:
y = df["SalePrice"]

In [13]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.1, random_state=42)

In [14]:
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)
train_X = pca.fit_transform(train_X)
test_X = pca.transform(test_X)

In [15]:
rfc = RandomForestRegressor()

In [16]:
rfc.fit(train_X, train_y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [17]:
rfc_pred = rfc.predict(test_X)

In [18]:
r2_score(test_y, rfc_pred)

0.899845536129981

In [19]:
boost = GradientBoostingRegressor()

In [20]:
boost.fit(train_X, train_y)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [21]:
boost_pred = boost.predict(test_X)

In [22]:
r2_score(test_y, boost_pred)

0.8926335367879823

In [23]:
xboost = XGBRegressor()

In [24]:
xboost.fit(train_X, train_y)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [25]:
xboost_pred = xboost.predict(test_X)

In [26]:
r2_score(test_y, xboost_pred)

0.8991741336469591

In [27]:
estimators = [("rfc", rfc), ("boost", boost)]

In [28]:
stacked = StackingRegressor(estimators=estimators, final_estimator=xboost)

In [29]:
stacked.fit(train_X, train_y)



StackingRegressor(cv=None,
                  estimators=[('rfc',
                               RandomForestRegressor(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     criterion='mse',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                    

In [30]:
stack_pred = stacked.predict(test_X)

In [31]:
r2_score(test_y, stack_pred)

0.8794637991136866

In [32]:
estims = [("rfc", rfc), ("boost", boost), ("xboost", xboost)]

In [33]:
voted = VotingRegressor(estimators=estims)

In [34]:
voted.fit(train_X, train_y)



VotingRegressor(estimators=[('rfc',
                             RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                                    

In [35]:
voted_pred = voted.predict(test_X)

In [36]:
r2_score(test_y, voted_pred)

0.8979014734091381

In [37]:
path2 = "/content/datasets/MyDrive/datasets/house-prices-advanced-regression-techniques/test.csv"

In [38]:
tdf = pd.read_csv(path2)

In [39]:
to_drop = []
for i in tdf.columns:
  if tdf[i].isna().sum() > 500:
    to_drop.append(i)

In [40]:
tdf = tdf.drop(columns = to_drop)

In [41]:
for i in tdf.columns:
  if tdf[i].dtype != "int" and tdf[i].dtype != "float":
    tdf[i] = tdf[i].fillna(tdf[i].value_counts().idxmax())
    tdf[i] = encoder.fit_transform(tdf[i])

In [42]:
tdf = imputer.fit_transform(tdf)

In [43]:
tdf = scaler.transform(tdf)

In [44]:
tdf = pca.transform(tdf)

In [45]:
predictions = voted.predict(tdf)

In [46]:
predictions

array([126795.21018301, 158390.63978601, 186341.20453251, ...,
       178452.42177623, 129511.06359205, 249047.38791488])

In [47]:
kaggle = pd.read_csv(path2)[["Id"]]

In [48]:
kaggle.insert(1, "SalePrice", predictions)

In [49]:
kaggle.to_csv("predictions.csv", index=False)