<a href="https://www.kaggle.com/code/misaelcribeiro/tpot-score-advanced-house-prediction?scriptVersionId=102058764" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<h3>How good is TPOT for the House Prices - Advanced Regression Techniques competition? Let's see!</h3>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))
        

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

**Loading our data...**

In [None]:
# Read both competition CSVs, using the 'Id' column as the row index.
train_df = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
    index_col='Id',
)
test_df = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    index_col='Id',
)

**Checking basic info...**

In [None]:
# Print the training frame's summary (columns, non-null counts, dtypes).
train_df.info()

In [None]:
# Set pandas' `display.min_rows` option to 115 for the printed Series below.
pd.set_option('display.min_rows', 115)

# Per-column missing-value counts, largest first: training set, then test set.
for frame in (train_df, test_df):
    null_counts = frame.isna().sum()
    print(null_counts.sort_values(ascending=False))

<h3>Let's impute missing data</h3>

In [None]:
# Columns that were picked out as containing missing values.
list_null_features = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage', 'GarageYrBlt', 'GarageCond',
                     'GarageType', 'GarageFinish', 'GarageQual', 'BsmtFinType1', 'MasVnrArea', 'MasVnrType', 'Electrical']

# Only the non-object (numeric) columns from that list are mean-imputed here.
list_to_replace = list(train_df[list_null_features].select_dtypes(exclude='object').columns)

print(list_to_replace)

for feature in list_to_replace:
    # Compute the training mean once and use it for BOTH frames, so no
    # statistic is taken from the test set.
    train_mean = train_df[feature].mean()
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the in-place call acts on a temporary column selection, which warns on
    # pandas 2.x and silently leaves the frame unchanged under Copy-on-Write.
    train_df[feature] = train_df[feature].fillna(train_mean)
    test_df[feature] = test_df[feature].fillna(train_mean)
    print(f'Replacing {feature}')

<h3>Ordinal Encoder</h3>

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Every object-dtype column of the training frame is treated as categorical.
object_columns = list(train_df.select_dtypes(include=['object']).columns)

# Categories that were not seen while fitting are encoded as NaN instead of
# raising an error (requires scikit-learn >= 0.24).
ord_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)

# Guard keeps the cell re-runnable: after one run no object columns remain.
if object_columns:
    # Fit on the training data ONLY and reuse that mapping for the test data.
    # Re-fitting on the test set would give the same category different integer
    # codes in the two frames, so a model trained on one could not interpret
    # the other correctly.
    train_df[object_columns] = ord_encoder.fit_transform(train_df[object_columns])
    test_df[object_columns] = ord_encoder.transform(test_df[object_columns])

In [None]:
# Replace any NaN left in the categorical columns with 0, in both frames.
for frame in (train_df, test_df):
    frame[object_columns] = frame[object_columns].fillna(0)

<h3>Splitting our data</h3>

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

# Separate the predictors from the 'SalePrice' target; train_df itself is left untouched.
X = train_df.drop(columns='SalePrice')
y = train_df['SalePrice'].copy()

# Hold out part of the training data for validation; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23)

<h3>Let's set an early stop of 7: if the model doesn't improve for 7 consecutive generations, TPOT will stop running. This will take a long time to run!</h3>

In [None]:
from tpot import TPOTRegressor

# Evolutionary search over candidate pipelines. Candidates are scored with
# 5-fold cross-validated negative MSE, using all available cores (n_jobs=-1).
# random_state is fixed (same seed as the train/test split) so that a
# "Restart & Run All" repeats the same search instead of a different one each time.
pipeline_optimizer  = TPOTRegressor(generations=100, population_size=100,
                         offspring_size=None, mutation_rate=0.9,
                         crossover_rate=0.1,
                         scoring='neg_mean_squared_error', cv=5,
                         subsample=1.0, n_jobs=-1,
                         max_time_mins=None, max_eval_time_mins=5,
                         random_state=23, config_dict=None,
                         template=None,
                         warm_start=True,
                         memory=None,
                         use_dask=False,
                         periodic_checkpoint_folder=None,
                         early_stop=7,  # stop after 7 generations without improvement
                         verbosity=2,
                         disable_update_check=False)


# Run the search on the training split, then report the score on the held-out split.
pipeline_optimizer.fit(X_train, y_train)
print(pipeline_optimizer.score(X_test, y_test))

<h3>Exporting the file to see the optimized pipeline that TPOT made for us</h3>

In [None]:
# Save the optimized pipeline TPOT found to a file so it can be inspected.
export_path = 'tpot.txt'
pipeline_optimizer.export(export_path)

<h3>Here is the TPOT Optimized pipeline for our dataset. Let's run this model and submit to see how it goes.</h3>

In [None]:
# Hand-cleaned version of the pipeline TPOT exported (the raw export looks slightly different).

import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor


# Average CV score on the training set was: -689066004.1415527
# The pipeline has three stages: two stacked estimators followed by the final XGBoost regressor.
svr_stack = StackingEstimator(
    estimator=LinearSVR(C=0.1, dual=True, epsilon=0.01, loss="squared_epsilon_insensitive", tol=1e-05)
)
enet_stack = StackingEstimator(
    estimator=ElasticNetCV(l1_ratio=0.6000000000000001, tol=0.1)
)
xgb_model = XGBRegressor(
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=2,
    n_estimators=100,
    n_jobs=1,
    objective="reg:squarederror",
    subsample=0.9000000000000001,
    verbosity=0,
)
exported_pipeline = make_pipeline(svr_stack, enet_stack, xgb_model)

# Fit on the complete training data (X, y), then predict for the competition test set.
exported_pipeline.fit(X, y)
results = exported_pipeline.predict(test_df)

<h3>Exporting the submission!!</h3>

In [None]:
# Build the submission file: one predicted SalePrice per test-set Id.
# The previous rename of a "target" column was dropped: test_df has no such
# column, so it changed nothing and only rebound test_df to a copy.
Final = pd.DataFrame({"SalePrice": results}, index=test_df.index.astype(int))
Final.index.name = "Id"  # written as the first CSV column header

Final.to_csv('Final_TPOT.csv')

<h3>We scored 0.12741 (position 834) in the House Prices - Advanced Regression Techniques competition! It is a great start!</h3>