In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV

# Importing regressors
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Regression metrics
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Exploring the Dataset

In [None]:
# Read the competition's train and test CSV files into DataFrames,
# then preview the first 10 training rows.
train_path = "/kaggle/input/playground-series-s3e1/train.csv"
test_path = "/kaggle/input/playground-series-s3e1/test.csv"

housing = pd.read_csv(train_path)
X_test = pd.read_csv(test_path)
housing.head(10)

In [None]:
# Preview the first 10 rows of the test data loaded from test.csv.
X_test.head(10)

In [None]:
# Summarise the training frame's columns (dtypes and non-null counts).
housing.info()

In [None]:
# Summarise the test frame's columns (dtypes and non-null counts).
X_test.info()

In [None]:
# Descriptive statistics for the numeric columns of the training data.
housing.describe()

In [None]:
# Descriptive statistics for the numeric columns of the test data.
X_test.describe()

In [None]:
# Checking for duplicate rows in the training data.
# `duplicated()` flags each row that repeats an earlier one, so the first
# occurrence of a repeated row is not part of `duplicate_rows`.
duplicate_rows = housing[housing.duplicated()]

# Report the count explicitly; the frame is left as the cell's last expression
# so the notebook renders it as a table rather than a plain-text dump.
print(f"Duplicate rows in training data: {len(duplicate_rows)}")
duplicate_rows

In [None]:
# Checking for duplicate rows in the test data.
# `duplicated()` flags each row that repeats an earlier one, so the first
# occurrence of a repeated row is not part of `duplicate_rows2`.
duplicate_rows2 = X_test[X_test.duplicated()]

# Report the count explicitly; the frame is left as the cell's last expression
# so the notebook renders it as a table rather than a plain-text dump.
print(f"Duplicate rows in test data: {len(duplicate_rows2)}")
duplicate_rows2

In [None]:
# Tally the null entries in every column of the training data
missing_val = housing.isna().sum()
print(missing_val)

In [None]:
# Tally the null entries in every column of the test data
missing_val2 = X_test.isna().sum()
print(missing_val2)

In [None]:
# Dropping the Id column from the training data (it is not one of the
# feature columns selected for the model later in the notebook).
# errors="ignore" keeps this cell idempotent: `housing` is reassigned here, so
# re-running the cell would otherwise raise KeyError because "id" is already gone.
housing = housing.drop(columns=["id"], errors="ignore")
housing.head()

# Visualising the dataset

In [None]:
# One 50-bin histogram per column of the training data
housing.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Geographic scatter of the training rows: marker size scales with Population
# and marker colour encodes MedHouseVal on the "jet" colormap.
bubble_sizes = housing["Population"] / 100
housing.plot(
    kind="scatter",
    x="Longitude",
    y="Latitude",
    s=bubble_sizes,
    c="MedHouseVal",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    alpha=0.4,
    label="Population",
    figsize=(10, 7),
)
plt.legend()

In [None]:
# Geographic scatter of the test rows; marker size scales with Population.
# No `c=` column is passed here, so the previous `cmap`/`colorbar=True`
# arguments had nothing to map: the colormap was ignored and the colorbar drawn
# beside the plot did not describe the data. Both arguments are dropped.
X_test.plot(
    kind="scatter",
    x="Longitude",
    y="Latitude",
    alpha=0.4,
    s=X_test["Population"] / 100,
    label="Population",
    figsize=(10, 7),
)
plt.legend()

In [None]:
# Pairwise correlations between the columns of `housing`, drawn as an
# annotated heatmap with the column names on both axes.
corr = housing.corr()
plt.figure(figsize=(12, 10))
axis_labels = corr.columns
sns.heatmap(
    corr,
    annot=True,
    xticklabels=axis_labels,
    yticklabels=axis_labels,
)

In [None]:
# Correlation of every column with the target MedHouseVal, strongest positive first.
corr["MedHouseVal"].sort_values(ascending=False)

In [None]:
# Pairwise scatter plots for a hand-picked subset of the columns
columns = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'MedHouseVal',
]
df_subset = housing.loc[:, columns]
pd.plotting.scatter_matrix(df_subset, figsize=(20, 20))
plt.show()

In [None]:
# Scatter of MedInc against the target; the low alpha keeps dense regions readable.
housing.plot(kind="scatter", x="MedInc", y="MedHouseVal", alpha=0.1, figsize=(8,8))

In [None]:
# Number of non-null values in each column of the training data.
housing.count()

# Splitting the dataset into training and validation sets

In [None]:
# Feature columns fed to the model; the target is MedHouseVal
feature_columns = [
    'MedInc', 'Latitude', 'Longitude', 'AveOccup',
    'AveRooms', 'HouseAge', 'AveBedrms', 'Population',
]
X = housing.loc[:, feature_columns]
y = housing['MedHouseVal']

# Hold out 20% of the rows for validation (fixed seed so the split is repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Show the shape of each of the four pieces
print(*(part.shape for part in (X_train, X_val, y_train, y_val)))

# Training and evaluating the model

In [None]:
# Baseline model: CatBoostRegressor with a fixed seed and training output silenced
cbr = CatBoostRegressor(verbose=False, random_state=1)
cbr.fit(X_train, y_train)

# Score the baseline on the validation split using RMSE
y_pred = cbr.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
rmse = np.sqrt(mse)
print(f'Root mean squared error: {rmse:.2f}')


# Hyperparameter tuning

In [None]:
# Define the hyperparameter grid (3 x 3 x 3 = 27 candidates, each fitted 5 times by cv=5)
param_grid = {
    'iterations': [1000, 1500, 2000],
    'learning_rate': [0.03, 0.1, 0.5],
    'depth': [ 4, 5, 6],
    'verbose': [False]
}

# Create the grid search object.
# Without `scoring`, GridSearchCV ranks candidates by the estimator's default
# `score` (R^2 for regressors). RMSE is requested explicitly so the search
# optimises the same metric this notebook reports on the validation set.
# The scorer is negated ("neg_") because GridSearchCV always maximises.
grid_search = GridSearchCV(
    estimator=cbr,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
)

# Fit the grid search object to the training data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and their cross-validated RMSE
print(grid_search.best_params_)
print(f'Best cross-validated RMSE: {-grid_search.best_score_:.4f}')

In [None]:
# Refit CatBoost with hard-coded hyperparameters and score it on the validation split.
# NOTE(review): these values are presumably copied from a grid-search run; they are
# not read from `grid_search.best_params_`, so update them if the search result changes.
# random_state=1 matches the baseline `cbr`, making this fit repeatable and the
# two RMSE figures directly comparable.
cbr_best = CatBoostRegressor(
    depth=6,
    iterations=2000,
    learning_rate=0.03,
    random_state=1,
    verbose=False,
)
cbr_best.fit(X_train, y_train)
y_pred = cbr_best.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f'Root mean squared error: {rmse:.2f}')

# Predicting on test set and generating output

In [None]:
# Select the model's feature columns from the test data.
# The columns are taken from `X` (the training feature frame) instead of a
# second hard-coded list, so the test features always match the training
# features in both content and order.
X2 = X_test[X.columns]
X2.head()

In [None]:
# Predict the target for the test features with the tuned model.
# Note: despite its name, `y_test` holds model predictions, not ground-truth labels.
y_test = cbr_best.predict(X2)
y_test

In [None]:
# Pair each test id with its predicted MedHouseVal and write the table to CSV
# (without the DataFrame index).
output = pd.DataFrame({'id': X_test['id'], 'MedHouseVal': y_test})
output.to_csv('Housing_price.csv', index=False)
print("Your submission was successfully saved")