# Data Processing


<ul>
  <p style="color:black; font-size:36px;">Goals</p>
  <li><p style="color:red; font-size:18px;">We possess a dataset with details on sale prices.</p></li>
  <li><p style="color:orange; font-size:18px;">Our goal is to create a model that forecasts sale prices for new data.</p></li>
  <li><p style="color:purple; font-size:18px;">This task involves examining the current data to uncover patterns and relationships that can guide our predictions.</p></li>
  <li><p style="color:green; font-size:18px;">By training our model on this dataset, we strive to improve its accuracy and dependability for future sales forecasts.</p></li>
  <li><p style="color:blue; font-size:18px;">Leveraging various features from the dataset, we intend to build a strong predictive tool to aid in making informed pricing decisions.</p></li>
</ul>


## Import libraries

In [None]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import pandas as pd
import seaborn as sns
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the dataset


In [None]:
# Location of the Kaggle training split, relative to the notebook's working directory.
train_file_path = "../input/house-prices-advanced-regression-techniques/train.csv"

# Parse the CSV into the DataFrame that the rest of the notebook works on.
dataset = pd.read_csv(filepath_or_buffer=train_file_path)

## What does the training dataset look like?

In [None]:
# Report the (rows, columns) shape first, then preview the first five records.
shape_message = "Train dataset shape is {}".format(dataset.shape)
print(shape_message)
dataset.head()

Remove the "Id" column from the dataset: it is only a row identifier, not a feature.

In [None]:
# Drop the "Id" column so the identifier is not passed to the model as a feature.
# `errors='ignore'` keeps this cell idempotent: `dataset` is reassigned from
# itself, so without it a second run of the cell raises KeyError because "Id"
# has already been removed.
dataset = dataset.drop(columns='Id', errors='ignore')
dataset.head(5)

In [None]:
# Histogram of the target column, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(9, 8))
sale_prices = dataset['SalePrice'].dropna()  # Drop NaN values for plotting
sns.histplot(sale_prices, color='purple', bins=100, alpha=0.4, ax=ax)
ax.set_title('Distribution of Sale Prices')
ax.set_xlabel('Sale Price')
ax.set_ylabel('Frequency')
plt.show()

<p style="font-size:18px;">*** &#9733;&#9733;&#9733;In the world of real estate, every home has a story to tell, and every house price narrates a unique tale of its own. Let’s embark on a journey together to explore the distribution of house prices, uncovering how each price reflects the lives, dreams, and efforts of the people who call these houses their homes. &#9733;&#9733;&#9733; *** &#127769;</p>


In [None]:
from IPython.display import display, HTML

# Render a red caption as HTML, then print the summary statistics of the target.
html = '<p style="color:red; font-size:18px;">**Data description**</p>'
display(HTML(html))
sale_price_summary = dataset['SalePrice'].describe()
print(sale_price_summary)

<p style="font-size:18px;">Let's dive into exploring the distribution of our numerical features.
    To do this, we'll start by listing all the data types in our dataset and then cherry-pick the numerical ones.</p>

## Distribution of our numerical features


In [None]:
# Collapse the per-column dtypes into the distinct dtypes present in the data.
distinct_dtypes = set(dataset.dtypes)
list(distinct_dtypes)

In [None]:
# Keep only the float64 and int64 columns for the numeric analysis.
numeric_dtypes = ['float64', 'int64']
df_num = dataset.select_dtypes(include=numeric_dtypes)
df_num.head()

# Plot the heatmap to see the correlation between the variables

In [None]:

# Pairwise correlations between all numeric columns. `DataFrame.corr` excludes
# missing values pair by pair, so columns containing NaN still get valid
# coefficients.
numeric_dataset = dataset.select_dtypes(include=[np.number])  # Select only numeric columns
corrmat = numeric_dataset.corr()  # Compute the correlation matrix

k = 10  # Number of variables for the heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index  # Get the top 'k' correlations with 'SalePrice'
# Slice the matrix computed above instead of recomputing it with np.corrcoef on
# the raw values: np.corrcoef propagates NaN, so a single missing value in a
# selected column would blank out that column's whole row and column in the
# heatmap. `.to_numpy()` keeps `cm` a plain array, as before.
cm = corrmat.loc[cols, cols].to_numpy()

sns.set(font_scale=1.25)
plt.figure(figsize=(10, 8))
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values)
plt.title('Correlation Matrix Heatmap')
plt.show()

# Let's plot histograms to see the frequency distribution of each numerical variable

In [None]:
# One histogram per numeric column, laid out on a two-column grid.
palette = ['orange', 'purple', 'gray', 'red', 'blue']
num_features = len(df_num.columns)
# Cycle through the palette so that every feature gets a colour.
colors = [palette[pos % len(palette)] for pos in range(num_features)]

# Enough rows to hold every feature at two panels per row (rounded up).
rows = (num_features + 1) // 2
fig, axes = plt.subplots(nrows=rows, ncols=2, figsize=(16, rows * 5))
axes = axes.flatten()  # 1-D sequence so that panel i is simply axes[i]

for column, ax, bar_color in zip(df_num.columns, axes, colors):
    df_num[column].hist(ax=ax, bins=50, color=bar_color, alpha=0.7)
    ax.set_title(column)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

# An odd feature count leaves the last panel unused; remove it from the figure.
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Let's try some models

# 1. Random Forest

In [None]:
def split_datasett(datase, test_ratio=0.3, seed=None):
  """Randomly split `datase` into a training part and a held-out part.

  Every row is independently assigned to the held-out part with probability
  `test_ratio`, so the two parts are only approximately `1 - test_ratio` and
  `test_ratio` of the input in size.

  Args:
    datase: Object supporting `len()` and boolean-mask indexing (the notebook
      passes a pandas DataFrame).
    test_ratio: Probability that a row is assigned to the held-out part.
    seed: Optional integer seed. When given, the mask is drawn from a private
      `np.random.RandomState(seed)`, so the same seed always yields the same
      split. When `None` (the default), NumPy's global random state is used,
      which is the behaviour callers that omit the argument already rely on.

  Returns:
    A `(train, held_out)` tuple; together the two parts contain every row of
    `datase` exactly once.
  """
  rng = np.random if seed is None else np.random.RandomState(seed)
  # rand() draws from [0, 1), so a row is held out with probability test_ratio.
  test_indices = rng.rand(len(datase)) < test_ratio
  return datase[~test_indices], datase[test_indices]

# Seed NumPy's global generator right before splitting: `split_datasett` draws
# its random mask from it, so without a fixed seed every "Restart & Run All"
# trains and validates on different rows and the reported metrics change.
np.random.seed(42)
train_ds_pd, valid_ds_pd = split_datasett(dataset)
# The two parts must partition the data: no row lost, none duplicated.
assert len(train_ds_pd) + len(valid_ds_pd) == len(dataset)
print("{} examples in training, {} examples in testing.".format(
    len(train_ds_pd), len(valid_ds_pd)))

In [None]:
# Convert both pandas splits into TensorFlow datasets for a regression task,
# with 'SalePrice' as the target column.
label = 'SalePrice'
regression_task = tfdf.keras.Task.REGRESSION
train_ds, valid_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(split_frame, label=label, task=regression_task)
    for split_frame in (train_ds_pd, valid_ds_pd)
)

In [None]:
# Show what `tfdf.keras.get_all_models()` reports as available.
available_models = tfdf.keras.get_all_models()
available_models

In [None]:
# Random forest configured for regression; "mse" is registered as the metric
# to report when the model is evaluated.
forest_task = tfdf.keras.Task.REGRESSION
rf = tfdf.keras.RandomForestModel(task=forest_task)
rf.compile(metrics=["mse"])

In [None]:
# Train the forest on the training split; the value returned by `fit` is the cell output.
training_result = rf.fit(x=train_ds)
training_result

In [None]:
# Plot the RMSE recorded in the training logs against the number of trees.
# `plt` is already provided by the notebook's import cell, so it is not
# imported again here.
logs = rf.make_inspector().training_logs()
tree_counts = [log.num_trees for log in logs]
rmse_values = [log.evaluation.rmse for log in logs]
plt.plot(tree_counts, rmse_values)
plt.xlabel("Number of trees")
plt.ylabel("RMSE (out-of-bag)")
plt.show()

# Let's see how much a feature contributes to the model predictions

In [None]:
inspector = rf.make_inspector()
# Show the evaluation stored in the trained model. Only the last expression of
# a cell is displayed automatically, so this mid-cell value has to be printed
# or it is silently discarded.
print(inspector.evaluation())

# Score the model on the held-out validation split and list every metric.
evaluation = rf.evaluate(x=valid_ds, return_dict=True)
for name, value in evaluation.items():
  print(f"{name}: {value:.4f}")

In [None]:
# List the names of the importance measures exposed by the model inspector.
print("Available variable importances:")
importance_names = inspector.variable_importances().keys()
for importance in importance_names:
  print("\t", importance)

In [None]:
# Horizontal bar chart of one variable-importance measure of the trained forest.
# NOTE(review): going by its name, NUM_AS_ROOT counts how often a feature is
# used as the root split of a tree — confirm against the TF-DF documentation.
variable_importance_metric = "NUM_AS_ROOT"
variable_importances = inspector.variable_importances()[variable_importance_metric]

# `variable_importances` is a list of (feature, importance) tuples.
feature_names = [vi[0].name for vi in variable_importances]
feature_importances = [vi[1] for vi in variable_importances]
# The features are ordered in decreasing importance value.
feature_ranks = range(len(feature_names))

plt.figure(figsize=(12, 4))
bar = plt.barh(feature_ranks, feature_importances)
plt.yticks(feature_ranks, feature_names)
plt.gca().invert_yaxis()  # rank 0 at the top of the chart

# TODO: Replace with "plt.bar_label()" when available.
# Label each bar with values
for importance, patch in zip(feature_importances, bar.patches):
  plt.text(patch.get_x() + patch.get_width(), patch.get_y(), f"{importance:.4f}", va="top")

plt.xlabel(variable_importance_metric)
# This is a regression model, so there is no "class 1 vs the others"; name the
# plotted measure instead.
plt.title(f"Variable importance: {variable_importance_metric}")
plt.tight_layout()
plt.show()

# Random Forest predictions

In [None]:
# Score the Kaggle test split with the trained forest.
test_file_path = "../input/house-prices-advanced-regression-techniques/test.csv"
test_data = pd.read_csv(test_file_path)
# Set the identifiers aside; `pop` also removes the column from the features.
ids = test_data.pop('Id')

test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_data, task=tfdf.keras.Task.REGRESSION)
preds = rf.predict(test_ds)

# Pair each identifier with its predicted price.
output = pd.DataFrame({'Id': ids}).assign(SalePrice=preds.squeeze())
output.head()

In [None]:
# Build the submission from the predictions computed in the previous cell
# (`preds`, in the same row order as `ids`) instead of running inference over
# the whole test set a second time.
sample_submission_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')
# Predictions are matched to the template purely by row position, so verify
# that the template lists the same Ids in the same order as the test file.
assert np.array_equal(sample_submission_df['Id'].to_numpy(), ids.to_numpy()), \
    "sample_submission.csv and test.csv do not list the same Ids in the same order"
# `squeeze()` drops any singleton dimension so one value is assigned per row.
sample_submission_df['SalePrice'] = preds.squeeze()
sample_submission_df.to_csv('/kaggle/working/submission.csv', index=False)
sample_submission_df.head()