## 1. Importing Libraries

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

## 2. Loading the dataset

In [None]:
# Read melb_data.csv into a DataFrame. The location is a hard-coded absolute
# Windows path, so it has to be edited before this runs on another machine.
data = pd.read_csv(
    filepath_or_buffer="C:/Users/Win Technology/Downloads/archive (1)/melb_data.csv"
)
# Show the first five rows as a quick sanity check of the load.
data.head(n=5)

## 3. Selecting the label (Y)

In [None]:
# Target (label) the models are trained to predict: the "Price" column.
y = data.loc[:, "Price"]

## 4. Selecting features (X)

In [None]:
# Columns used as model inputs.
features = [
    "Rooms",
    "Bathroom",
    "Landsize",
    "BuildingArea",
    "YearBuilt",
]
# Feature matrix: only the columns listed above, in that order.
# NOTE(review): a later cell reports that BuildingArea and YearBuilt have many
# missing values; X keeps them as-is, so the estimators fitted on it must
# accept NaN inputs - confirm against the installed scikit-learn version.
X = data.loc[:, features]

## 5. Split Data

In [None]:
# Hold out part of the rows for evaluation. test_size=0.25 spells out the
# library default that applies when no size is given; random_state=1 fixes the
# shuffle so the same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)


## 6. Create the model

In [None]:
# Unfitted decision-tree regressor; the fixed seed keeps its results repeatable.
model = DecisionTreeRegressor(random_state=1)

## 7. Train (fit) the model

In [None]:
# Learn the tree from the training rows only; the test rows stay unseen.
model.fit(X=X_train, y=y_train)

## 8. Make predictions

In [None]:
# Predicted prices for the held-out rows.
predictions = model.predict(X=X_test)

## 9. Evaluate the model (mean absolute error)

In [None]:
# Mean absolute error on the held-out rows: the average absolute gap between
# the true and the predicted price (an error measure, so lower is better).
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print("Mean Absolute Error:", mae)

## 11. Intermediate Machine Learning 
## 11.1 Checking missing values

In [None]:
# Count the missing entries in every column of the full dataset.
data.isna().sum()

In [None]:
# BuildingArea and YearBuilt have too many missing values to be worth
# imputing, so both columns are removed from the working DataFrame.
# NOTE(review): X, X_train and X_test were built before this point and still
# contain these two columns.
data = data.drop(columns=["BuildingArea", "YearBuilt"])

## 11.2 Imputing the remaining missing values

In [None]:
from sklearn.impute import SimpleImputer

# NOTE(review): both imputers are fitted on the whole DataFrame rather than on
# the training rows only, and X_train / X_test (created earlier) are not
# rebuilt afterwards - confirm that this is intended.

# Numeric column: fill missing "Car" entries with the column median.
num_imputer = SimpleImputer(strategy="median")
data["Car"] = num_imputer.fit_transform(data[["Car"]]).reshape(-1)

# Categorical column: fill missing "CouncilArea" entries with the most
# frequent value. The imputer returns a 2-D array, hence the flattening.
cat_imputer = SimpleImputer(strategy="most_frequent")
data["CouncilArea"] = cat_imputer.fit_transform(data[["CouncilArea"]]).reshape(-1)

## 11.3 Bringing in a Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Train a random forest on the same split as the decision tree so that the two
# error values are directly comparable.
# NOTE(review): X_train / X_test come from the split made before the column
# drop and the imputation, so those cleaning steps do not affect this model.
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(X=X_train, y=y_train)

# Score the forest on the held-out rows.
rf_preds = rf_model.predict(X=X_test)
rf_mae = mean_absolute_error(y_true=y_test, y_pred=rf_preds)
print("Random Forest Mean Absolute Error:", rf_mae)

## Comparing the Decision Tree MAE with the Random Forest MAE

In [None]:
# Print the two error values one after the other; the lower one is the better fit.
# NOTE(review): the final cell of this notebook rebinds `mae` to the score of
# the size-limited tree, so re-running this cell afterwards prints that value
# under the "Decision Tree MAE" label.
print("Decision Tree MAE:", mae)
print("Random Forest MAE:", rf_mae)

## Visualization

In [None]:
import matplotlib.pyplot as plt

# Predictions of whichever estimator `model` currently refers to.
preds = model.predict(X=X_test)

# One point per held-out row: true price on the x-axis, prediction on the
# y-axis. Points near the diagonal are accurate predictions.
plt.scatter(x=y_test, y=preds)
plt.title("Actual vs Predicted")
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.show()


## Optional: Reducing overfitting

In [None]:
# Refit the decision tree with a cap of 100 leaves so it cannot grow deep
# enough to memorise individual training rows.
# NOTE(review): this rebinds `model`, `preds` and `mae`, replacing the
# unrestricted tree and its score from the earlier cells.
model = DecisionTreeRegressor(random_state=1, max_leaf_nodes=100)
model.fit(X=X_train, y=y_train)

# Score the size-limited tree on the held-out rows and show the result.
preds = model.predict(X=X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=preds)
mae