In [3]:
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install xgboost


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import xgboost as xgb

# Load the dataset
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv"
data = pd.read_csv(url)

# Question 1
data = data[data['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]
decision_tree = DecisionTreeRegressor(max_depth=1, random_state=1)
decision_tree.fit(data[['median_income']], np.log(data['median_house_value']))
# The feature used for splitting the data is 'median_income'
print("Question 1 Answer:", 'median_income')

# Question 2
# Encode the 'ocean_proximity' variable
label_encoder = LabelEncoder()
data['ocean_proximity'] = label_encoder.fit_transform(data['ocean_proximity'])

# Prepare the data
data['median_house_value'] = np.log(data['median_house_value'])

# Impute missing values with the mean
imputer = SimpleImputer(strategy='mean')
data = imputer.fit_transform(data)

X = pd.DataFrame(data)
y = X.pop(8)  # Assuming 'median_house_value' is in the last column (index 8)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

# Train the Random Forest model
random_forest = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
random_forest.fit(X_train, y_train)

# Calculate RMSE on validation
y_pred = random_forest.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("Question 2 Answer:", rmse)

# Question 3
best_rmse = float('inf')
best_n_estimators = 10
for n_estimators in range(10, 201, 10):
    random_forest = RandomForestRegressor(n_estimators=n_estimators, random_state=1, n_jobs=-1)
    random_forest.fit(X_train, y_train)
    y_pred = random_forest.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    if rmse < best_rmse:
        best_rmse = rmse
    else:
        break

print("Question 3 Answer:", best_n_estimators)

# Question 4
best_rmse = float('inf')
best_max_depth = None

for max_depth in [10, 15, 20, 25]:
    for n_estimators in range(10, 201, 10):
        random_forest = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators, random_state=1, n_jobs=-1)
        random_forest.fit(X_train, y_train)
        y_pred = random_forest.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        if rmse < best_rmse:
            best_rmse = rmse
            best_max_depth = max_depth

print("Question 4 Answer:", best_max_depth)

# Question 5
random_forest = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
random_forest.fit(X_train, y_train)
feature_importances = random_forest.feature_importances_

# Find the most important feature
features = X_train.columns
most_important_feature = features[np.argmax(feature_importances)]
print("Question 5 Answer:", most_important_feature)

# Question 6
# Create DMatrix for train and validation
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Define XGBoost parameters
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

# Train XGBoost model with different values of eta
etas = [0.3, 0.1]
best_rmse = float('inf')
best_eta = None

for eta in etas:
    xgb_params['eta'] = eta
    model = xgb.train(xgb_params, dtrain, num_boost_round=100)
    y_pred = model.predict(dval)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    if rmse < best_rmse:
        best_rmse = rmse
        best_eta = eta

print("Question 6 Answer:", best_eta)

Question 1 Answer: median_income
Question 2 Answer: 0.22805941896548607
Question 3 Answer: 10
Question 4 Answer: 25
Question 5 Answer: 9
Question 6 Answer: 0.3
