In [None]:
import pandas as pd

# Load the raw housing CSV from the project-relative data folder.
df = pd.read_csv('data/king_country_houses.csv')

# First look at the raw frame: leading rows, schema, summary statistics and
# the number of missing values per column.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
print(df.describe())
print(df.isnull().sum())


In [None]:
import pandas as pd

# Load, clean and type the data in a single chain, so re-running this cell
# always starts again from the raw CSV:
#   * the path uses forward slashes, which resolve on every platform (a
#     backslash path only works on Windows),
#   * rows without a 'price' are dropped,
#   * exact duplicate rows are dropped,
#   * 'date' is parsed into datetime64 using the '%Y%m%dT%H%M%S' layout.
# .assign() is used instead of item assignment so the column is replaced on a
# fresh frame rather than on the result of a row filter.
df = (
    pd.read_csv('data/king_country_houses.csv')
    .dropna(subset=['price'])
    .drop_duplicates()
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y%m%dT%H%M%S'))
)

# Check data. info() prints its own report and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
df.info()


In [None]:
# Calculate correlation between numerical features.
# 'date' is datetime64 at this point, and how DataFrame.corr() treats
# non-numeric columns depends on the installed pandas version. Selecting the
# numeric columns explicitly keeps the result the same everywhere.
correlation_matrix = df.select_dtypes(include='number').corr()

# Rank every numeric feature by its correlation with the target.
print(correlation_matrix['price'].sort_values(ascending=False))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Filter houses by price (650k and above):
df_filtered = df[df['price'] >= 650000]

# Correlation matrix over the numeric columns only. The frame also carries a
# datetime64 'date' column, and DataFrame.corr() handles non-numeric columns
# differently across pandas versions, so the selection is made explicit.
correlation_matrix = df_filtered.select_dtypes(include='number').corr()

# Keep just the 'price' column of the matrix, strongest correlation first,
# so the heatmap is a single ranked strip.
price_correlations = correlation_matrix[['price']].sort_values(by='price', ascending=False)

# Visualize correlation with price as an annotated heatmap:
plt.figure(figsize=(12, 8))
sns.heatmap(price_correlations, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation of Features with Price')
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats


# 1. Boxplot & Histogram combined in a grid:
def visualize_outliers(df):
    """Plot a boxplot and a histogram for every float64/int64 column of ``df``.

    The figure has one row per selected column: the boxplot on the left and a
    30-bin histogram on the right. The figure is displayed with ``plt.show()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose float64/int64 columns are plotted.

    Returns
    -------
    None
        When ``df`` has no float64/int64 column the function returns without
        creating a figure.
    """
    num_features = df.select_dtypes(include=['float64', 'int64']).columns
    num_plots = len(num_features)

    # Nothing to draw; plt.subplots() rejects a grid with zero rows.
    if num_plots == 0:
        return

    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # row, so the per-row unpacking below works for any number of columns.
    fig, axes = plt.subplots(num_plots, 2, figsize=(15, 5 * num_plots), squeeze=False)

    for (box_ax, hist_ax), feature in zip(axes, num_features):
        sns.boxplot(x=df[feature], ax=box_ax)
        box_ax.set_title(f'Boxplot for {feature}')

        df[feature].hist(bins=30, edgecolor='black', ax=hist_ax)
        hist_ax.set_title(f'Histogram for {feature}')

    plt.tight_layout()
    plt.show()

# 2. Z-score Method:
def z_score_outliers(df):
    """Find values more than 3 standard deviations from the column mean.

    Each float64/int64 column is processed on its own; missing values are
    dropped before the z-scores are computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan.

    Returns
    -------
    dict
        Maps each selected column name to a NumPy array holding the index
        labels of ``df`` whose value has ``|z| > 3``. Labels (not positions in
        the NaN-dropped column) are returned so the result can be used with
        ``df.loc`` even when the index has gaps or the column contains NaN.
        For a default RangeIndex and a column without NaN the two coincide.
    """
    num_features = df.select_dtypes(include=['float64', 'int64']).columns
    outliers = {}

    for feature in num_features:
        values = df[feature].dropna()

        # An all-NaN (or empty) column has no z-scores and no outliers.
        if values.empty:
            outliers[feature] = values.index.to_numpy()
            continue

        z_scores = np.asarray(stats.zscore(values.to_numpy()))
        # Map the boolean mask back onto the surviving index labels.
        outliers[feature] = values.index[np.abs(z_scores) > 3].to_numpy()

    return outliers

# 3. IQR Method:
def iqr_outliers(df):
    """Flag values lying outside the 1.5 * IQR fences, column by column.

    Only float64/int64 columns are examined. For each one the fences are
    ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``; a value is flagged when it is
    strictly below the lower fence or strictly above the upper fence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan.

    Returns
    -------
    dict
        Maps each examined column name to the list of index labels of the
        flagged rows.
    """
    flagged = {}

    for name in df.select_dtypes(include=['float64', 'int64']).columns:
        column = df[name]
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        spread = q3 - q1

        too_low = column < q1 - 1.5 * spread
        too_high = column > q3 + 1.5 * spread
        flagged[name] = df[too_low | too_high].index.tolist()

    return flagged

# Apply functions:
visualize_outliers(df)  # Visualize boxplots and histograms.
z_outliers = z_score_outliers(df)  # Detect outliers using Z-score.
# The IQR result gets its own name. Assigning it to `iqr_outliers` would
# replace the function with the returned dict, and any later call to
# iqr_outliers(...) would then fail with "'dict' object is not callable".
iqr_outlier_indices = iqr_outliers(df)  # Detect outliers using IQR.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Check for non-numeric columns:
non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns

# Drop non-numeric columns (the model below only accepts numeric input):
df_cleaned = df.drop(columns=non_numeric_columns)

# Capping outliers using 5th and 95th percentiles.
# Only the FEATURES are capped. The target 'price' is left as observed:
# clipping it would also change y_test, so R², RMSE, MAE and MSE would be
# measured against modified prices instead of the real ones.
numeric_columns = df_cleaned.select_dtypes(include=[np.number]).columns
feature_columns = numeric_columns.drop("price")

# Calculate 5th and 95th percentiles for each feature column:
lower_limit = df_cleaned[feature_columns].quantile(0.05)
upper_limit = df_cleaned[feature_columns].quantile(0.95)

# Apply capping for each feature column:
for col in feature_columns:
    df_cleaned[col] = df_cleaned[col].clip(lower=lower_limit[col], upper=upper_limit[col])

# NOTE(review): the limits above are computed on the full frame, i.e. before
# the train/test split, so they still include test rows. Consider deriving
# them from the training rows only.
# NOTE(review): every remaining numeric column is used as a feature; if the
# frame carries a pure identifier column, confirm it should be dropped.

# Filter relevant columns for model:
X = df_cleaned.drop("price", axis=1)
y = df_cleaned["price"]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Random Forest with 100 trees, seeded, fitted on the training rows.
# n_jobs=-1 asks scikit-learn to use all CPU cores.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(X_train, y_train)

# Predict prices for the held-out rows.
predictions = rf_model.predict(X_test)

# Score the predictions; RMSE is derived from the MSE computed once here.
mse_value = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse_value)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# Report every metric rounded to four decimals.
for label, value in (
    ("R² = ", r2),
    ("RMSE = ", rmse),
    ("MAE = ", mae),
    ("MSE =  ", mse_value),
):
    print(label, round(value, 4))


In [None]:
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

# Wrap the existing train/test splits in DMatrix objects, the container
# consumed by xgb.train() and Booster.predict().
dtrain, dtest = (
    xgb.DMatrix(features, label=target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

# Booster settings: squared-error regression scored by RMSE, histogram tree
# builder, with training requested on a CUDA device.
params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    tree_method='hist',
    device='cuda',
)

# Fit 100 boosting rounds on the training matrix.
bst = xgb.train(params, dtrain, num_boost_round=100)

# Predict prices for the held-out rows.
predictions = bst.predict(dtest)

# Score the predictions; RMSE is derived from the MSE computed once here.
mse_value = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse_value)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# Report every metric rounded to four decimals.
for label, value in (
    ("R² = ", r2),
    ("RMSE = ", rmse),
    ("MAE = ", mae),
    ("MSE =  ", mse_value),
):
    print(label, round(value, 4))


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy.stats import uniform, randint

# Features are every column of the cleaned frame except the target 'price'.
y = df_cleaned["price"]
X = df_cleaned.drop(columns="price")

# 70/30 train-test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 1. **Hyperparameter Tuning with RandomizedSearchCV** (For XGBoost)
# Distributions that are actually sampled. scipy's uniform(loc, scale)
# covers [loc, loc + scale]; randint(low, high) excludes `high`.
sampled_params = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'min_child_weight': randint(1, 10),
    'gamma': uniform(0, 0.5),
}
# Single-choice entries: identical for every candidate.
fixed_params = {
    'objective': ['reg:squarederror'],
    'eval_metric': ['rmse'],
    'tree_method': ['hist'],
    'device': ['cuda'],  # request GPU training
}
param_dist = {**sampled_params, **fixed_params}

xgb_model = xgb.XGBRegressor(random_state=42)

# 100 random candidates, 3-fold CV each, ranked by negative MSE.
random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=100,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Best hyperparameters found
best_xgb_params = random_search.best_params_
print("Best Hyperparameters for XGBoost: ", best_xgb_params)

# 2. **Cross-Validation** (Evaluate model stability across different subsets)
# 5-fold CV of the best estimator on the training rows; the scorer returns
# negated MSE, so the sign is flipped before printing.
xgb_cv_scores = cross_val_score(
    random_search.best_estimator_,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)
print("Cross-validation scores (MSE): ", -xgb_cv_scores)

# 3. **Ensemble Method - Stacking** (Combining multiple models for better predictions)
# Base learners: a seeded 100-tree Random Forest and an XGBoost regressor
# configured with the tuned hyperparameters.
rf_base = RandomForestRegressor(n_estimators=100, random_state=42)
xgb_base = xgb.XGBRegressor(random_state=42, **best_xgb_params)
base_learners = [('rf', rf_base), ('xgb', xgb_base)]

# Meta-learner: an XGBoost regressor with default settings (seeded) that
# combines the base learners' predictions.
meta_learner = xgb.XGBRegressor(random_state=42)

stacking_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_learner,
)
stacking_model.fit(X_train, y_train)

# 4. **Early Stopping**
# Training stops once the monitored loss has not improved for 50 rounds.
# The monitored set must NOT be the test set: if the test rows pick the number
# of boosting rounds, the test metrics reported afterwards are optimistically
# biased. A validation split is therefore carved out of the training data and
# the test set is only used for the final prediction.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

xgb_dtrain = xgb.DMatrix(X_fit, label=y_fit)
xgb_dval = xgb.DMatrix(X_val, label=y_val)
xgb_dtest = xgb.DMatrix(X_test, label=y_test)

# Tuned hyperparameters from the random search. 'n_estimators' is not passed
# on: the number of rounds is decided by early stopping below.
params = {
    'max_depth': best_xgb_params['max_depth'],
    'learning_rate': best_xgb_params['learning_rate'],
    'subsample': best_xgb_params['subsample'],
    'colsample_bytree': best_xgb_params['colsample_bytree'],
    'min_child_weight': best_xgb_params['min_child_weight'],
    'gamma': best_xgb_params['gamma'],
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'hist',
    'device': 'cuda'
}

# xgb.train() uses the LAST entry of `evals` for early stopping, so the
# validation set goes last.
watchlist = [(xgb_dtrain, 'train'), (xgb_dval, 'validation')]

xgb_model_with_early_stopping = xgb.train(params, xgb_dtrain, num_boost_round=1000, early_stopping_rounds=50, evals=watchlist)

# Make predictions with early stopping model.
# xgb.train() returns the booster from the last round, not the best one, so
# the prediction is explicitly limited to the trees up to the best iteration.
best_round = xgb_model_with_early_stopping.best_iteration
predictions = xgb_model_with_early_stopping.predict(xgb_dtest, iteration_range=(0, best_round + 1))

# 5. **Evaluate the final optimized models (XGBoost with early stopping, and Stacking)**
# Both fitted models are scored on the same held-out test rows, so the
# stacking ensemble trained above is actually compared rather than discarded.

# XGBoost with early stopping (predictions computed above):
mse_value = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse_value)
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

# Evaluation metrics
print("R² = ", round(r2, 4))
print("RMSE = ", round(rmse, 4))
print("MAE = ", round(mae, 4))
print("MSE = ", round(mse_value, 4))

# Stacking ensemble:
stacking_predictions = stacking_model.predict(X_test)
stacking_mse = mean_squared_error(y_test, stacking_predictions)
stacking_rmse = np.sqrt(stacking_mse)
stacking_mae = mean_absolute_error(y_test, stacking_predictions)
stacking_r2 = r2_score(y_test, stacking_predictions)

print("Stacking R² = ", round(stacking_r2, 4))
print("Stacking RMSE = ", round(stacking_rmse, 4))
print("Stacking MAE = ", round(stacking_mae, 4))
print("Stacking MSE = ", round(stacking_mse, 4))
