In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load your dataset
df = pd.read_csv('../data/train.csv')

# Display the first few rows to check
df.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [2]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Define dataset path
file_path = "../data/train.csv"

# Ensure dataset file exists
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}. Check the file path.")

# Load dataset
df = pd.read_csv(file_path)

# Define target column
target_column = 'SalePrice'
if target_column not in df.columns:
    raise ValueError(f"Target column '{target_column}' not found in dataset.")

# Drop rows with missing target values (ensuring data integrity)
df = df.dropna(subset=[target_column])

# Split into features (X) and target (y)
X = df.drop(columns=[target_column])
y = df[target_column]

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print dataset shapes (for verification)
print(f"Training Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}")





Training Data Shape: (1168, 80)
Testing Data Shape: (292, 80)


In [3]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import warnings

warnings.filterwarnings("ignore")  # Suppress unnecessary warnings

# Load dataset
file_path = "../data/train.csv"
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}. Check the file path.")

df = pd.read_csv(file_path)

# Ensure 'SalePrice' exists
target_column = 'SalePrice'
if target_column not in df.columns:
    raise ValueError(f"Column '{target_column}' not found. Available columns: {df.columns.tolist()}")

# Handle missing values:
# Fill categorical NaNs with mode and numerical NaNs with median
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna(df[col].mode()[0])

for col in df.select_dtypes(include=['number']).columns:
    df[col] = df[col].fillna(df[col].median())

# Separate features and target
X = df.drop(columns=[target_column])
y = df[target_column]

# Encode categorical features
label_encoders = {}
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le  # Store encoders for later use if needed

# Ensure data is valid before splitting
if X.empty or y.empty:
    raise ValueError("Dataset is empty after preprocessing. Check data!")

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBRegressor(n_estimators=100, random_state=42),
    "LightGBM": lgb.LGBMRegressor(n_estimators=100, random_state=42)
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {"MAE": mae, "R²": r2}
    print(f"{name} -> MAE: {mae:.4f}, R²: {r2:.4f}")




Linear Regression -> MAE: 21598.3969, R²: 0.8371
Random Forest -> MAE: 17851.9400, R²: 0.8903
XGBoost -> MAE: 17282.3421, R²: 0.9031
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000954 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3364
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 73
[LightGBM] [Info] Start training from score 181441.541952
LightGBM -> MAE: 16525.5343, R²: 0.8913


In [None]:
import matplotlib.pyplot as plt

# Retrieve models from dictionary
random_forest = models["Random Forest"]
xgboost = models["XGBoost"]

# Plot feature importance for Random Forest
plt.figure(figsize=(10, 6))
plt.barh(X.columns, random_forest.feature_importances_)
plt.title("Random Forest Feature Importances")
plt.xlabel("Importance")
plt.ylabel("Features")
plt.show()

# Feature importance for XGBoost
plt.figure(figsize=(10, 6))
plt.barh(X.columns, xgboost.feature_importances_)
plt.title("XGBoost Feature Importances")
plt.xlabel("Importance")
plt.ylabel("Features")
plt.show()



NameError: name 'random_forest' is not defined

<Figure size 1000x600 with 0 Axes>

In [None]:
from sklearn.model_selection import GridSearchCV

# Set up the hyperparameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# Grid search for Random Forest
grid_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_rf, cv=3, scoring='neg_mean_absolute_error')
grid_rf.fit(X_train, y_train)

# Best parameters and score for Random Forest
print(f"Best parameters for Random Forest: {grid_rf.best_params_}")
print(f"Best MAE for Random Forest: {-grid_rf.best_score_}")
