In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
import pickle
import matplotlib.pyplot as plt

In [21]:
def load_processed_data(path='processed_data.csv', target='resale_price'):
    """Load the preprocessed dataset and split it into features and target.

    Parameters
    ----------
    path : str or path-like, default 'processed_data.csv'
        CSV file to read with ``pd.read_csv``.
    target : str, default 'resale_price'
        Name of the column to use as the regression target.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the file except ``target``.
    y : pandas.Series
        The ``target`` column.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the loaded file.
    """
    df = pd.read_csv(path)

    # Fail early with a readable message instead of pandas' generic
    # "not found in axis" error raised from inside ``drop``.
    if target not in df.columns:
        raise KeyError(
            f"Target column {target!r} not found in {path!r}; "
            f"available columns: {list(df.columns)}"
        )

    X = df.drop(columns=[target])
    y = df[target]
    return X, y

In [22]:
def scale_features(X, scaler_path='scaler.pkl'):
    """Fit a ``StandardScaler`` on ``X``, persist it, and return the scaled data.

    The scaler is fitted on exactly the data passed in. To keep evaluation
    honest, pass only the training split here and apply the returned scaler's
    ``transform`` to any held-out data.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix to fit the scaler on and to transform.
    scaler_path : str, path-like or None, default 'scaler.pkl'
        Where to pickle the fitted scaler. Pass ``None`` to skip saving.

    Returns
    -------
    X_scaled
        The result of ``scaler.fit_transform(X)``.
    scaler : StandardScaler
        The fitted scaler, for transforming further data.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Persist the fitted scaler so the same transformation can be reapplied
    # later without refitting.
    if scaler_path is not None:
        with open(scaler_path, 'wb') as f:
            pickle.dump(scaler, f)

    return X_scaled, scaler


In [23]:
def train_model(X_train, y_train, model_path='model.pkl'):
    """Train an ``XGBRegressor`` and pickle it on a best-effort basis.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training target values.
    model_path : str, path-like or None, default 'model.pkl'
        Where to pickle the fitted model. Pass ``None`` to skip saving.

    Returns
    -------
    XGBRegressor
        The fitted model. It is returned even when saving fails; a save
        failure is reported on stdout and does not abort the run.
    """
    model = XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,  # fixed seed for reproducible training
        tree_method='exact',
        enable_categorical=False,
        # ``n_jobs`` is the documented scikit-learn-API name for the thread
        # count; ``nthread`` is only a legacy alias.
        n_jobs=-1,
    )

    model.fit(X_train, y_train)

    # Save right after training so a later failure (e.g. during evaluation)
    # does not lose the fitted model. Saving is deliberately best-effort.
    # NOTE(review): pickle ties the file to the installed library versions;
    # consider the library's own save_model for long-term storage.
    if model_path is not None:
        try:
            with open(model_path, 'wb') as f:
                pickle.dump(model, f, protocol=4)
            print("Model saved successfully")
        except Exception as e:
            print(f"Error saving model: {e}")

    return model

In [24]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data and print a short report.

    Parameters
    ----------
    model
        Any object exposing ``predict(X)``.
    X_test
        Feature matrix to predict on.
    y_test
        True target values matching ``X_test``.

    Returns
    -------
    tuple
        ``(mse, rmse, mae, r2)`` in that order.
    """
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # derived from MSE so both stay consistent
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    report_lines = (
        f"Mean Squared Error: {mse:,.2f}",
        f"Root Mean Squared Error: {rmse:,.2f}",
        f"Mean Absolute Error: {mae:,.2f}",
        f"R² Score: {r2:.4f}",
    )
    for line in report_lines:
        print(line)

    return mse, rmse, mae, r2

In [27]:
if __name__ == "__main__":
    # Load the preprocessed features and target.
    print("Loading data...")
    X, y = load_processed_data()

    # Split BEFORE scaling: fitting the scaler on the full dataset would let
    # test-set statistics (mean/variance) leak into preprocessing.
    print("Splitting data...")
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training rows only (this also saves it), then
    # apply the already-fitted scaler to the held-out rows.
    print("Scaling features...")
    X_train, scaler = scale_features(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Train and save model
    print("Training model...")
    model = train_model(X_train, y_train)

    # Evaluate on data the model and the scaler have never seen.
    print("\nEvaluating model...")
    metrics = evaluate_model(model, X_test, y_test)

    print("\nProcess completed!")

Loading data...
Scaling features...
Splitting data...
Training model...
Model saved successfully

Evaluating model...
Mean Squared Error: 15,564,963.44
Root Mean Squared Error: 3,945.25
Mean Absolute Error: 2,897.43
R² Score: 0.9994

Process completed!


In [26]:
# Alternative saving method: instead of pickling the Python wrapper object,
# take the underlying booster from the fitted model and write it to
# 'model.json' through the booster's own save_model().
#
# Requires the global `model` created by the training cell. NOTE(review): this
# cell's execution count (26) is lower than the training cell's (27) even
# though it appears after it, so the saved file may come from an earlier
# `model`; re-run this cell after training (Restart & Run All) to be sure.
model_booster = model.get_booster()
model_booster.save_model('model.json')