In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
class OptionDataGenerator:
    def __init__(self, n_samples=10000, seed=42):
        self.n_samples = n_samples
        np.random.seed(seed)
        
    def _generate_parameters(self):
        """Generate random parameters for option pricing"""
        return {
            'S0': np.random.uniform(80, 120, self.n_samples),
            'K': np.random.uniform(80, 120, self.n_samples),
            'B': np.random.uniform(100, 140, self.n_samples),
            'T': np.random.uniform(0.5, 2.0, self.n_samples),
            'r': np.random.uniform(0.01, 0.05, self.n_samples),
            'sigma': np.random.uniform(0.1, 0.4, self.n_samples)
        }
    
    def _monte_carlo_price(self, S0, K, B, T, r, sigma, n_paths=1000, n_steps=252):
        """Calculate single option price using Monte Carlo"""
        dt = T / n_steps
        drift = (r - 0.5 * sigma * sigma) * dt
        diffusion = sigma * np.sqrt(dt)
        
        Z = np.random.normal(0, 1, (n_paths, n_steps))
        paths = np.zeros((n_paths, n_steps + 1))
        paths[:, 0] = S0
        
        for t in range(1, n_steps + 1):
            paths[:, t] = paths[:, t-1] * np.exp(drift + diffusion * Z[:, t-1])
            
        max_prices = np.max(paths, axis=1)
        final_prices = paths[:, -1]
        payoffs = np.where(max_prices < B, np.maximum(final_prices - K, 0), 0)
        
        return np.mean(payoffs) * np.exp(-r * T)
    
    def generate_dataset(self):
        """Generate complete dataset with parameters and prices"""
        params = self._generate_parameters()
        prices = []
        
        print("Generating training data...")
        for i in range(self.n_samples):
            if i % 1000 == 0:
                print(f"Progress: {i}/{self.n_samples}")
                
            price = self._monte_carlo_price(
                params['S0'][i], params['K'][i], params['B'][i],
                params['T'][i], params['r'][i], params['sigma'][i]
            )
            prices.append(price)
        
        # Create DataFrame
        df = pd.DataFrame({
            'S0': params['S0'],
            'K': params['K'],
            'B': params['B'],
            'T': params['T'],
            'r': params['r'],
            'sigma': params['sigma'],
            'price': prices
        })
        
        return df
    
    def save_dataset(self, filename='option_data.csv'):
        """Generate and save dataset to CSV"""
        df = self.generate_dataset()
        df.to_csv(filename, index=False)
        print(f"Dataset saved to {filename}")

In [4]:
generator = OptionDataGenerator(n_samples=1000)  # Small sample for testing
generator.save_dataset('/Users/zhoupeng/Desktop/CF_coursework/data/options_price_data.csv')

Generating training data...
Progress: 0/1000
Dataset saved to /Users/zhoupeng/Desktop/CF_coursework/data/options_price_data.csv


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [7]:
class OptionModelTrainer:
    def __init__(self, data_path):
        self.data_path = data_path
        self.model = None
        self.scaler = StandardScaler()
        
    def load_and_prepare_data(self):
        """Load data and prepare for training"""
        df = pd.read_csv(self.data_path)
        
        # Split features and target
        X = df[['S0', 'K', 'B', 'T', 'r', 'sigma']]
        y = df['price']
        
        # Split train/test
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        
        # Scale features
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)
        
        return X_train_scaled, X_test_scaled, y_train, y_test
    
    def train_model(self, n_estimators=100):
        """Train the Random Forest model"""
        print("Loading and preparing data...")
        X_train, X_test, y_train, y_test = self.load_and_prepare_data()
        
        print("Training model...")
        self.model = RandomForestRegressor(
            n_estimators=n_estimators,
            random_state=42,
            n_jobs=-1  # Use all CPU cores
        )
        self.model.fit(X_train, y_train)
        
        # Evaluate model
        train_score = self.model.score(X_train, y_train)
        test_score = self.model.score(X_test, y_test)
        
        y_pred = self.model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        
        print(f"\nModel Performance:")
        print(f"Training R² score: {train_score:.4f}")
        print(f"Testing R² score: {test_score:.4f}")
        print(f"Mean Squared Error: {mse:.4f}")
        
        # Plot actual vs predicted
        plt.figure(figsize=(10, 6))
        plt.scatter(y_test, y_pred, alpha=0.5)
        plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
        plt.xlabel('Actual Price')
        plt.ylabel('Predicted Price')
        plt.title('Actual vs Predicted Option Prices')
        plt.tight_layout()
        plt.savefig('model_performance.png')
        plt.close()
        
        return train_score, test_score
    
    def save_model(self, model_path='/Users/zhoupeng/Desktop/CF_coursework/data/option_model.joblib', scaler_path='/Users/zhoupeng/Desktop/CF_coursework/data/scaler.joblib'):
        """Save the trained model and scaler"""
        if self.model is None:
            raise ValueError("Model has not been trained yet!")
            
        joblib.dump(self.model, model_path)
        joblib.dump(self.scaler, scaler_path)
        print(f"Model saved to {model_path}")
        print(f"Scaler saved to {scaler_path}")

In [8]:
trainer = OptionModelTrainer('/Users/zhoupeng/Desktop/CF_coursework/data/options_price_data.csv')
trainer.train_model()
trainer.save_model()

Loading and preparing data...
Training model...

Model Performance:
Training R² score: 0.9794
Testing R² score: 0.8093
Mean Squared Error: 1.7460
Model saved to /Users/zhoupeng/Desktop/CF_coursework/data/option_model.joblib
Scaler saved to /Users/zhoupeng/Desktop/CF_coursework/data/scaler.joblib
