In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np

# Read the CSV at file_path into a DataFrame
file_path = '/content/sample_data/FinancialMarket.csv'
data = pd.read_csv(file_path)

# Feature matrix and target vector as NumPy arrays.
# The double brackets around 'x' select a one-column DataFrame, so X is 2-D
# (the layout scikit-learn estimators expect); y is a 1-D array.
X = data[['x']].to_numpy()
y = data['combined_data'].to_numpy()

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Shared scoring helper so every model/split is evaluated by the same code
def _regression_metrics(y_true, y_pred):
    """Return ``(mse, rmse, r2)`` for predictions ``y_pred`` against ``y_true``.

    RMSE is derived from the already-computed MSE rather than recomputed.
    """
    mse = mean_squared_error(y_true, y_pred)
    return mse, np.sqrt(mse), r2_score(y_true, y_pred)


# 1. Simple Linear Regression Model
# Initialize and train a simple linear regression model on the raw feature
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Make predictions for training and testing sets
y_pred_linear_train = linear_model.predict(X_train)
y_pred_linear_test = linear_model.predict(X_test)

# Calculate metrics for the simple linear regression model
mse_linear_train, rmse_linear_train, r2_linear_train = _regression_metrics(
    y_train, y_pred_linear_train
)
mse_linear_test, rmse_linear_test, r2_linear_test = _regression_metrics(
    y_test, y_pred_linear_test
)

# 2. Polynomial Regression Model with Basis Functions
# Specify the degree for polynomial basis functions
poly_degree = 3

# Create polynomial features based on the specified degree.
# The transformer is fitted on the training split only and then re-used,
# unchanged, to transform the test split.
# NOTE: PolynomialFeatures adds a constant (bias) column by default, and
# LinearRegression also fits its own intercept by default.
poly = PolynomialFeatures(degree=poly_degree)
X_poly_train = poly.fit_transform(X_train)
X_poly_test = poly.transform(X_test)

# Initialize and train a linear regression model on the polynomial features
poly_model = LinearRegression()
poly_model.fit(X_poly_train, y_train)

# Make predictions for training and testing sets with the polynomial model
y_pred_poly_train = poly_model.predict(X_poly_train)
y_pred_poly_test = poly_model.predict(X_poly_test)

# Calculate metrics for the polynomial regression model
mse_poly_train, rmse_poly_train, r2_poly_train = _regression_metrics(
    y_train, y_pred_poly_train
)
mse_poly_test, rmse_poly_test, r2_poly_test = _regression_metrics(
    y_test, y_pred_poly_test
)

# Compile the results for easy comparison.
# Each list holds [linear value, polynomial value], in the same order as "Model".
# The polynomial label is built from poly_degree so the table stays correct
# if the degree is changed.
results = {
    "Model": [
        "Linear Regression",
        f"Polynomial Regression (degree {poly_degree})",
    ],
    "Train MSE": [mse_linear_train, mse_poly_train],
    "Test MSE": [mse_linear_test, mse_poly_test],
    "Train RMSE": [rmse_linear_train, rmse_poly_train],
    "Test RMSE": [rmse_linear_test, rmse_poly_test],
    "Train R^2": [r2_linear_train, r2_poly_train],
    "Test R^2": [r2_linear_test, r2_poly_test],
}

# Convert results to DataFrame for better readability (one row per model)
results_df = pd.DataFrame(results)
print(results_df)


                              Model  Train MSE  Test MSE  Train RMSE  \
0                 Linear Regression  10.011343  9.401378    3.164071   
1  Polynomial Regression (degree 3)   1.384985  1.348389    1.176854   

   Test RMSE  Train R^2  Test R^2  
0   3.066167   0.648596   0.67816  
1   1.161202   0.951386   0.95384  
