In [None]:
# Import required libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# ---------------------------
# 1. Generate Dataset
# ---------------------------

# Pin NumPy's global RNG so every run produces the same triangles.
np.random.seed(42)

# How many right triangles to synthesise.
n_samples = 1000

# Draw both legs uniformly between 1 and 100. The generator is consumed
# left to right, so leg1 is sampled before leg2.
leg1, leg2 = (np.random.uniform(1, 100, n_samples) for _ in range(2))

# Pythagoras' theorem, evaluated element-wise: c = sqrt(a^2 + b^2).
hypotenuse = np.sqrt(leg1**2 + leg2**2)

# Build the table: start from the two legs, then attach the target column.
data = pd.DataFrame({'leg1': leg1, 'leg2': leg2})
data['hypotenuse'] = hypotenuse

# Resolve the output location under the current working directory.
data_folder = os.path.join(os.getcwd(), 'data')
dataset_path = os.path.join(data_folder, 'right_triangle_dataset.csv')

# Create the folder if it is missing, then write the CSV without the
# index column.
os.makedirs(data_folder, exist_ok=True)
data.to_csv(dataset_path, index=False)
print(f"Dataset saved to {dataset_path}")

# ---------------------------
# 2. Train the Regression Model
# ---------------------------

# Read the CSV back from dataset_path so that training consumes the
# persisted file rather than the in-memory frame.
df = pd.read_csv(dataset_path)

# Inputs are the two leg columns; the target is the hypotenuse column.
X = df.loc[:, ['leg1', 'leg2']].to_numpy()
y = df.loc[:, 'hypotenuse'].to_numpy()

# Hold out 20% of the rows for testing. The fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Instantiate and fit the linear model in one expression; fit() returns
# the estimator itself.
model = LinearRegression().fit(X_train, y_train)
print("Model training complete.")

# ---------------------------
# 3. Model Predictions and Evaluation
# ---------------------------

# Run the fitted model on the held-out feature rows.
y_pred = model.predict(X_test)

# Score the predictions against the true test targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics, one per line.
print(
    f"Mean Squared Error: {mse:.2f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)

# ---------------------------
# 4. Plotting Results
# ---------------------------

# Plot: Actual vs. Predicted Hypotenuse
_, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.6, edgecolors='k')
# Dashed red y = x reference line spanning the range of the actual values;
# points on this line are perfect predictions.
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r--', lw=2)
ax.set_xlabel("Actual Hypotenuse")
ax.set_ylabel("Predicted Hypotenuse")
ax.set_title("Actual vs. Predicted Hypotenuse")
ax.grid(True)
plt.show()

# Plot: Residuals Histogram
# Residual = actual minus predicted, one value per test row.
residuals = y_test - y_pred
_, ax = plt.subplots(figsize=(8, 6))
ax.hist(residuals, bins=30, edgecolor='black', alpha=0.7)
ax.set_xlabel("Residual")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of Residuals")
ax.grid(True)
plt.show()
