# Day 8: Supervised Learning

This notebook covers Linear and Logistic Regression with practical examples.

## 1. Setting up the Environment

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error, r2_score, 
    accuracy_score, confusion_matrix, 
    classification_report, roc_curve, auc
)

# Seed NumPy's global RNG once, up front, so every np.random.* draw made by
# the data-generation cells is repeatable on a fresh Restart & Run All.
SEED = 42
np.random.seed(SEED)

## 2. Linear Regression: House Price Prediction

In [None]:
# Synthetic housing dataset: four features plus a noisy, linear price.
n_samples = 1000

# Feature draws. The order of these calls is kept fixed because they all
# consume the same global NumPy RNG stream.
size = np.random.normal(loc=2000, scale=500, size=n_samples)           # sq ft
bedrooms = np.random.randint(low=1, high=6, size=n_samples)            # 1..5
age = np.random.uniform(low=0, high=50, size=n_samples)                # years
distance_to_city = np.random.uniform(low=1, high=30, size=n_samples)
noise = np.random.normal(loc=0, scale=50000, size=n_samples)           # random noise

# Individual contributions to the price, named so the formula reads as a sum.
base_price = 200000
size_value = 150 * size                        # price per sq ft
bedroom_value = 25000 * bedrooms               # price per bedroom
age_discount = 2000 * age                      # depreciation with age
distance_discount = 5000 * distance_to_city    # location factor

# Target (house price): premiums added, discounts subtracted, then noise.
price = (
    base_price + size_value + bedroom_value
    - age_discount - distance_discount
    + noise
)

# Assemble everything into one frame; column order follows the keyword order.
house_data = pd.DataFrame(
    dict(
        size=size,
        bedrooms=bedrooms,
        age=age,
        distance_to_city=distance_to_city,
        price=price,
    )
)

# Predictors are every column except the target.
X = house_data.drop(columns='price')
y = house_data['price']

# Hold out 20% of the rows for evaluation (fixed random_state => same split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both partitions so the test set does not leak into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit ordinary least squares on the standardised features and predict.
lr_model = LinearRegression().fit(X_train_scaled, y_train)
y_pred = lr_model.predict(X_test_scaled)

# Hold-out metrics.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(
    "Linear Regression Results:",
    f"Mean Squared Error: {mse:.2f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)

# Coefficients are per standard deviation of each feature (inputs were
# standardised), listed from most positive to most negative.
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Coefficient': lr_model.coef_}
)
ranked_coefficients = feature_importance.sort_values(by='Coefficient', ascending=False)
print("\nFeature Importance:")
print(ranked_coefficients)

# Predicted-vs-actual scatter for the hold-out set. The dashed red diagonal is
# the y = x line, i.e. where a perfect prediction would fall.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)

price_range = [y_test.min(), y_test.max()]
ax.plot(price_range, price_range, 'r--', lw=2)

ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Actual vs Predicted House Prices')
plt.show()

## 3. Logistic Regression: Student Pass/Fail Prediction

In [None]:
# Generate student performance dataset
n_samples = 1000

# Features
study_hours = np.random.normal(7, 2, n_samples)
sleep_hours = np.random.normal(7, 1.5, n_samples)
attendance = np.random.uniform(0.6, 1.0, n_samples)

# Log-odds of passing.
# Each feature is centred on its population mean (7 h of study, 7 h of sleep,
# 0.8 attendance = midpoint of U(0.6, 1.0)) so an average student has z = 0,
# i.e. a 50% chance of passing. Without the centring, attendance alone
# contributes at least 0.5 * 0.6 * 10 = 3 and a typical student has z ~ 7.5,
# so sigmoid(z) > 0.5 on every row and the target has only one class -- which
# LogisticRegression cannot be fitted on and for which a ROC curve is undefined.
z = (
    0.3 * (study_hours - 7)
    + 0.2 * (sleep_hours - 7)
    + 0.5 * (attendance - 0.8) * 10
)
prob_pass = 1 / (1 + np.exp(-z))

# Sample the outcome from its probability (one Bernoulli trial per student)
# rather than thresholding at 0.5: this is the data-generating process that
# logistic regression models, and it keeps the classes from being perfectly
# separable.
passed = np.random.binomial(1, prob_pass)

# Create DataFrame
student_data = pd.DataFrame({
    'study_hours': study_hours,
    'sleep_hours': sleep_hours,
    'attendance': attendance,
    'passed': passed
})

# Guard: a binary classifier needs both outcomes present in the target.
assert student_data['passed'].nunique() == 2, "target must contain both classes"

# Predictors are every column except the binary outcome.
X = student_data.drop(columns='passed')
y = student_data['passed']

# Hold out 20% of the rows for evaluation (fixed random_state => same split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then transform both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the classifier on the standardised features.
log_model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Hard class labels, plus the second predict_proba column (the model's
# probability for the second entry of log_model.classes_).
y_pred = log_model.predict(X_test_scaled)
y_pred_prob = log_model.predict_proba(X_test_scaled)[:, 1]

# Per-class precision / recall / F1 on the hold-out set.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Confusion matrix as an annotated heatmap (rows: actual, columns: predicted).
cm = confusion_matrix(y_test, y_pred)
fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_title('Confusion Matrix')
ax_cm.set_xlabel('Predicted')
ax_cm.set_ylabel('Actual')
plt.show()

# ROC curve built from the predicted probabilities; AUC summarises it.
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
ax_roc.plot(
    fpr, tpr,
    color='darkorange', lw=2,
    label=f'ROC curve (AUC = {roc_auc:.2f})',
)
# Dashed diagonal: the curve a random classifier would trace.
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('Receiver Operating Characteristic (ROC) Curve')
ax_roc.legend(loc='lower right')
plt.show()

## 4. Model Interpretation and Feature Importance

In [None]:
# Each model must be labelled with the features it was trained on. By this
# point `X` holds the student features (it was reassigned when the logistic
# model was built), so pairing `X.columns` with `lr_model.coef_` would
# mislabel the house coefficients and, in the bar chart, fail on the length
# mismatch. Recover each feature list from its own DataFrame instead.
house_features = [col for col in house_data.columns if col != 'price']
student_features = [col for col in student_data.columns if col != 'passed']

# Fail loudly if the names and the fitted coefficients ever drift apart.
assert len(house_features) == len(lr_model.coef_), \
    "house feature names do not match the linear model's coefficients"
assert len(student_features) == log_model.coef_.shape[1], \
    "student feature names do not match the logistic model's coefficients"

# Linear Regression Coefficients
# Inputs were standardised before fitting, so each coefficient is the price
# change associated with a one-standard-deviation increase in that feature.
print("Linear Regression Model Interpretation:")
for feature, coef in zip(house_features, lr_model.coef_):
    print(f"{feature}: A one standard deviation increase leads to a ${coef:,.2f} change in price")

# Logistic Regression Coefficients
# exp(coefficient) is the multiplicative change in the odds of the positive
# class per one-standard-deviation increase in the feature.
print("\nLogistic Regression Model Interpretation:")
log_coef = pd.DataFrame({
    'Feature': student_features,
    'Coefficient': log_model.coef_[0],
    'Odds Ratio': np.exp(log_model.coef_[0])
})
print(log_coef)

# Visualize feature importance: coefficient magnitudes, one panel per model
fig, (ax_linear, ax_logistic) = plt.subplots(1, 2, figsize=(10, 6))

ax_linear.bar(house_features, np.abs(lr_model.coef_))
ax_linear.set_title('Linear Regression\nFeature Importance')
ax_linear.tick_params(axis='x', labelrotation=45)

ax_logistic.bar(student_features, np.abs(log_model.coef_[0]))
ax_logistic.set_title('Logistic Regression\nFeature Importance')
ax_logistic.tick_params(axis='x', labelrotation=45)

fig.tight_layout()
plt.show()