## 1. Load and Explore Data

Load the dataset and show:
- Size and structure
- Data descriptions
- Boxplot distributions
- Correlation between columns

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Set random seed for reproducibility.
# This seeds NumPy's global random state only; the scikit-learn calls in this
# notebook that accept a random_state pass random_state=42 explicitly.
np.random.seed(42)

In [None]:
# Read the exam dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv("ml_python_labexam_2023_02_03.csv")

# Report the dimensions of the loaded table
n_rows, n_cols = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")

# Preview the first records (the last expression is rendered as the cell output)
print("\nFirst few rows:")
df.head()

In [None]:
# Per-column summary statistics (the last expression is rendered as the cell output)
print("Statistical summary:")
summary_stats = df.describe()
summary_stats

In [None]:
# Count the missing entries in each column once, then total them across the table
missing_per_column = df.isna().sum()
print("Missing values per column:")
print(missing_per_column)
print("\nTotal missing values:", missing_per_column.sum())

In [None]:
# Draw the boxplots of the DataFrame columns side by side on one wide axes
fig, ax = plt.subplots(figsize=(14, 6))
df.boxplot(ax=ax)
ax.set_title("Boxplot of All Features")
plt.xticks(rotation=45)  # tilt the column names so they do not overlap
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlation matrix over the numeric columns.
# numeric_only=True keeps this cell working if the CSV contains a non-numeric
# column: since pandas 2.0, DataFrame.corr() no longer drops such columns
# silently and raises instead. With all-numeric data the result is unchanged.
correlation_matrix = df.corr(numeric_only=True)

# Display the matrix as an annotated heatmap; center=0 puts the midpoint of
# the diverging colormap at zero correlation.
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", center=0)
plt.title("Correlation Matrix")
plt.tight_layout()
plt.show()

In [None]:
# Rank every column by its correlation with the target 'y', largest value first
target_correlation = correlation_matrix['y'].sort_values(ascending=False)

print("Correlation with target variable (y):")
print(target_correlation)

## 2. Comment on Exploration and Identify Low Correlation Features

Identify features with absolute correlation < 0.15 with the target

In [None]:
# Collect the features whose |correlation with 'y'| is below the threshold.
# The target itself is skipped so it can never end up in the list.
threshold = 0.15
low_corr_features = [
    name
    for name, corr in target_correlation.items()
    if name != 'y' and abs(corr) < threshold
]

print(f"Features with absolute correlation < {threshold} with target 'y':")
print(low_corr_features)
print(f"\nNumber of low correlation features: {len(low_corr_features)}")

# List the (signed) correlation of each selected feature
print("\nCorrelation values for these features:")
for feature in low_corr_features:
    feature_corr = target_correlation.loc[feature]
    print(f"{feature}: {feature_corr:.4f}")

### Comments on Exploration:

**Dataset Structure:**
- The dataset contains regression data with multiple numeric features and a target variable 'y'
- All features appear to be continuous numeric values

**Missing Values:**
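- The missing-value check above reports the number of missing values per column; any column with a non-zero count needs imputation or row removal before model fitting, because `LinearRegression` cannot be fit on data containing NaN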

**Feature Distributions:**
- Boxplots reveal the distribution and potential outliers in each feature
- Features may have different scales and ranges

**Correlation Analysis:**
- Some features show strong correlation with the target variable
- Features with absolute correlation < 0.15 are considered weakly correlated
- These weak features may not contribute significantly to prediction
- Removing them could simplify the model without losing much predictive power

## 3. Train/Test Linear Regression (Full Dataset)

Train and test a multivariate linear regressor on all features and show RMSE

In [None]:
# Separate the target column 'y' from the predictors
y = df['y']
X_full = df.drop(columns=['y'])

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_full,
    y,
    test_size=0.2,
    random_state=42,
)

print("Full dataset split:")
for split_label, split_part in (("Training", X_train_full), ("Test", X_test_full)):
    print(f"{split_label} set size: {split_part.shape}")

In [None]:
# Fit an ordinary linear regression on every feature (fit() returns the estimator itself)
lr_full = LinearRegression().fit(X_train_full, y_train)

# Predict the held-out rows and score them with RMSE (square root of the MSE)
y_pred_full = lr_full.predict(X_test_full)
mse_full = mean_squared_error(y_test, y_pred_full)
rmse_full = np.sqrt(mse_full)

print("Linear Regression on Full Dataset:")
print(f"Root Mean Squared Error (RMSE): {rmse_full:.4f}")

## 4. Train/Test Linear Regression (Reduced Dataset)

Train and test on the reduced dataset (dropping features whose absolute correlation with the target is below 0.15) and show the RMSE

In [None]:
# Build the reduced feature table: remove the target plus every weakly correlated feature.
# NOTE(review): the correlations behind low_corr_features were computed on the whole
# DataFrame, before any train/test split, so this selection has seen the test rows.
columns_to_drop = ['y', *low_corr_features]
X_reduced = df.drop(columns=columns_to_drop)

print(f"Reduced dataset features: {X_reduced.columns.to_list()}")
print(f"Number of features: {X_reduced.shape[1]}")
print(f"Features dropped: {low_corr_features}")

In [None]:
# Split the reduced features with the same test_size and random_state as the full split
X_train_reduced, X_test_reduced, y_train_r, y_test_r = train_test_split(
    X_reduced,
    y,
    test_size=0.2,
    random_state=42,
)

print("Reduced dataset split:")
for split_label, split_part in (("Training", X_train_reduced), ("Test", X_test_reduced)):
    print(f"{split_label} set size: {split_part.shape}")

In [None]:
# Fit a linear regression on the reduced feature set (fit() returns the estimator itself)
lr_reduced = LinearRegression().fit(X_train_reduced, y_train_r)

# Predict the held-out rows and score them with RMSE (square root of the MSE)
y_pred_reduced = lr_reduced.predict(X_test_reduced)
mse_reduced = mean_squared_error(y_test_r, y_pred_reduced)
rmse_reduced = np.sqrt(mse_reduced)

print("Linear Regression on Reduced Dataset:")
print(f"Root Mean Squared Error (RMSE): {rmse_reduced:.4f}")

## 5. Train/Test Decision Tree Regressor (Reduced Dataset)

Train and test Decision Tree Regressor on reduced dataset and show RMSE

In [None]:
# Fit a decision tree with default hyperparameters on the reduced feature set;
# random_state is fixed so the fitted tree is repeatable
dt_regressor = DecisionTreeRegressor(random_state=42).fit(X_train_reduced, y_train_r)

# Predict the held-out rows and score them with RMSE (square root of the MSE)
y_pred_dt = dt_regressor.predict(X_test_reduced)
mse_dt = mean_squared_error(y_test_r, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)

print("Decision Tree Regressor on Reduced Dataset:")
print(f"Root Mean Squared Error (RMSE): {rmse_dt:.4f}")

## 6. Optimize Decision Tree Depth with Cross Validation

Search for optimal max_depth that minimizes RMSE using cross-validation

In [None]:
# Candidate depths: 1 through 20, plus None (no depth limit)
param_grid = {'max_depth': [*range(1, 21), None]}

# 5-fold grid search over the candidate depths.
# GridSearchCV keeps the candidate with the HIGHEST score, so the error is
# supplied as negative MSE: the largest score corresponds to the smallest MSE.
depth_search_tree = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=depth_search_tree,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training portion of the reduced dataset
print("Optimizing Decision Tree depth with Cross Validation...")
grid_search.fit(X_train_reduced, y_train_r)

# Winning estimator and the depth it was built with
best_dt = grid_search.best_estimator_
best_depth = grid_search.best_params_['max_depth']

# Flip the sign back to obtain the (positive) mean cross-validated MSE
best_cv_mse = -grid_search.best_score_

print(f"\nBest max_depth: {best_depth}")
print(f"Best cross-validation MSE: {best_cv_mse:.4f}")
print(f"Best cross-validation RMSE: {np.sqrt(best_cv_mse):.4f}")

In [None]:
# Score the depth-tuned tree on the held-out rows of the reduced dataset
y_pred_best_dt = best_dt.predict(X_test_reduced)
mse_best_dt = mean_squared_error(y_test_r, y_pred_best_dt)
rmse_best_dt = np.sqrt(mse_best_dt)

print("Optimized Decision Tree Regressor on Test Set:")
print(f"Root Mean Squared Error (RMSE): {rmse_best_dt:.4f}")

In [None]:
# Pull the per-candidate results out of the grid search
cv_results = grid_search.cv_results_
depths = [candidate['max_depth'] for candidate in cv_results['params']]
# Scores are negative MSE, so negate before taking the square root
mean_scores = np.sqrt(-cv_results['mean_test_score'])

# Candidates are plotted by position because the depth list also contains None
tick_positions = range(len(depths))
best_position = depths.index(best_depth)

plt.figure(figsize=(12, 6))
plt.plot(tick_positions, mean_scores, marker='o')
# Mark the depth selected by the grid search
plt.axvline(x=best_position, color='r', linestyle='--', label=f'Best depth: {best_depth}')
plt.xticks(tick_positions, list(map(str, depths)), rotation=45)
plt.title('Decision Tree: RMSE vs max_depth')
plt.xlabel('max_depth')
plt.ylabel('Cross-Validation RMSE')
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.show()

## 7. Comment on Results

Compare all models and discuss findings

In [None]:
# Assemble one row per model: its name, test RMSE and number of input features
model_names = [
    'Linear Regression (Full)',
    'Linear Regression (Reduced)',
    'Decision Tree (Reduced)',
    'Decision Tree Optimized (Reduced)',
]
rmse_values = [rmse_full, rmse_reduced, rmse_dt, rmse_best_dt]
# Only the first model uses the full feature set; the other three use the reduced one
feature_counts = [X_full.shape[1]] + [X_reduced.shape[1]] * 3

results = pd.DataFrame({
    'Model': model_names,
    'RMSE': rmse_values,
    'Features': feature_counts,
})

print("Model Comparison:")
print("=" * 70)
print(results.to_string(index=False))
print("\n")

In [None]:
# Two side-by-side bar charts: test RMSE per model and feature count per model
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

bar_colors = ['skyblue', 'lightblue', 'lightcoral', 'coral']
bar_positions = range(len(results))

# One entry per panel:
# (axes, results column, y-axis label, title, label offset above the bar, label formatter)
panel_specs = [
    (axes[0], 'RMSE', 'RMSE', 'Model Performance Comparison', 0.01, '{:.4f}'.format),
    (axes[1], 'Features', 'Number of Features', 'Number of Features Used', 0.1, str),
]

for panel_ax, column, y_label, panel_title, label_offset, format_value in panel_specs:
    panel_ax.bar(bar_positions, results[column], color=bar_colors)
    panel_ax.set_xticks(bar_positions)
    panel_ax.set_xticklabels(results['Model'], rotation=45, ha='right')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.grid(axis='y', alpha=0.3)

    # Write each bar's value just above its top edge
    for i, v in enumerate(results[column]):
        panel_ax.text(i, v + label_offset, format_value(v), ha='center', va='bottom')

plt.tight_layout()
plt.show()

### Final Comments on Results:

**1. Effect of Feature Reduction:**
- Linear Regression (Full) uses all features
- Linear Regression (Reduced) drops features whose absolute correlation with the target is below 0.15
- Comparing RMSE values shows whether removing weakly correlated features affects performance
- If RMSE remains similar or improves, feature reduction is beneficial (simpler model, less overfitting)

**2. Linear Regression vs Decision Tree:**
- Linear Regression assumes linear relationships between features and target
- Decision Tree can capture non-linear relationships
- Comparing their RMSE shows which assumption fits the data better

**3. Optimization Impact:**
- The unoptimized Decision Tree uses the default `max_depth=None` (no depth limit), so it may overfit the training data
- Cross-validation finds the optimal depth to balance bias and variance
- The optimized model should show better generalization (lower RMSE on test set)

**4. Best Model Selection:**
- The model with the lowest RMSE on the test set is the best performer
- Consider also model complexity: simpler models are preferred if performance is similar
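- The optimized Decision Tree often provides a good balance between flexibility and overfitting, but the test-set RMSE comparison above should decide which model is actually best for this dataset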

**5. Practical Insights:**
- Feature selection can improve model interpretability and reduce computational cost
- Hyperparameter tuning is essential for tree-based models
- Cross-validation provides robust performance estimates