In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt

In [3]:
# --- Step 1: Load the datasets ---
print("="*60)
print("MACHINE LEARNING ASSIGNMENT - LINEAR REGRESSION")
print("="*60)

try:
    # Reading from the Hotel-Property-Value-Dataset folder
    train_df = pd.read_csv('../Hotel-Property-Value-Dataset/train.csv')
    test_df = pd.read_csv('../Hotel-Property-Value-Dataset/test.csv')
    sample_submission_df = pd.read_csv('../Hotel-Property-Value-Dataset/sample_submission.csv')
    print("‚úÖ Files loaded successfully!")
    print(f"Training data shape: {train_df.shape}")
    print(f"Test data shape: {test_df.shape}")
except FileNotFoundError as e:
    print(f"‚ùå Error: {e}")
    print("Ensure the Hotel-Property-Value-Dataset folder contains train.csv, test.csv, and sample_submission.csv")
    exit()

# --- Target Variable ---
TARGET_VARIABLE = "HotelValue"

if TARGET_VARIABLE not in train_df.columns:
    print(f"‚ùå Error: The target column '{TARGET_VARIABLE}' was not found in train.csv.")
    print(f"Available columns are: {list(train_df.columns)}")
    exit()

print(f"\nüìä Target variable: {TARGET_VARIABLE}")
print(f"Target statistics:\n{train_df[TARGET_VARIABLE].describe()}")

MACHINE LEARNING ASSIGNMENT - LINEAR REGRESSION
‚úÖ Files loaded successfully!
Training data shape: (1200, 81)
Test data shape: (260, 80)

üìä Target variable: HotelValue
Target statistics:
count      1200.000000
mean     181709.895833
std       77638.660223
min       34900.000000
25%      130000.000000
50%      165000.000000
75%      215000.000000
max      745000.000000
Name: HotelValue, dtype: float64


In [8]:
# --- Step 2: Data Preprocessing (Course Concepts) ---
print(f"\n" + "="*60)
print("STEP 2: DATA PREPROCESSING")
print("="*60)

# Separate features (X) from the target (y)
X_train_full = train_df.drop([TARGET_VARIABLE], axis=1)
y_train = train_df[TARGET_VARIABLE]
X_test_full = test_df.copy()

print(f"Features in training data: {X_train_full.shape[1]}")
print(f"Features in test data: {X_test_full.shape[1]}")

# Identify numeric and categorical columns
numeric_features = X_train_full.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train_full.select_dtypes(include=['object']).columns.tolist()

print(f"\nüìà Numeric features ({len(numeric_features)}): {numeric_features[:5]}..." if len(numeric_features) > 5 else f"\nüìà Numeric features ({len(numeric_features)}): {numeric_features}")
print(f"üìù Categorical features ({len(categorical_features)}): {categorical_features[:5]}..." if len(categorical_features) > 5 else f"üìù Categorical features ({len(categorical_features)}): {categorical_features}")


STEP 2: DATA PREPROCESSING
Features in training data: 80
Features in test data: 80

üìà Numeric features (37): ['Id', 'PropertyClass', 'RoadAccessLength', 'LandArea', 'OverallQuality']...
üìù Categorical features (43): ['ZoningCategory', 'RoadType', 'ServiceLaneType', 'PlotShape', 'LandElevation']...


In [9]:
# --- Step 3: Handle Missing Values and Encode Categorical Variables ---
def preprocess_data(X_train, X_test, numeric_features, categorical_features):
    """
    Preprocess the data by handling missing values and encoding categorical variables
    Following course preprocessing concepts
    """
    X_train_processed = X_train.copy()
    X_test_processed = X_test.copy()
    
    # Handle numeric features - Fill with median (robust to outliers)
    for col in numeric_features:
        if col in X_train_processed.columns:
            median_val = X_train_processed[col].median()
            X_train_processed[col].fillna(median_val, inplace=True)
            X_test_processed[col].fillna(median_val, inplace=True)
    
    # Handle categorical features - Label encoding
    label_encoders = {}
    for col in categorical_features:
        if col in X_train_processed.columns:
            # Fill missing values with mode (most frequent value)
            mode_val = X_train_processed[col].mode()[0] if not X_train_processed[col].mode().empty else 'Unknown'
            X_train_processed[col].fillna(mode_val, inplace=True)
            X_test_processed[col].fillna(mode_val, inplace=True)
            
            # Label encode categorical variables
            le = LabelEncoder()
            # Fit on combined data to handle unseen categories in test set
            combined_data = pd.concat([X_train_processed[col], X_test_processed[col]], axis=0)
            le.fit(combined_data)
            
            X_train_processed[col] = le.transform(X_train_processed[col])
            X_test_processed[col] = le.transform(X_test_processed[col])
            label_encoders[col] = le
    
    return X_train_processed, X_test_processed, label_encoders

# Preprocess the data
X_train_processed, X_test_processed, label_encoders = preprocess_data(
    X_train_full, X_test_full, numeric_features, categorical_features
)

print(f"\nAfter preprocessing:")
print(f"‚úÖ Training data shape: {X_train_processed.shape}")
print(f"‚úÖ Test data shape: {X_test_processed.shape}")
print(f"‚úÖ Missing values in training data: {X_train_processed.isnull().sum().sum()}")
print(f"‚úÖ Missing values in test data: {X_test_processed.isnull().sum().sum()}")



After preprocessing:
‚úÖ Training data shape: (1200, 80)
‚úÖ Test data shape: (260, 80)
‚úÖ Missing values in training data: 0
‚úÖ Missing values in test data: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_processed[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_test_processed[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we a

In [10]:
# --- Step 4: Feature Scaling (Standardization) ---
print(f"\n" + "="*60)
print("STEP 3: FEATURE SCALING")
print("="*60)

# Scale all features using standardization (mean=0, std=1)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_processed)
X_test_scaled = scaler.transform(X_test_processed)

print(f"‚úÖ Features scaled successfully!")
print(f"Training features shape: {X_train_scaled.shape}")
print(f"Test features shape: {X_test_scaled.shape}")
print(f"Feature means after scaling: {np.mean(X_train_scaled, axis=0)[:5]} (should be ~0)")
print(f"Feature stds after scaling: {np.std(X_train_scaled, axis=0)[:5]} (should be ~1)")

# Convert back to numpy arrays for mathematical operations
y_train = y_train.values


STEP 3: FEATURE SCALING
‚úÖ Features scaled successfully!
Training features shape: (1200, 80)
Test features shape: (260, 80)
Feature means after scaling: [-9.47390314e-17  1.77635684e-17  2.54611147e-16 -7.69754630e-17
 -6.21724894e-17] (should be ~0)
Feature stds after scaling: [1. 1. 1. 1. 1.] (should be ~1)


In [11]:
# === STEP 4: LINEAR REGRESSION IMPLEMENTATION FROM SCRATCH ===
# Following course concepts: Normal Equation Method

print(f"\n" + "="*60)
print("STEP 4: LINEAR REGRESSION - NORMAL EQUATION METHOD")
print("="*60)

# Add bias term (intercept) - Design Matrix X with ones column
X_b_train = np.c_[np.ones((X_train_scaled.shape[0], 1)), X_train_scaled]
X_b_test = np.c_[np.ones((X_test_scaled.shape[0], 1)), X_test_scaled]

print("üìê Normal Equation Implementation:")
print("Œ∏ = (X^T X)^(-1) X^T y")
print(f"Design matrix shape: {X_b_train.shape}")

try:
    # Normal equation: Œ∏ = (X^T X)^(-1) X^T y
    # Using pseudoinverse for numerical stability
    XTX = X_b_train.T.dot(X_b_train)
    XTy = X_b_train.T.dot(y_train)
    
    # Compute inverse using pseudoinverse (more stable than direct inverse)
    XTX_inv = np.linalg.pinv(XTX)
    weights = XTX_inv.dot(XTy)
    
    print("‚úÖ Model training completed successfully!")
    print(f"Number of parameters (including bias): {len(weights)}")
    print(f"Bias term (intercept): {weights[0]:.2f}")
    print(f"First 5 feature weights: {weights[1:6]}")
    
except np.linalg.LinAlgError as e:
    print(f"‚ùå Error in normal equation computation: {e}")
    print("The matrix might be singular. Consider using regularization.")
    exit()

# Make predictions
train_predictions = X_b_train.dot(weights)
test_predictions = X_b_test.dot(weights)


STEP 4: LINEAR REGRESSION - NORMAL EQUATION METHOD
üìê Normal Equation Implementation:
Œ∏ = (X^T X)^(-1) X^T y
Design matrix shape: (1200, 81)
‚úÖ Model training completed successfully!
Number of parameters (including bias): 81
Bias term (intercept): 181709.90
First 5 feature weights: [ -391.28508795 -4022.24917556 -1685.55390204 -4082.45730985
  4347.9637522 ]


In [12]:
# === STEP 5: ERROR FUNCTION ANALYSIS ===
# Following course concepts: Multiple Error Functions for Model Evaluation

print(f"\n" + "="*60)
print("STEP 5: ERROR FUNCTION ANALYSIS")
print("="*60)

def calculate_error_functions(y_true, y_pred):
    """
    Calculate multiple error functions as taught in course
    """
    n = len(y_true)
    
    # 1. Mean Squared Error (MSE) - L2 Loss
    mse = np.mean((y_true - y_pred) ** 2)
    
    # 2. Root Mean Squared Error (RMSE)
    rmse = np.sqrt(mse)
    
    # 3. Mean Absolute Error (MAE) - L1 Loss
    mae = np.mean(np.abs(y_true - y_pred))
    
    # 4. R-squared (Coefficient of Determination)
    ss_res = np.sum((y_true - y_pred) ** 2)  # Residual sum of squares
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)  # Total sum of squares
    r2 = 1 - (ss_res / ss_tot)
    
    # 5. Adjusted R-squared
    n_features = X_train_scaled.shape[1]
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - n_features - 1)
    
    # 6. Mean Absolute Percentage Error (MAPE)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    
    return {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R¬≤': r2,
        'Adjusted R¬≤': adj_r2,
        'MAPE': mape
    }

# Calculate error functions for training data
train_errors = calculate_error_functions(y_train, train_predictions)

print("üìä TRAINING SET ERROR ANALYSIS:")
print("-" * 40)
for metric, value in train_errors.items():
    if metric in ['MAPE']:
        print(f"{metric:12}: {value:.2f}%")
    elif metric in ['MSE']:
        print(f"{metric:12}: {value:,.0f}")
    elif metric in ['RMSE', 'MAE']:
        print(f"{metric:12}: ${value:,.2f}")
    else:
        print(f"{metric:12}: {value:.4f}")

# Analysis of prediction quality
print(f"\nüìà PREDICTION QUALITY ANALYSIS:")
print("-" * 40)
residuals = y_train - train_predictions
print(f"Mean residual: ${np.mean(residuals):,.2f}")
print(f"Std of residuals: ${np.std(residuals):,.2f}")
print(f"Min prediction: ${np.min(train_predictions):,.2f}")
print(f"Max prediction: ${np.max(train_predictions):,.2f}")
print(f"Predictions in range [0, max_actual]: {np.sum((train_predictions >= 0) & (train_predictions <= np.max(y_train))) / len(train_predictions) * 100:.1f}%")


STEP 5: ERROR FUNCTION ANALYSIS
üìä TRAINING SET ERROR ANALYSIS:
----------------------------------------
MSE         : 797,912,991
RMSE        : $28,247.35
MAE         : $17,996.59
R¬≤          : 0.8675
Adjusted R¬≤ : 0.8580
MAPE        : 10.44%

üìà PREDICTION QUALITY ANALYSIS:
----------------------------------------
Mean residual: $0.00
Std of residuals: $28,247.35
Min prediction: $28,764.84
Max prediction: $655,640.59
Predictions in range [0, max_actual]: 100.0%


In [13]:
# === STEP 6: MODEL DIAGNOSTICS AND VALIDATION ===
# Following course concepts: Model Validation and Diagnostics

print(f"\n" + "="*60)
print("STEP 6: MODEL DIAGNOSTICS AND VALIDATION")
print("="*60)

# 1. Prediction Examples (Sample Analysis)
print("üîç PREDICTION EXAMPLES (First 10 samples):")
print("-" * 50)
comparison_df = pd.DataFrame({
    'Actual': y_train[:10],
    'Predicted': train_predictions[:10],
    'Residual': y_train[:10] - train_predictions[:10],
    'Abs_Error': np.abs(y_train[:10] - train_predictions[:10])
})
print(comparison_df.round(2))

# 2. Model Complexity Analysis
print(f"\nüìä MODEL COMPLEXITY ANALYSIS:")
print("-" * 40)
print(f"Number of training samples: {len(y_train)}")
print(f"Number of features: {X_train_scaled.shape[1]}")
print(f"Parameters to data ratio: {len(weights)}/{len(y_train)} = {len(weights)/len(y_train):.4f}")

# 3. Weight Analysis
print(f"\n‚öñÔ∏è LEARNED PARAMETERS ANALYSIS:")
print("-" * 40)
print(f"Intercept (bias): {weights[0]:.4f}")
print(f"Largest positive weight: {np.max(weights[1:]):.4f}")
print(f"Largest negative weight: {np.min(weights[1:]):.4f}")
print(f"Weight standard deviation: {np.std(weights[1:]):.4f}")

# 4. Residual Analysis
print(f"\nüìâ RESIDUAL ANALYSIS:")
print("-" * 40)
residuals = y_train - train_predictions
print(f"Residual mean (should be ~0): {np.mean(residuals):.4f}")
print(f"Residual std: {np.std(residuals):.2f}")
print(f"Residual skewness: {np.mean(((residuals - np.mean(residuals)) / np.std(residuals)) ** 3):.4f}")

# 5. Prediction Bounds Analysis
negative_predictions = np.sum(train_predictions < 0)
if negative_predictions > 0:
    print(f"\n‚ö†Ô∏è WARNING: {negative_predictions} negative predictions detected!")
    print("This suggests the model may need constraints or regularization.")

print(f"\n‚úÖ MODEL VALIDATION COMPLETE!")


STEP 6: MODEL DIAGNOSTICS AND VALIDATION
üîç PREDICTION EXAMPLES (First 10 samples):
--------------------------------------------------
     Actual  Predicted   Residual  Abs_Error
0  395000.0  290355.17  104644.83  104644.83
1  165000.0  191453.87  -26453.87   26453.87
2  128200.0  122185.24    6014.76    6014.76
3  275000.0  250479.69   24520.31   24520.31
4  311872.0  342726.02  -30854.02   30854.02
5  214000.0  233483.13  -19483.13   19483.13
6  153500.0  183654.24  -30154.24   30154.24
7  144000.0  148961.69   -4961.69    4961.69
8  115000.0  118072.04   -3072.04    3072.04
9  180000.0  170391.32    9608.68    9608.68

üìä MODEL COMPLEXITY ANALYSIS:
----------------------------------------
Number of training samples: 1200
Number of features: 80
Parameters to data ratio: 81/1200 = 0.0675

‚öñÔ∏è LEARNED PARAMETERS ANALYSIS:
----------------------------------------
Intercept (bias): 181709.8958
Largest positive weight: 16213.7466
Largest negative weight: -21155.7561
Weight standa

In [14]:
# === STEP 7: MATHEMATICAL ANALYSIS ===
# Following course concepts: Mathematical foundations of Linear Regression

print(f"\n" + "="*60)
print("STEP 7: MATHEMATICAL ANALYSIS")
print("="*60)

# 1. Normal Equation Derivation Verification
print("üìê NORMAL EQUATION MATHEMATICAL VERIFICATION:")
print("-" * 50)
print("Objective: Minimize ||XŒ∏ - y||¬≤")
print("Solution: Œ∏ = (X·µÄX)‚Åª¬πX·µÄy")
print()

# Check if X·µÄX is invertible
XTX = X_b_train.T.dot(X_b_train)
det_XTX = np.linalg.det(XTX)
cond_number = np.linalg.cond(XTX)

print(f"X·µÄX matrix shape: {XTX.shape}")
print(f"X·µÄX determinant: {det_XTX:.2e}")
print(f"X·µÄX condition number: {cond_number:.2e}")

if cond_number > 1e12:
    print("‚ö†Ô∏è Warning: High condition number suggests near-singular matrix")
else:
    print("‚úÖ Matrix is well-conditioned")

# 2. Gradient Analysis (Verify optimality)
print(f"\nüéØ GRADIENT ANALYSIS (Optimality Check):")
print("-" * 50)
# At optimal solution, gradient should be zero: ‚àáf(Œ∏) = 2X·µÄXŒ∏ - 2X·µÄy = 0
gradient = 2 * X_b_train.T.dot(X_b_train.dot(weights) - y_train)
gradient_norm = np.linalg.norm(gradient)

print(f"Gradient norm at solution: {gradient_norm:.2e}")
if gradient_norm < 1e-10:
    print("‚úÖ Gradient is effectively zero - optimal solution found")
else:
    print("‚ö†Ô∏è Non-zero gradient - check numerical precision")

# 3. Degrees of Freedom Analysis
print(f"\nüî¢ DEGREES OF FREEDOM ANALYSIS:")
print("-" * 50)
n_samples = len(y_train)
n_params = len(weights)
df_residual = n_samples - n_params

print(f"Number of samples: {n_samples}")
print(f"Number of parameters: {n_params}")
print(f"Degrees of freedom (residual): {df_residual}")
print(f"Parameter/Sample ratio: {n_params/n_samples:.3f}")

# 4. Variance-Bias Analysis Framework
print(f"\n‚öñÔ∏è MODEL COMPLEXITY METRICS:")
print("-" * 50)
# Effective degrees of freedom and model complexity
trace_hat_matrix = np.trace(X_b_train.dot(np.linalg.pinv(XTX)).dot(X_b_train.T))
print(f"Effective degrees of freedom: {trace_hat_matrix:.2f}")
print(f"Model complexity index: {trace_hat_matrix/n_samples:.3f}")

if trace_hat_matrix/n_samples > 0.1:
    print("‚ö†Ô∏è High complexity model - potential overfitting risk")
else:
    print("‚úÖ Appropriate model complexity")


STEP 7: MATHEMATICAL ANALYSIS
üìê NORMAL EQUATION MATHEMATICAL VERIFICATION:
--------------------------------------------------
Objective: Minimize ||XŒ∏ - y||¬≤
Solution: Œ∏ = (X·µÄX)‚Åª¬πX·µÄy

X·µÄX matrix shape: (81, 81)
X·µÄX determinant: -2.18e+205
X·µÄX condition number: 4.65e+16

üéØ GRADIENT ANALYSIS (Optimality Check):
--------------------------------------------------


Gradient norm at solution: 1.42e-06
‚ö†Ô∏è Non-zero gradient - check numerical precision

üî¢ DEGREES OF FREEDOM ANALYSIS:
--------------------------------------------------
Number of samples: 1200
Number of parameters: 81
Degrees of freedom (residual): 1119
Parameter/Sample ratio: 0.068

‚öñÔ∏è MODEL COMPLEXITY METRICS:
--------------------------------------------------
Effective degrees of freedom: 79.00
Model complexity index: 0.066
‚úÖ Appropriate model complexity


In [None]:
# === STEP 8: KAGGLE SUBMISSION PREPARATION ===
# Final step: Prepare submission file for Kaggle competition

print(f"\n" + "="*60)
print("STEP 8: KAGGLE SUBMISSION PREPARATION")
print("="*60)

# Load test data and make predictions
print("üìÅ LOADING TEST DATA AND MAKING PREDICTIONS:")
print("-" * 50)

# Test data was already preprocessed in Step 2, use X_test_scaled
print(f"Test data shape: {X_test_scaled.shape}")

# Add bias term to test data
X_b_test = np.c_[np.ones((X_test_scaled.shape[0], 1)), X_test_scaled]

# Make predictions using our trained model
test_predictions = X_b_test.dot(weights)

print(f"‚úÖ Test predictions generated for {len(test_predictions)} samples")
print(f"Prediction range: [{test_predictions.min():.2f}, {test_predictions.max():.2f}]")

# Create submission file using actual test IDs
submission_df = pd.DataFrame({
    'Id': test_df['Id'].values,  # Use actual IDs from test dataset
    'HotelValue': test_predictions  # Use HotelValue as per sample submission format
})

# Save submission file
submission_path = '/Users/hemanthmada/vscodeProjects/ml_assignment_1/submissions/linear_regression_1.csv'
submission_df.to_csv(submission_path, index=False)

print(f"üì§ SUBMISSION FILE CREATED:")
print("-" * 50)
print(f"File saved at: {submission_path}")
print(f"Submission shape: {submission_df.shape}")
print("\nFirst 5 predictions:")
print(submission_df.head())

# Final summary of the entire linear regression implementation
print(f"\n" + "="*60)
print("üéì COURSE PROJECT SUMMARY - LINEAR REGRESSION")
print("="*60)
print("‚úÖ Step 1: Data Loading and Exploration - COMPLETED")
print("‚úÖ Step 2: Data Preprocessing and Feature Engineering - COMPLETED")
print("‚úÖ Step 3: Normal Equation Implementation - COMPLETED")
print("‚úÖ Step 4: Model Training - COMPLETED")
print("‚úÖ Step 5: Comprehensive Error Function Analysis - COMPLETED")
print("‚úÖ Step 6: Model Diagnostics and Validation - COMPLETED")
print("‚úÖ Step 7: Mathematical Analysis - COMPLETED")
print("‚úÖ Step 8: Kaggle Submission Preparation - COMPLETED")
print("\nüèÜ LINEAR REGRESSION MODEL READY FOR COURSE EVALUATION!")


STEP 8: KAGGLE SUBMISSION PREPARATION
üìÅ LOADING TEST DATA AND MAKING PREDICTIONS:
--------------------------------------------------
Test data shape: (260, 80)
‚úÖ Test predictions generated for 260 samples
Prediction range: [-93673.17, 526824.15]
üì§ SUBMISSION FILE CREATED:
--------------------------------------------------
File saved at: /Users/hemanthmada/vscodeProjects/ml_assignment_1/submission.csv
Submission shape: (260, 2)

First 5 predictions:
     Id     HotelValue
0   893  148643.079254
1  1106  326768.889709
2   414  108922.284974
3   523  172606.454613
4  1037  316243.612234

üéì COURSE PROJECT SUMMARY - LINEAR REGRESSION
‚úÖ Step 1: Data Loading and Exploration - COMPLETED
‚úÖ Step 2: Data Preprocessing and Feature Engineering - COMPLETED
‚úÖ Step 3: Normal Equation Implementation - COMPLETED
‚úÖ Step 4: Model Training - COMPLETED
‚úÖ Step 5: Comprehensive Error Function Analysis - COMPLETED
‚úÖ Step 6: Model Diagnostics and Validation - COMPLETED
‚úÖ Step 7: Mathem