# Task 1: Predictive Modeling (Regression) 
  Description: Build and evaluate a regression model to predict a continuous variable (e.g., house prices).
# Objectives:
    Split the dataset into training and testing sets.
    Train a linear regression model using scikit-learn.
    Evaluate the model using performance metrics like
    mean squared error (MSE) and R-squared.
    Experiment with multiple models (e.g., Decision Trees,
    Random Forest) and compare performance.
    Tools: Python, scikit-learn, pandas, matplotlib.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [3]:
# Load the cleaned dataset
df = pd.read_csv('Data/Data/4) house Prediction Data Set.csv')

In [5]:
df

Unnamed: 0,0.00632 18.00 2.310 0 0.5380 6.5750 65.20 4.0900 1 296.0 15.30 396.90 4.98 24.00
0,0.02731 0.00 7.070 0 0.4690 6.4210 78...
1,0.02729 0.00 7.070 0 0.4690 7.1850 61...
2,0.03237 0.00 2.180 0 0.4580 6.9980 45...
3,0.06905 0.00 2.180 0 0.4580 7.1470 54...
4,0.02985 0.00 2.180 0 0.4580 6.4300 58...
...,...
500,0.06263 0.00 11.930 0 0.5730 6.5930 69...
501,0.04527 0.00 11.930 0 0.5730 6.1200 76...
502,0.06076 0.00 11.930 0 0.5730 6.9760 91...
503,0.10959 0.00 11.930 0 0.5730 6.7940 89...


In [None]:
#split the dataset into training and testing set
X = df.drop(df['Price'])
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#train a linear regression model using
model=LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [None]:


# Set up plotting style
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)



# Step 1: Prepare Data


# Step 2: Split Data into Training and Testing Sets


# Step 3: Train and Evaluate Models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': 
    'Random Forest': 
}

# Initialize lists to store performance metrics
performance = {
    'Model': [],
    'MSE': [],
    'R-squared': []
}

# Create output directory for plots
os.makedirs('model_plots', exist_ok=True)

# Train, predict, evaluate, and plot for each model
for model_name, model in models.items():
    # Train the model
    
    
    # Calculate metrics

    
    # Store metrics
    performance['Model'].append(model_name)
    performance['MSE'].append(mse)
    performance['R-squared'].append(r2)
    
    # Plot actual vs predicted prices
    plt.figure()
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.xlabel('Actual Price (Standardized)')
    plt.ylabel('Predicted Price (Standardized)')
    plt.title(f'Actual vs Predicted Prices ({model_name})')
    plt.savefig(f'model_plots/actual_vs_predicted_{model_name.lower().replace(" ", "_")}.png')
    plt.close()

# Step 4: Summarize Performance
performance_df = pd.DataFrame(performance)
performance_df = performance_df.sort_values('MSE')  # Sort by MSE (lower is better)

# Save performance table
performance_df.to_csv('model_performance.csv', index=False)

# Step 5: Generate Summary Report
report = f"""
Predictive Modeling Report
=========================
Dataset: Amazon Laptops (cleaned_amazon_laptops.csv)
Target Variable: Price (Standardized)
Date: September 30, 2025

1. Model Performance
-------------------
{performance_df.to_string(index=False)}

2. Insights
-----------
- Best Model: {performance_df.iloc[0]['Model']} (Lowest MSE: {performance_df.iloc[0]['MSE']:.4f}, Highest R-squared: {performance_df.iloc[0]['R-squared']:.4f})
- Linear Regression: Assumes linear relationships. Performance depends on feature correlations with Price.
- Decision Tree: Captures non-linear patterns but may overfit with default parameters.
- Random Forest: Ensemble method, likely more robust due to averaging multiple trees.
- MSE Interpretation: Lower MSE indicates better prediction accuracy. Since Price is standardized, MSE is in squared standardized units.
- R-squared Interpretation: Higher R-squared (closer to 1) indicates better fit. Negative R-squared (if any) suggests poor model performance.

3. Visualizations
-----------------
- Actual vs Predicted Price plots for each model are saved in 'model_plots' directory.
- A 45-degree line indicates perfect predictions. Points closer to the line suggest better model accuracy.

4. Recommendations
------------------
- If Random Forest performs best, consider hyperparameter tuning (e.g., n_estimators, max_depth) for further improvement.
- If Linear Regression underperforms, check feature correlations (from EDA) to confirm if linear assumptions hold.
- For production, consider feature engineering (e.g., extract brand from Title) or additional features (e.g., laptop specs).
- Evaluate feature importance from Random Forest to identify key predictors of Price.

Performance table saved as 'model_performance.csv'.
Visualizations saved in 'model_plots' directory.
"""

# Save the report
with open('model_report.txt', 'w') as f:
    f.write(report)

# Print performance table and confirmation
print("Model Performance:")
print(performance_df)
print("\nPerformance table saved as 'model_performance.csv'.")
print("Visualizations saved in 'model_plots' directory.")
print("Modeling report saved as 'model_report.txt'.")