# Electricity Price Prediction
## 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
%matplotlib inline

## 2. Load and Explore Data

In [None]:
# Load the training and testing datasets from CSV files; the paths are
# relative, so they resolve against the current working directory.
train_data = pd.read_csv("2018_CI_Assignment_Training_Data.csv")
test_data = pd.read_csv("2018_CI_Assignment_Testing_Data.csv")

# Display the first few rows of the training data
# (display is the notebook's rich-output helper, not a plain print)
print("Training Data Head:")
display(train_data.head())

# Basic summary statistics of the training data via DataFrame.describe()
print("\nTraining Data Description:")
display(train_data.describe())

## 3. Data Visualization - Price Distribution

In [None]:
# Histogram (30 bins, with a KDE curve overlaid) of the training column at
# positional index 6, which the title and axis label present as the price.
# NOTE(review): later cells take the target from the last column
# (iloc[:, -1]) - confirm that index 6 refers to that same column.
plt.figure(figsize=(10, 6))
sns.histplot(train_data.iloc[:, 6], bins=30, kde=True)
plt.title('Distribution of Electricity Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

## 4. Data Visualization - Correlation Heatmap

In [None]:
# Annotated heatmap of the pairwise correlations between the training
# columns, as returned by DataFrame.corr(); cells are labelled to two decimals.
# NOTE(review): on recent pandas versions corr() is expected to raise if any
# column is non-numeric unless numeric_only=True is passed - confirm that
# every column in the CSV is numeric.
plt.figure(figsize=(12, 8))
sns.heatmap(train_data.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()

## 5. Data Preprocessing - Outlier Removal

In [None]:
def remove_outliers(data, col_idx=6, factor=1.5):
    """Drop rows whose value in one column lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of the selected column. Values
    lying exactly on a fence are kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to filter; it is not modified.
    col_idx : int, default 6
        Positional index of the column to screen for outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR when placing the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` whose screened value is within the fences, with
        their original index labels. Rows where the screened value is NaN
        are dropped, because NaN never compares as inside the fences.
    """
    values = data.iloc[:, col_idx]
    # Series.quantile skips NaN entries. np.percentile would return NaN as
    # soon as one value is missing, which turns both fences into NaN and
    # silently filters out every row.
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # between() is inclusive on both ends, matching the >= / <= comparison.
    return data[values.between(lower_bound, upper_bound)]

# Filter the training rows with the IQR rule (the function's default column
# index is used) and report the row counts before and after.
train_data_clean = remove_outliers(train_data)
print("Original training data size:", train_data.shape[0])
print("Training data size after removing outliers:", train_data_clean.shape[0])

## 6. Prepare Features and Target

In [None]:
# Positional split of each frame: the final column is the target and every
# column before it is a feature.
y_train = train_data_clean.iloc[:, -1]
X_train = train_data_clean.iloc[:, :-1]
y_test = test_data.iloc[:, -1]
X_test = test_data.iloc[:, :-1]

# Fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nFeature scaling complete")
print("Training set shape:", X_train_scaled.shape)
print("Test set shape:", X_test_scaled.shape)

## 7. Train Linear Regression Model

In [None]:
# Fit a linear regression on the scaled training features; fit() returns the
# estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train_scaled, y_train)
print("Model training complete")

## 8. Model Evaluation

In [None]:
# Training split: predictions, R² and RMSE (square root of the MSE)
y_train_pred = model.predict(X_train_scaled)
train_r2 = r2_score(y_train, y_train_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

# Test split: the same metrics, computed on the held-out data
y_test_pred = model.predict(X_test_scaled)
test_r2 = r2_score(y_test, y_test_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

# Report RMSE first, then R², each for train followed by test
print("\nModel Performance:")
print(f"Train RMSE: {train_rmse:.2f}")
print(f"Test RMSE: {test_rmse:.2f}")
print(f"Train R² Score: {train_r2:.4f}")
print(f"Test R² Score: {test_r2:.4f}")

## 9. Visualize Predictions vs Actual

In [None]:
# Scatter of actual test targets (x) against the model's predictions (y).
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_test_pred, alpha=0.5)
# Dashed red y = x reference line spanning the range of y_test; points on
# this line are predictions that equal the actual value.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Actual vs Predicted Prices')
plt.tight_layout()
plt.show()

## 10. Feature Importance

In [None]:
# Use the absolute value of each fitted coefficient as that feature's
# importance score, then rank features from largest to smallest score.
feature_importance = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': np.abs(model.coef_)}
)
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranked scores
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='Importance', y='Feature')
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

# Show the same ranking as a table
print("\nFeature Importance:")
display(feature_importance)