In [12]:
# ==========================================
# Part 1: Fuel Consumption -> Horsepower
# ==========================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1. Load data
# The CSV is read from the notebook's working directory, so the filename
# has to match the uploaded file exactly.
file_name = 'FuelEconomy.csv'
df = pd.read_csv(file_name)

print("Data loaded. Shape:", df.shape)

# 2. Preprocessing
# Discard incomplete rows, but only when at least one cell is actually missing.
if df.isna().any().any():
    df = df.dropna()

# Single predictor (kept two-dimensional as a one-column DataFrame) and the
# target Series.
X = df[['Fuel Economy (MPG)']]
y = df['Horse Power']

# 3. Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Helper to calculate metrics
def get_metrics(model, X_tr, X_te, y_tr, y_te):
    """Score a fitted model on its training and test partitions.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X_tr, X_te : array-like
        Feature matrices for the training and test partitions.
    y_tr, y_te : array-like
        True targets matching ``X_tr`` and ``X_te``.

    Returns
    -------
    dict
        MSE, MAE and R2 for each partition, under the keys 'Train MSE',
        'Train MAE', 'Train R2', 'Test MSE', 'Test MAE', 'Test R2'
        (inserted in that order).
    """
    scorers = (
        ('MSE', mean_squared_error),
        ('MAE', mean_absolute_error),
        ('R2', r2_score),
    )
    partitions = (
        ('Train', X_tr, y_tr),
        ('Test', X_te, y_te),
    )

    metrics = {}
    for split_name, features, truth in partitions:
        # Predict once per partition and reuse the result for all three scores.
        predicted = model.predict(features)
        for metric_name, scorer in scorers:
            metrics[f'{split_name} {metric_name}'] = scorer(truth, predicted)
    return metrics

# Metrics for every fitted model, keyed by a human-readable model label.
results = {}

# Model A: plain linear regression on the single MPG feature.
lr = LinearRegression().fit(X_train, y_train)
results['Linear Regression'] = get_metrics(lr, X_train, X_test, y_train, y_test)

# Model B: polynomial regression, i.e. linear regression on the powers of
# MPG up to degree d (no bias column; LinearRegression fits the intercept).
for d in (2, 3, 4):
    # The expansion is fitted on the training split and then applied
    # unchanged to the test split.
    poly = PolynomialFeatures(degree=d, include_bias=False)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    poly_model = LinearRegression().fit(X_train_poly, y_train)

    results[f'Poly (deg={d})'] = get_metrics(
        poly_model, X_train_poly, X_test_poly, y_train, y_test
    )

# 4. Show results: one row per model, metric columns in a fixed order.
results_df = pd.DataFrame(results).T
cols = ['Train MSE', 'Train MAE', 'Train R2', 'Test MSE', 'Test MAE', 'Test R2']
print("\nModel Performance Summary:")
display(results_df[cols])

Data loaded. Shape: (100, 2)

Model Performance Summary:


Unnamed: 0,Train MSE,Train MAE,Train R2,Test MSE,Test MAE,Test R2
Linear Regression,357.69918,16.061689,0.90632,318.561087,14.940628,0.912561
Poly (deg=2),350.879731,15.995824,0.908106,331.105434,15.14833,0.909118
Poly (deg=3),345.108668,15.746762,0.909618,318.404012,14.764973,0.912604
Poly (deg=4),339.700171,15.508465,0.911034,313.798757,14.735471,0.913868


# Part 2: Weather -> Electricity Prediction

In [13]:
# ==========================================
# Part 2: Weather -> Electricity Prediction
# ==========================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1. Load data
# The CSV is read from the notebook's working directory.
file_name = 'electricity_consumption_based_weather_dataset.csv'
df2 = pd.read_csv(file_name)

print("Dataset Shape:", df2.shape)
# print(df2.columns) # Check columns if needed

# 2. Data cleaning
# Remove every row with a missing value in any column. dropna() is already a
# no-op on complete data, so no separate null-count guard is needed.
df2 = df2.dropna()

# 3. Feature selection
target_col = 'daily_consumption'

# Fail fast with a descriptive message when the target is absent, instead of
# silently skipping the drop and then hitting a bare KeyError on the lookup.
if target_col not in df2.columns:
    raise KeyError(
        f"Target column {target_col!r} not found in {file_name!r}; "
        f"available columns: {list(df2.columns)}"
    )

# Features are all numeric columns except the target; non-numeric columns
# (like dates) are excluded by select_dtypes.
X = df2.drop(columns=[target_col]).select_dtypes(include=[np.number])
y = df2[target_col]

# Without at least one numeric feature the models below cannot be fitted;
# report that here rather than from inside the estimator.
if X.shape[1] == 0:
    raise ValueError(
        f"No numeric feature columns remain in {file_name!r} after removing {target_col!r}."
    )

print(f"Features used: {list(X.columns)}")

# 4. Train/test split (70% train, 30% test); the fixed seed keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Helper function (if not already defined in Part 1 cell)
def get_metrics(model, X_tr, X_te, y_tr, y_te):
    """Return train and test error metrics for an already-fitted model.

    ``model`` must expose ``predict``. ``X_tr``/``y_tr`` are the training
    features and targets, ``X_te``/``y_te`` the held-out ones. The result is
    a dict holding mean squared error, mean absolute error and R2 for each
    partition, keyed 'Train MSE', 'Train MAE', 'Train R2', 'Test MSE',
    'Test MAE' and 'Test R2'.
    """
    train_pred = model.predict(X_tr)
    test_pred = model.predict(X_te)

    scores = {}

    # Fit quality on the data the model was trained on.
    scores['Train MSE'] = mean_squared_error(y_tr, train_pred)
    scores['Train MAE'] = mean_absolute_error(y_tr, train_pred)
    scores['Train R2'] = r2_score(y_tr, train_pred)

    # Generalization quality on the held-out data.
    scores['Test MSE'] = mean_squared_error(y_te, test_pred)
    scores['Test MAE'] = mean_absolute_error(y_te, test_pred)
    scores['Test R2'] = r2_score(y_te, test_pred)

    return scores

# Metrics for each Part 2 model, keyed by model label.
results_p2 = {}

# Model A: linear regression on the raw weather features.
lr = LinearRegression().fit(X_train, y_train)
results_p2['Linear Regression'] = get_metrics(lr, X_train, X_test, y_train, y_test)

# Model B: polynomial regression (degrees 2, 3, 4).
# NOTE(review): in the recorded run the degree-4 model reaches a train R2 of
# about 0.33 but a test R2 of about -33, so its test metrics indicate a model
# that does not generalize rather than a usable fit.
for d in (2, 3, 4):
    # Fit the expansion on the training split, then apply it unchanged to
    # the test split.
    poly = PolynomialFeatures(degree=d, include_bias=False)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    model = LinearRegression().fit(X_train_poly, y_train)

    results_p2[f'Poly (deg={d})'] = get_metrics(
        model, X_train_poly, X_test_poly, y_train, y_test
    )

# 5. Results table: one row per model, metric columns in a fixed order.
results_df_p2 = pd.DataFrame(results_p2).T
cols = ['Train MSE', 'Train MAE', 'Train R2', 'Test MSE', 'Test MAE', 'Test R2']

print("\n=== Part 2: Model Performance ===")
display(results_df_p2[cols])

Dataset Shape: (1433, 6)
Features used: ['AWND', 'PRCP', 'TMAX', 'TMIN']

=== Part 2: Model Performance ===


Unnamed: 0,Train MSE,Train MAE,Train R2,Test MSE,Test MAE,Test R2
Linear Regression,272403.396174,384.465016,0.276,248125.8,375.404537,0.299333
Poly (deg=2),264765.769932,379.648753,0.2963,255268.5,379.039083,0.279163
Poly (deg=3),259249.53487,375.952901,0.310961,265623.7,385.235167,0.249922
Poly (deg=4),251909.339001,372.116566,0.33047,12151490.0,578.642201,-33.313844
