In [2]:
# --- 1. Import Necessary Libraries ---
# We only import the essentials for the core pipeline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error


In [3]:
# --- 2. Load and Briefly Inspect Data ---
# Read the CSV into `df`, then print a preview and a per-column count of
# missing values so obvious data problems are visible before modelling.
try:
    df = pd.read_csv('fuel_consumption_dataset.csv')
except FileNotFoundError:
    print("Error: Dataset file not found. Make sure 'fuel_consumption_dataset.csv' is in the correct path.")
    # Re-raise so execution stops here: every later cell needs `df`, and
    # carrying on would only surface as an unrelated NameError further down.
    raise
else:
    print("Dataset loaded successfully.")

    # Show the first few rows to understand the columns
    print(df.head())

    # Check for any obvious missing data
    print("\nMissing values per column:")
    print(df.isnull().sum())


Dataset loaded successfully.
   MODELYEAR   MAKE       MODEL VEHICLECLASS  ENGINESIZE  CYLINDERS  \
0       2014  ACURA         ILX      COMPACT         2.0          4   
1       2014  ACURA         ILX      COMPACT         2.4          4   
2       2014  ACURA  ILX HYBRID      COMPACT         1.5          4   
3       2014  ACURA     MDX 4WD  SUV - SMALL         3.5          6   
4       2014  ACURA     RDX AWD  SUV - SMALL         3.5          6   

  TRANSMISSION FUELTYPE  FUELCONSUMPTION_CITY  FUELCONSUMPTION_HWY  \
0          AS5        Z                   9.9                  6.7   
1           M6        Z                  11.2                  7.7   
2          AV7        Z                   6.0                  5.8   
3          AS6        Z                  12.7                  9.1   
4          AS6        Z                  12.1                  8.7   

   FUELCONSUMPTION_COMB  FUELCONSUMPTION_COMB_MPG  CO2EMISSIONS  
0                   8.5                        33        

In [4]:
# --- 3. Define Features (X) and Target (y) ---
# The original notebook used 5 features; for an exam a smaller set is easier to handle.
# This version keeps three predictors that the author judged the most impactful and
# least correlated: ENGINESIZE, CYLINDERS, and FUELCONSUMPTION_COMB.
target_column = 'CO2EMISSIONS'
feature_columns = [
    'ENGINESIZE',
    'CYLINDERS',
    'FUELCONSUMPTION_COMB',
]

# Label-based selection: X is a DataFrame of the predictors, y the target Series.
y = df.loc[:, target_column]
X = df.loc[:, feature_columns]

In [None]:
# --- 4. Split Data into Training and Testing Sets ---
# Hold out 20% of the rows for testing (an 80/20 split is a standard choice).
# The fixed random_state keeps the split the same from run to run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

print(f"\nData split into {len(X_train)} training samples and {len(X_test)} testing samples.")

In [None]:

# --- 5. Create and Train the Linear Regression Model ---
# Fit an ordinary least-squares model on the training split only.
# `fit` returns the estimator itself, so construction and training can be chained.
model = LinearRegression().fit(X_train, y_train)

print("Model trained.")

In [None]:

# --- 6. Evaluate the Model on the Test Set ---
# Predict on the held-out test split, which was not used for fitting.
y_pred = model.predict(X_test)

# Calculate key performance metrics.
# RMSE is the square root of MSE, so it is expressed in the units of the target.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("\n--- Model Evaluation (Test Set) ---")
# The label previously printed a mis-decoded superscript two; it now reads "R²".
print(f"R-squared (R²): {r2:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

In [None]:
# --- 7. Simple Visualization (Actual vs. Predicted) ---
# Scatter the test-set targets against the model's predictions. Points on the dashed
# red diagonal are predicted exactly; vertical distance from it is the prediction error.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.6, color='blue', label='Predictions')

# The reference diagonal spans the observed range of the actual values.
actual_range = [y_test.min(), y_test.max()]
ax.plot(actual_range, actual_range, 'r--', lw=2, label='Perfect Fit')

ax.set_title('Test Set: Actual vs. Predicted CO2 Emissions')
ax.set_xlabel('Actual CO2 Emissions')
ax.set_ylabel('Predicted CO2 Emissions')
ax.legend()
ax.grid(True)
plt.show()