In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the raw dataset (relative path, resolved against the current working
# directory) and preview the first rows as the cell output.
df = pd.read_csv('fuel_consumption_dataset.csv')
df.head()

In [None]:
# Print the frame's summary (DataFrame.info) for a quick schema check.
df.info()

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# Descriptive statistics of the raw frame (DataFrame.describe defaults).
df.describe()

In [None]:
# Outlier removal.
#
# Later cells (correlation matrix, feature/target selection) read `clean_df`,
# so it has to be defined here for the notebook to survive
# "Restart Kernel -> Run All".
#
# The list is deliberately NOT called `feature_columns`: a later cell defines
# `feature_columns` as the model inputs, which is a different set of columns.
outlier_columns = ['ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_CITY',
                   'FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB',
                   'FUELCONSUMPTION_COMB_MPG', 'CO2EMISSIONS']


def remove_iqr_outliers(frame, columns, k=1.5):
    """Return a copy of ``frame`` without rows that are IQR outliers.

    A row is kept only if, for every column in ``columns``, its value lies
    inside ``[Q1 - k * IQR, Q3 + k * IQR]``. All bounds are computed from the
    unfiltered ``frame``, and the per-column conditions are AND-ed into one
    mask. (Re-assigning the result from the unfiltered frame inside the loop
    would keep only the last column's filter.)

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified.
    columns : iterable of str
        Numeric columns to screen. A missing column raises ``KeyError``.
    k : float, default 1.5
        Whisker length as a multiple of the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        Filtered copy of ``frame`` with the original index labels preserved.
        Rows holding NaN in any screened column are dropped, because NaN
        never satisfies the range check.
    """
    keep = np.ones(len(frame), dtype=bool)
    for col in columns:
        # Series.quantile skips NaN, so a few missing values do not turn the
        # bounds into NaN (which would reject every row).
        q1, q3 = frame[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        in_range = frame[col].between(q1 - k * iqr, q3 + k * iqr)
        # Positional mask: independent of index alignment / duplicate labels.
        keep &= in_range.to_numpy()
    return frame[keep].copy()


clean_df = remove_iqr_outliers(df, outlier_columns)
print(f"Rows kept after outlier removal: {len(clean_df)} of {len(df)}")

In [None]:
# clean_df.describe()

In [None]:
# --- Categorical overview ---------------------------------------------------
# Object-dtype columns are treated as categorical. Report each column's
# cardinality and, when it has fewer than 10 levels, the distinct values.
categorical_cols = df.select_dtypes(include=['object']).columns
print(f"\nCategorical columns: {list(categorical_cols)}")

for col in categorical_cols:
    print(f"\n{col} unique values: {df[col].nunique()}")
    if df[col].nunique() >= 10:
        continue  # too many levels to list
    print(f"Values: {df[col].unique()}")

# --- Correlation between numeric columns ------------------------------------
# Numeric column names are taken from `df` and applied to `clean_df`, which
# must already have been defined by an earlier cell.
numeric_cols = df.select_dtypes(include=[np.number]).columns
corr_matrix = clean_df[numeric_cols].corr()

print("\n--- Correlation Matrix ---")
print(corr_matrix)

plt.figure()
sns.heatmap(corr_matrix, cmap='coolwarm')
plt.show()

# Correlation of every numeric column with the target, in ascending order.
print(corr_matrix['CO2EMISSIONS'].sort_values())

In [None]:
# Model inputs: engine characteristics plus the three fuel-consumption figures.
feature_columns = [
    'ENGINESIZE',
    'CYLINDERS',
    'FUELCONSUMPTION_CITY',
    'FUELCONSUMPTION_HWY',
    'FUELCONSUMPTION_COMB',
]

# Target first, then the design matrix; both are taken from `clean_df`.
y = clean_df['CO2EMISSIONS']
X = clean_df[feature_columns]


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

In [None]:
# Fit ordinary least squares on the training split (`fit` returns the
# estimator itself), then predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)


In [None]:
# 8. Evaluate the model on the held-out test split.
# Error metrics are in the target's units (MSE in squared units); R² is
# unitless.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\n--- Model Evaluation ---")
print(f"R-squared (R²) value: {r2:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")

In [None]:
# 9. Actual-vs-predicted scatter plot (rendered inline; nothing is written to
# disk).
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, label='Actual vs. Predicted')

# Dashed y = x reference line spanning the joint range of both series:
# points on the line are perfect predictions.
min_val = min(y_test.min(), y_pred.min())
max_val = max(y_test.max(), y_pred.max())
ax.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2)

ax.set_xlabel("Actual CO2 Emissions")
ax.set_ylabel("Predicted CO2 Emissions")
ax.set_title(f"Actual vs. Predicted CO2 Emissions (R² = {r2:.4f})")
ax.legend()
ax.grid(True)