In [5]:
# --- Step 1: Load and Split Data ---
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Pull the Week 1 dataset straight from its raw GitHub URL (requires network access)
url = "https://raw.githubusercontent.com/KhushiRaghuwanshi20/Carbon_Footprint_Prediction_week-1/main/final_carbon_footprint_dataset.csv"
df = pd.read_csv(url)

# 2. The target (y) is the 'total_carbon_impact' column; the features (X) are
#    every remaining column. `drop` returns a new frame, so `df` is left intact.
y = df['total_carbon_impact']
X = df.drop(columns='total_carbon_impact')

# 3. Hold out 20% of the rows for testing. Fixing random_state makes the
#    shuffle, and therefore the split, identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity check: print the (rows, columns) shape of each feature split
print("--- Data loaded and split successfully! ---")
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

--- Data loaded and split successfully! ---
Training set size: (15788, 4)
Testing set size: (3947, 4)


In [6]:
# --- Step 2: Train the Model ---
from sklearn.linear_model import LinearRegression

# Build a linear regression estimator and fit it on the training split in a
# single expression. fit() returns the estimator itself, so `model` ends up
# bound to the trained LinearRegression that later cells use for prediction.
model = LinearRegression().fit(X_train, y_train)

print("--- Model training complete! ---")

--- Model training complete! ---


In [7]:
# --- Step 3: Prediction and Accuracy Check ---
from sklearn.metrics import r2_score, mean_absolute_error

# 1. Predict the target for the held-out test rows
y_pred = model.predict(X_test)

# 2. Score the predictions against the true test targets.
#    NOTE: `accuracy` holds the R² score (coefficient of determination), a
#    regression goodness-of-fit measure rather than a classification accuracy;
#    `error` holds the mean absolute error.
error = mean_absolute_error(y_test, y_pred)
accuracy = r2_score(y_test, y_pred)

# 3. Pair every real target with its prediction. `.values` strips the shuffled
#    index from y_test, so the table is numbered 0..n-1 in test-set order.
results = pd.DataFrame(
    {
        'Real Value': y_test.values,
        'Predicted Value': y_pred,
    }
)

print("--- Model Performance Report ---")
print(f"Accuracy (R2 Score): {accuracy:.4f}")
print(f"Average Error: {error:.4f}")

# Show only the first five rows as a quick side-by-side check
print("\n--- Comparison (First 5 Rows) ---")
print(results.head())

--- Model Performance Report ---
Accuracy (R2 Score): 0.9843
Average Error: 4.0299

--- Comparison (First 5 Rows) ---
   Real Value  Predicted Value
0   24.534541        22.639443
1   39.688040        44.200422
2   21.155421        22.617891
3   34.789357        27.512690
4   41.991757        43.241715
