In [3]:
# --- Step 2: First Dataset - Household Energy (19,000+ rows!) ---

# 1. Import necessary libraries, pandas and numpy
import pandas as pd
import numpy as np

# 2. URL for the Household Energy dataset (this is precise sensor data)
url_energy = "https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv"

# 3. Read the CSV file into a pandas DataFrame
df_energy = pd.read_csv(url_energy)

# 4. We will select the 'Appliances' (energy) and 'T_out' (outside temp) columns
# These are both good 'numeric' features for our model
df_energy_final = df_energy[['Appliances', 'T_out']]

# 5. Rename the columns to be more understandable
df_energy_final = df_energy_final.rename(columns={
    'Appliances': 'home_energy_wh',
    'T_out': 'outside_temp_C'
})

# 6. ---- CRITICAL: Check the size of the data ----
print("--- Dataset 1 (Household Energy) is Ready! ---")
print(f"Total rows: {len(df_energy_final)}")

# 7. Print the first 5 rows to see what the data looks like
print("\n--- First 5 rows of Energy Data ---")
print(df_energy_final.head())

--- Dataset 1 (Household Energy) is Ready! ---
Total rows: 19735

--- First 5 rows of Energy Data ---
   home_energy_wh  outside_temp_C
0              60        6.600000
1              60        6.483333
2              50        6.366667
3              50        6.250000
4              60        6.133333


In [4]:
# --- Step 3: Second Dataset - Vehicle MPG ---

# 1. URL for the vehicle dataset (Auto MPG from UCI)
url_mpg = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

# 2. This data file doesn't have a header, so we must provide the column names
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']

# 3. Read the CSV.
# na_values="?" tells pandas to treat "?" as a missing (NaN) value
# delim_whitespace=True tells pandas the columns are separated by spaces, not commas
df_mpg = pd.read_csv(url_mpg, names=column_names, na_values="?", delim_whitespace=True)

# 4. ---- Data Cleaning ----
# The 'horsepower' column has 6 missing values.
# We will fill these gaps with the average (mean) horsepower of all other cars.
# This is a key part of your Week 1 task.
df_mpg['horsepower'] = df_mpg['horsepower'].fillna(df_mpg['horsepower'].mean())

# 5. For our project, we only need the 'mpg' (mileage) and 'weight' columns
df_vehicle_final = df_mpg[['mpg', 'weight']]

# 6. Print the total size of this dataset
print("--- Dataset 2 (Vehicle MPG) is Cleaned and Ready! ---")
print(f"Total rows: {len(df_vehicle_final)}")

# 7. Print the first 5 rows to see what it looks like
print("\n--- First 5 rows of Vehicle Data ---")
print(df_vehicle_final.head())

  df_mpg = pd.read_csv(url_mpg, names=column_names, na_values="?", delim_whitespace=True)


--- Dataset 2 (Vehicle MPG) is Cleaned and Ready! ---
Total rows: 398

--- First 5 rows of Vehicle Data ---
    mpg  weight
0  18.0  3504.0
1  15.0  3693.0
2  18.0  3436.0
3  16.0  3433.0
4  17.0  3449.0


In [6]:
# --- Step 4: Combine Datasets and Pre-process ---

# 1. Create a copy of the large energy dataset to work on
df_final = df_energy_final.copy()

# 2. Get the lists of possible vehicle 'mpg' and 'weight' values
mpg_values = df_vehicle_final['mpg'].values
weight_values = df_vehicle_final['weight'].values

# 3. Get the total number of rows from our energy data (19735)
num_rows = len(df_final)

# 4. ---- Feature Engineering (Part 1) ----
# Assign a random car (mpg, weight) to each of the 19,735 energy readings

# Create a new 'vehicle_mpg' column
df_final['vehicle_mpg'] = np.random.choice(mpg_values, size=num_rows)

# Create a new 'vehicle_weight' column
df_final['vehicle_weight'] = np.random.choice(weight_values, size=num_rows)

# 5. ---- Feature Engineering (Part 2) ----
# Create our Target Variable (the 'answer' we want the AI to predict)
# We will call it 'total_carbon_impact'.

# We will create it using a formula based on our features.
# This ensures the data has a real, predictable pattern.
# (Energy * 0.4) + (Weight/MPG * 0.05) - (Temp * 0.1)

energy_impact = df_final['home_energy_wh'] * 0.4
vehicle_impact = (df_final['vehicle_weight'] / df_final['vehicle_mpg']) * 0.05
temp_impact = df_final['outside_temp_C'] * 0.1

# Add a little bit of random "noise" to make the data realistic
noise = np.random.normal(0, 5, size=num_rows)

# Combine them to create the final target column
df_final['total_carbon_impact'] = energy_impact + vehicle_impact - temp_impact + noise

# 6. ---- Final Check ----
# All columns are now precise, numeric, and ready for an AI model.
print("--- Final Combined & Pre-processed Dataset is Ready! ---")
print(f"Total rows: {len(df_final)}")
print(f"Total columns: {len(df_final.columns)}")

print("\n--- First 5 rows of the FINAL Dataset ---")
print(df_final.head())

--- Final Combined & Pre-processed Dataset is Ready! ---
Total rows: 19735
Total columns: 5

--- First 5 rows of the FINAL Dataset ---
   home_energy_wh  outside_temp_C  vehicle_mpg  vehicle_weight  \
0              60        6.600000         13.0          3365.0   
1              60        6.483333         32.0          2933.0   
2              50        6.366667         31.6          2525.0   
3              50        6.250000         26.0          4425.0   
4              60        6.133333         16.5          4190.0   

   total_carbon_impact  
0            27.147640  
1            40.910267  
2            15.723166  
3            28.682310  
4            39.263961  


In [7]:
# --- Step 5: Save and Download the Final Dataset ---

# 1. Define the filename for our final dataset
final_filename = 'final_carbon_footprint_dataset.csv'

# 2. Save the final DataFrame (df_final) to this CSV file
# index=False means we don't save the row numbers (0, 1, 2, etc.)
df_final.to_csv(final_filename, index=False)

print(f"--- Successfully saved data to {final_filename} ---")

# 3. Import the 'files' tool from Google Colab
from google.colab import files

# 4. Use the 'files' tool to download the CSV to your computer
print(f"--- Downloading {final_filename} to your computer... ---")
files.download(final_filename)

--- Successfully saved data to final_carbon_footprint_dataset.csv ---
--- Downloading final_carbon_footprint_dataset.csv to your computer... ---


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>