In [1]:
import os
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from pymongo import MongoClient, UpdateOne

# 1. Setup Connection
# Load environment variables via python-dotenv so the MONGODB_* settings can be
# read with os.getenv() in the next cell. load_dotenv() returns True when it
# found and loaded at least one variable.
load_dotenv()


True

In [2]:
# Read the MongoDB credentials and cluster host from the environment.
username = os.getenv("MONGODB_USERNAME")
password = os.getenv("MONGODB_PASSWORD")
cluster_url = os.getenv("MONGODB_CLUSTER")

# Fail fast with a readable message: an unset variable would otherwise be
# interpolated into the URI as the literal text "None" and only surface later
# as an opaque DNS or authentication failure.
_missing_env = [
    env_name
    for env_name, env_value in (
        ("MONGODB_USERNAME", username),
        ("MONGODB_PASSWORD", password),
        ("MONGODB_CLUSTER", cluster_url),
    )
    if not env_value
]
if _missing_env:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}"
    )

# Percent-escape the credentials: reserved characters such as '@', ':', '/' or
# '%' in a username/password would otherwise corrupt the connection URI.
# NOTE: the values in .env must therefore be the raw, unescaped credentials.
CONNECTION_STRING = (
    f"mongodb+srv://{quote_plus(username)}:{quote_plus(password)}@{cluster_url}/"
)

# Handles used by every later cell: client -> database -> collection.
client = MongoClient(CONNECTION_STRING)
db = client["aqi_prediction"]
fs = db["feature_store"]

In [3]:
# Query every document in the feature store, projecting out the _id field
cursor = fs.find({}, {"_id": 0})

# Drain the cursor into a list of dicts and build the DataFrame from it
df = pd.DataFrame([document for document in cursor])

# Preview the first rows, followed by the (rows, columns) shape
print(df.head(), df.shape, sep="\n")

Empty DataFrame
Columns: []
Index: []
(0, 0)


## Push CSV Data to MongoDB Feature Store

In [7]:
# Path of the CSV to push into the feature store
csv_file = '../backup/datav5.csv'  # Change to your CSV filename

# Read the whole file into memory
data_df = pd.read_csv(csv_file)

# Report the row count and column names, then display the first rows
print(
    f"Loaded {len(data_df)} records from {csv_file}",
    f"Columns: {list(data_df.columns)}",
    sep="\n",
)
data_df.head()

Loaded 2862 records from ../backup/datav5.csv
Columns: ['timestamp', 'year', 'month', 'day', 'hour', 'epa_aqi', 'pm2_5', 'pm10', 'co', 'no2', 'so2', 'o3', 'temp_c', 'humidity_pct', 'pressure_hpa', 'wind_speed_kmh', 'wind_dir_deg', 'rain_mm', 'solar_rad_wm2', 'inserted_at']


Unnamed: 0,timestamp,year,month,day,hour,epa_aqi,pm2_5,pm10,co,no2,so2,o3,temp_c,humidity_pct,pressure_hpa,wind_speed_kmh,wind_dir_deg,rain_mm,solar_rad_wm2,inserted_at
0,1760418000.0,2025.0,10.0,14.0,10.0,3.0,27.55,66.19,128.87,0.16,1.39,112.52,30.0,49.0,1012.8,8.4,315.0,0.0,418.0,2026-01-25 07:30:56.738
1,1760540000.0,2025.0,10.0,15.0,20.0,3.0,35.21,86.83,130.97,0.2,1.7,109.08,27.2,63.0,1010.1,11.3,309.0,0.0,0.0,2026-01-25 07:30:56.738
2,1760562000.0,2025.0,10.0,16.0,2.0,3.0,33.95,81.22,128.22,0.12,1.43,107.92,24.7,77.0,1010.1,9.7,332.0,0.0,0.0,2026-01-25 07:30:56.738
3,1760594000.0,2025.0,10.0,16.0,11.0,3.0,32.95,84.19,130.34,0.2,1.75,107.16,33.8,19.0,1011.8,2.4,42.0,0.0,567.0,2026-01-25 07:30:56.738
4,1760638000.0,2025.0,10.0,16.0,23.0,3.0,33.51,86.48,130.98,0.18,1.21,99.81,26.6,54.0,1012.0,6.8,2.0,0.0,0.0,2026-01-25 07:30:56.738


In [10]:
# Turn each DataFrame row into a dict keyed by column name; these dicts are the
# documents written to MongoDB by the cells below.
records = data_df.to_dict(orient='records')

# Optional step, intentionally not applied here: stamp every record with a
# fresh 'inserted_at' value and replace NaN values with None before writing.
# As written, the values are passed through exactly as pandas parsed them.

# Summary line and section header first, then the first record as a spot check
print(f"Prepared {len(records)} records for insertion", f"\nSample record:", sep="\n")
print(records[0])

Prepared 2862 records for insertion

Sample record:
{'timestamp': 1760418000.0, 'year': 2025.0, 'month': 10.0, 'day': 14.0, 'hour': 10.0, 'epa_aqi': 3.0, 'pm2_5': 27.55, 'pm10': 66.19, 'co': 128.87, 'no2': 0.16, 'so2': 1.39, 'o3': 112.52, 'temp_c': 30.0, 'humidity_pct': 49.0, 'pressure_hpa': 1012.8, 'wind_speed_kmh': 8.4, 'wind_dir_deg': 315.0, 'rain_mm': 0.0, 'solar_rad_wm2': 418.0, 'inserted_at': '2026-01-25 07:30:56.738'}


In [6]:
# Method 1: Insert all records (use if collection is empty or you want duplicates)
#
# insert_many() always creates new documents, so running this cell against an
# already-populated collection duplicates every row (and duplicate timestamps
# would make the unique index on 'timestamp' impossible to build). To keep a
# full re-run of the notebook safe, the insert is skipped when the collection
# already holds documents unless ALLOW_DUPLICATE_INSERT is set to True.
ALLOW_DUPLICATE_INSERT = False

existing_count = fs.count_documents({})
if existing_count and not ALLOW_DUPLICATE_INSERT:
    print(f"↷ Skipped insert: feature_store already holds {existing_count} documents")
else:
    try:
        # Insert shallow copies: PyMongo adds a generated '_id' key to every
        # dict it inserts, which would otherwise leak into `records` and into
        # any later cell that reuses them.
        result = fs.insert_many([dict(record) for record in records])
        print(f"✓ Successfully inserted {len(result.inserted_ids)} records")
    except Exception as e:
        print(f"✗ Error: {e}")

✓ Successfully inserted 2862 records


In [11]:
# Method 2: Upsert (update if exists, insert if not) - prevents duplicates
# Use timestamp as unique key to avoid duplicate records
#
# '_id' is stripped from the $set payload. If `records` was previously passed
# to insert_many(), each dict carries a generated '_id'; sending it in $set
# against a document matched by timestamp that has a different '_id' makes the
# server reject the update, because '_id' is immutable.
operations = [
    UpdateOne(
        {'timestamp': record['timestamp']},  # Match by timestamp
        {'$set': {field: value for field, value in record.items() if field != '_id'}},
        upsert=True,
    )
    for record in records
]

if not operations:
    # bulk_write() refuses an empty request list, so report and do nothing.
    print("Nothing to upsert: no records were prepared")
else:
    try:
        result = fs.bulk_write(operations)
        print(f"✓ Upsert completed:")
        print(f"  - Inserted: {result.upserted_count}")
        print(f"  - Modified: {result.modified_count}")
        print(f"  - Matched: {result.matched_count}")
    except Exception as e:
        print(f"✗ Error: {e}")

✓ Upsert completed:
  - Inserted: 0
  - Modified: 2862
  - Matched: 2862


In [9]:
# Confirm the write by counting everything currently in the collection
count = fs.count_documents({})
print(f"\nTotal documents in feature_store: {count}")

# Fetch a single document (without its _id) as a spot check
sample = fs.find_one({}, projection={"_id": 0})
print(f"\nSample document:", sample, sep="\n")


Total documents in feature_store: 2862

Sample document:
{'timestamp': 1760418000.0, 'year': 2025.0, 'day': 14.0, 'hour': 10.0, 'epa_aqi': 3.0, 'pm2_5': 27.55, 'pm10': 66.19, 'co': 128.87, 'no2': 0.16, 'so2': 1.39, 'o3': 112.52, 'temp_c': 30.0, 'humidity_pct': 49.0, 'pressure_hpa': 1012.8, 'wind_speed_kmh': 8.4, 'wind_dir_deg': 315.0, 'rain_mm': 0.0, 'solar_rad_wm2': 418.0, 'inserted_at': '2026-01-25 07:30:56.738'}


## Optional: Create Indexes for Uniqueness and Query Performance

In [None]:
# Create indexes for faster queries

# Unique index on timestamp: serves time-based queries and the upsert cell's
# match-by-timestamp filter, and rejects documents with a duplicate timestamp.
# Building it fails if the collection already contains duplicate timestamps,
# so remove duplicates first if this raises.
fs.create_index("timestamp", unique=True)
print("✓ Created index on 'timestamp'")

# Compound date index (for date range queries). Every field that goes into the
# index must be present in the loaded data, so all three are checked here.
date_fields = ["year", "month", "day"]
if all(field in data_df.columns for field in date_fields):
    fs.create_index([(field, 1) for field in date_fields])
    print("✓ Created compound index on year/month/day")

# List all indexes
print("\nCurrent indexes:")
for index in fs.list_indexes():
    print(f"  - {index}")