In [1]:
pip install pymongo

Collecting pymongo
  Downloading pymongo-4.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.4.2-py3-none-any.whl.metadata (4.9 kB)
Downloading pymongo-4.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (677 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m677.1/677.1 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading dnspython-2.4.2-py3-none-any.whl (300 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.4/300.4 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.4.2 pymongo-4.6.1
Note: you may need to restart the kernel to use updated packages.


In [7]:
import os

import pandas as pd
from pymongo import MongoClient

# Connect to MongoDB.
# The connection string embeds the username and password, so it is read from
# the MONGODB_URI environment variable instead of being stored in the notebook.
database_name = "weather_db"
collection_name = "weather_collection"

mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string."
    )

try:
    mongo_client = MongoClient(mongo_uri)
    # MongoClient connects lazily, so constructing it does not prove the
    # cluster is reachable. A ping forces a round trip and surfaces network or
    # authentication failures here rather than in a later cell.
    mongo_client.admin.command("ping")
    db = mongo_client[database_name]
    collection = db[collection_name]
except Exception as e:
    print(f"Error connecting to MongoDB: {e}")
    # Re-raise instead of calling exit(): in a Jupyter kernel exit() shuts the
    # kernel down and discards all notebook state, whereas an exception simply
    # stops this cell and shows the traceback.
    raise

# Query all documents from the collection.
# find() only builds a lazy cursor; the documents themselves are fetched from
# the server when the cursor is iterated further down.
try:
    cursor = collection.find({})
except Exception as e:
    print(f"Error querying MongoDB collection: {e}")
    # Re-raise instead of calling exit(), which would shut the notebook kernel
    # down; an exception stops the cell and keeps the kernel state.
    raise

# Flatten the cursor: each document holds one city plus a list of hourly
# readings, and every reading becomes one flat record tagged with that city.
data_list = []
hourly_fields = ("Temperature", "Pressure", "Humidity", "Wind Speed", "Weather Description")
for document in cursor:
    try:
        city = document.get("city", "")
        hourly_data = document.get("hourly_data", [])

        for hourly_entry in hourly_data:
            # Key order fixes the DataFrame column order: DateTime, City, then
            # the measurement fields. Missing values fall back to "".
            row = {"DateTime": hourly_entry.get("DateTime", ""), "City": city}
            row.update({field: hourly_entry.get(field, "") for field in hourly_fields})
            data_list.append(row)
    except Exception as e:
        # Best effort: report the malformed document and carry on with the rest.
        print(f"Error processing document: {e}")

# Create a DataFrame from the list of dictionaries.
# The columns are named explicitly (same names and order as the record keys)
# so that an empty query result still yields a frame with the expected
# columns instead of a column-less frame that breaks the 'DateTime' lookup.
try:
    df = pd.DataFrame(
        data_list,
        columns=[
            "DateTime",
            "City",
            "Temperature",
            "Pressure",
            "Humidity",
            "Wind Speed",
            "Weather Description",
        ],
    )
except Exception as e:
    print(f"Error creating DataFrame: {e}")
    # Re-raise instead of calling exit(), which would shut the notebook kernel
    # down; an exception stops the cell and keeps the kernel state.
    raise

# Convert 'DateTime' column to datetime type
try:
    df['DateTime'] = pd.to_datetime(df['DateTime'])
except Exception as e:
    print(f"Error converting 'DateTime' column to datetime type: {e}")
    # Re-raise instead of calling exit(), which would shut the notebook kernel
    # down; an exception stops the cell and keeps the kernel state.
    raise

# Print a plain-text preview of the DataFrame
print(df)

                DateTime       City  Temperature  Pressure  Humidity  \
0    2024-01-13 10:00:00  San Diego       282.00      1019        67   
1    2024-01-13 11:00:00  San Diego       281.04      1019        70   
2    2024-01-13 12:00:00  San Diego       281.93      1019        67   
3    2024-01-13 13:00:00  San Diego       282.76      1019        64   
4    2024-01-13 14:00:00  San Diego       283.56      1018        59   
...                  ...        ...          ...       ...       ...   
4939 2024-01-18 05:00:00  Charlotte       272.56      1026        52   
4940 2024-01-18 06:00:00  Charlotte       272.34      1025        53   
4941 2024-01-18 07:00:00  Charlotte       272.18      1026        52   
4942 2024-01-18 08:00:00  Charlotte       272.04      1026        51   
4943 2024-01-18 09:00:00  Charlotte       271.91      1025        50   

      Wind Speed Weather Description  
0           0.38           clear sky  
1           0.54           clear sky  
2           0.95  

In [8]:
# Show the DataFrame as this cell's output value (the saved output elides the middle rows).
df

Unnamed: 0,DateTime,City,Temperature,Pressure,Humidity,Wind Speed,Weather Description
0,2024-01-13 10:00:00,San Diego,282.00,1019,67,0.38,clear sky
1,2024-01-13 11:00:00,San Diego,281.04,1019,70,0.54,clear sky
2,2024-01-13 12:00:00,San Diego,281.93,1019,67,0.95,clear sky
3,2024-01-13 13:00:00,San Diego,282.76,1019,64,1.59,clear sky
4,2024-01-13 14:00:00,San Diego,283.56,1018,59,1.39,clear sky
...,...,...,...,...,...,...,...
4939,2024-01-18 05:00:00,Charlotte,272.56,1026,52,1.87,clear sky
4940,2024-01-18 06:00:00,Charlotte,272.34,1025,53,2.01,clear sky
4941,2024-01-18 07:00:00,Charlotte,272.18,1026,52,2.02,clear sky
4942,2024-01-18 08:00:00,Charlotte,272.04,1026,51,1.83,clear sky


In [9]:
import boto3
import io
import os

# Destination bucket name and the key prefix placed in front of every uploaded file name;
# both are read by upload_s3_csv below.
bucket = 'big-data-rmit-asm3'
prefix = 'streaming-data/'

# Create an S3 resource from a new boto3 Session built with no explicit arguments.
# NOTE(review): no credentials or region are passed here, so they presumably come
# from the environment the kernel runs in — confirm.
s3_resource = boto3.Session().resource('s3')

def upload_s3_csv(filename, dataframe, *, bucket_name=None, key_prefix=None, s3=None):
    """
    Upload a DataFrame to an S3 bucket as a CSV file.

    The frame is serialised in memory (header row included, index omitted) and
    written to the object ``<prefix>/<filename>``.

    :param filename: The filename to save as in the S3 bucket
    :param dataframe: The DataFrame to save
    :param bucket_name: Bucket to upload to; defaults to the module-level ``bucket``
    :param key_prefix: Key prefix placed before the filename; defaults to the
        module-level ``prefix``
    :param s3: S3 resource object to upload through; defaults to the
        module-level ``s3_resource``
    :return: None
    """
    # Fall back to the notebook-level configuration for anything not supplied.
    target_bucket = bucket if bucket_name is None else bucket_name
    base_prefix = prefix if key_prefix is None else key_prefix
    resource = s3_resource if s3 is None else s3

    csv_buffer = io.StringIO()
    # Convert the DataFrame to CSV and save it to a buffer
    dataframe.to_csv(csv_buffer, header=True, index=False)

    # Full key for the file in the bucket. S3 keys always use "/" as the
    # separator, so the key is assembled explicitly rather than with
    # os.path.join, which follows the local OS path rules (backslashes on
    # Windows, and a filename starting with "/" would discard the prefix).
    trimmed_prefix = base_prefix.rstrip("/")
    trimmed_name = filename.lstrip("/")
    full_file_path = f"{trimmed_prefix}/{trimmed_name}" if trimmed_prefix else trimmed_name

    # Upload the CSV file to S3
    resource.Bucket(target_bucket).Object(full_file_path).put(Body=csv_buffer.getvalue())
    print(f"File '{filename}' uploaded to '{full_file_path}' in bucket '{target_bucket}'.")

# Example usage: name of the CSV object to create under the configured prefix
file_name = 'streaming_data.csv'
# df is the DataFrame built in the earlier MongoDB cell; this cell fails with a
# NameError if that cell has not been run in the current kernel.
upload_s3_csv(file_name, df)

File 'streaming_data.csv' uploaded to 'streaming-data/streaming_data.csv' in bucket 'big-data-rmit-asm3'.
