In [None]:
# Import necessary libraries
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import json
from datetime import datetime

## AWS DynamoDB Setup and Data Analysis
This notebook demonstrates how to interact with AWS DynamoDB, retrieve data from a venues table, and perform data cleaning, analysis, and visualization on the results.

In [None]:
# Create the boto3 session and the DynamoDB handles used by later cells
try:
    # Only the region is set explicitly; credentials are expected to come from
    # outside the notebook (environment variables, an AWS credentials file, or
    # an IAM role) rather than being hardcoded here.
    session = boto3.Session(region_name='us-east-1')
    dynamodb = session.resource('dynamodb')
    dynamodb_client = session.client('dynamodb')

    # Handle to the venues table
    venues_table = dynamodb.Table('venues')
    print("Successfully connected to DynamoDB")
except Exception as e:
    # Report the failure instead of re-raising so the notebook keeps running
    print(f"Error connecting to AWS: {e}")

In [None]:
# Function to scan all items from the DynamoDB table
def scan_table(table_name, resource=None):
    """
    Scan all items from a DynamoDB table.

    Parameters
    ----------
    table_name : str
        Name of the table to scan.
    resource : optional
        Object exposing ``Table(name)`` whose result has a ``scan(**kwargs)``
        method. When omitted, the notebook-level ``dynamodb`` resource is used.

    Returns
    -------
    list
        Every item returned by the scan, across all pages, in the order the
        pages were received.
    """
    if resource is None:
        # Fall back to the resource created in the setup cell
        resource = dynamodb

    table = resource.Table(table_name)
    items = []
    scan_kwargs = {}

    while True:
        response = table.scan(**scan_kwargs)
        items.extend(response.get('Items', []))

        # A page that carries 'LastEvaluatedKey' means more data remains;
        # feed it back as the start key for the next page.
        last_key = response.get('LastEvaluatedKey')
        if not last_key:
            break
        scan_kwargs['ExclusiveStartKey'] = last_key

    # Without this return the caller receives None instead of the items
    return items

In [None]:
# Load every venue record; fall back to a small built-in sample on any failure
try:
    venues_data = scan_table('venues')
    print(f"Retrieved {len(venues_data)} venues from DynamoDB")
except Exception as err:
    print(f"Error retrieving data: {err}")
    # Demonstration records used when the table cannot be read.
    # Each tuple is paired with the field names to build one venue dict.
    venues_data = [
        dict(zip(("venueID", "name", "capacity", "location", "openDate"), record))
        for record in (
            ("v001", "Stadium Arena", 50000, "New York", "2010-05-15"),
            ("v002", "Concert Hall", 2500, "Los Angeles", "2015-11-23"),
            ("v003", "Theater Complex", 1200, "Chicago", "2018-03-07"),
            ("v004", "Music Pavilion", 8000, "Miami", "2012-08-30"),
            ("v005", "Sports Center", 15000, "Dallas", "2014-01-18"),
        )
    ]
    print("Using sample data for demonstration")

In [None]:
# Build a tabular view of the venue records (one row per record)
venues_df = pd.DataFrame(data=venues_data)

# Leaving the frame as the cell's final expression renders it as the output
venues_df

In [None]:
# Coerce column types in place; values that cannot be parsed become NaN/NaT
if 'capacity' in venues_df:
    venues_df['capacity'] = venues_df['capacity'].pipe(pd.to_numeric, errors='coerce')

if 'openDate' in venues_df:
    venues_df['openDate'] = venues_df['openDate'].pipe(pd.to_datetime, errors='coerce')
    # Difference between the current calendar year and the opening year
    venues_df['yearsOperating'] = datetime.now().year - venues_df['openDate'].dt.year

# Report the number of missing entries per column
print("Missing values in each column:", venues_df.isna().sum(), sep="\n")

In [None]:
# Data visualization - bar chart of capacity per venue
if 'capacity' in venues_df.columns:
    plt.figure(figsize=(12, 6))
    sns.barplot(x='venueID', y='capacity', data=venues_df)
    plt.title('Venue Capacities')
    plt.xlabel('Venue ID')
    plt.ylabel('Capacity')
    # Slant the venue IDs on the x axis
    plt.xticks(rotation=45)
    plt.tight_layout()
    # show must be called; a bare `plt.show` only references the function
    plt.show()

In [None]:
# Histogram (with a KDE overlay) of the capacity column
if 'capacity' in venues_df:
    plt.figure(figsize=(10, 6))
    sns.histplot(venues_df['capacity'], kde=True)
    # Label the axes that the histogram was just drawn on
    plt.gca().set(
        title='Distribution of Venue Capacities',
        xlabel='Capacity',
        ylabel='Frequency',
    )
    plt.show()

In [None]:
# Location analysis
if 'location' in venues_df.columns:
    location_counts = venues_df['location'].value_counts()

    plt.figure()