In [3]:
import pandas as pd
import boto3
import json
import warnings 
import logging
import boto3
from botocore.exceptions import ClientError
warnings.filterwarnings('ignore')
import configparser

# Load the AWS credentials from the local `dwh.cfg` file ([AWS] section).
# The file is opened in a `with` block so the handle is closed as soon as it
# has been parsed; a missing file still raises, exactly as before.
config = configparser.ConfigParser()
with open('dwh.cfg') as config_file:
    config.read_file(config_file)

# Access key pair used to authenticate the boto3 clients created later on.
KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

In [6]:
import boto3
import time

# Settings for the Glue catalog objects built in the steps below.
crawler_name = 'crawl_people'
table_name = 'city_table'
database_name = 'people_db1'
s3_path = 's3://ieshaan-bucket-latest/input_data/input_large.csv'
region = 'us-west-2'

# Glue client bound to the region above, authenticated with the key pair
# read from the config file.
glue = boto3.client(
    'glue',
    aws_secret_access_key=SECRET,
    aws_access_key_id=KEY,
    region_name=region,
)

# Step 1: Create Database
# Re-running the cell must not abort just because the database is already in
# the catalog, so an "already exists" answer from Glue is reported and skipped.
# Any other error still propagates.
try:
    glue.create_database(DatabaseInput={'Name': database_name})
    print(f"Database {database_name} created.")
except glue.exceptions.AlreadyExistsException:
    print(f"Database {database_name} already exists; skipping creation.")

# Step 2: Create Table
# External CSV table over `s3_path`: comma separated, double-quote quoted,
# first line treated as a header and skipped.
# NOTE(review): no 'Columns' are declared in the StorageDescriptor; presumably
# the schema is expected to be filled in by the crawler run -- confirm.
table_input = {
    'Name': table_name,
    'TableType': 'EXTERNAL_TABLE',
    'Parameters': {
        'classification': 'csv',
        'skip.header.line.count': '1',
    },
    'StorageDescriptor': {
        'Location': s3_path,
        'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
        'OutputFormat': 'org.apache.hadoop.hive.ql.io.IgnoreKeyTextOutputFormat',
        'SerdeInfo': {
            'SerializationLibrary': 'org.apache.hadoop.hive.serde2.OpenCSVSerde',
            'Parameters': {
                'separatorChar': ',',
                'quoteChar': '"',
            }
        }
    },
}
# A second run of the cell finds the table already registered; report that
# instead of failing. The existing definition is left untouched.
try:
    glue.create_table(DatabaseName=database_name, TableInput=table_input)
    print(f"Table '{table_name}' has been created in database '{database_name}'.")
except glue.exceptions.AlreadyExistsException:
    print(f"Table '{table_name}' already exists in database '{database_name}'; leaving it unchanged.")

# Step 3: Create Crawler
# The crawler scans `s3_path` and writes what it finds into `database_name`,
# running under the IAM role given below.
# If a crawler with this name is already defined (e.g. the cell was run
# before), keep it and carry on instead of aborting the cell.
try:
    glue.create_crawler(
        Name=crawler_name,
        Role='arn:aws:iam::935670829844:role/dwhR',
        DatabaseName=database_name,
        Targets={'S3Targets': [{'Path': s3_path}]}
    )
    print(f"Crawler {crawler_name} created.")
except glue.exceptions.AlreadyExistsException:
    print(f"Crawler {crawler_name} already exists; reusing it.")

# Step 4: Run Crawler
# If the crawler is already running (for instance the cell was re-run while a
# previous crawl was still in progress) there is nothing to start; fall
# through to the wait loop rather than failing.
try:
    glue.start_crawler(Name=crawler_name)
    print(f"Crawler {crawler_name} started.")
except glue.exceptions.CrawlerRunningException:
    print(f"Crawler {crawler_name} is already running.")

# Step 5: Wait for Crawler to finish
# Glue reports a crawler's State only as RUNNING, STOPPING or READY -- there is
# no FAILED state. A crawl that fails or is cancelled also ends in READY, so
# the outcome has to be read from LastCrawl.Status once the crawler is idle.
while True:
    response = glue.get_crawler(Name=crawler_name)
    crawler_status = response['Crawler']['State']

    if crawler_status == 'READY':
        # LastCrawl may be absent if the crawler has never completed a run.
        last_crawl = response['Crawler'].get('LastCrawl', {})
        last_crawl_status = last_crawl.get('Status')

        if last_crawl_status == 'FAILED':
            print(f"Crawler {crawler_name} has failed.")
            if last_crawl.get('ErrorMessage'):
                print(f"Reason: {last_crawl['ErrorMessage']}")
        elif last_crawl_status == 'CANCELLED':
            print(f"Crawler {crawler_name} was cancelled.")
        else:
            print(f"Crawler {crawler_name} has finished.")
        break

    print(f"Crawler {crawler_name} is still running. Current status: {crawler_status}")
    time.sleep(60)  # Wait for 60 seconds before checking again


Database people_db1 created.
Table 'city_table' has been created in database 'people_db1'.
Crawler crawl_people created.
Crawler crawl_people started.
Crawler crawl_people is still running. Current status: RUNNING
Crawler crawl_people is still running. Current status: STOPPING
Crawler crawl_people is still running. Current status: STOPPING
Crawler crawl_people has finished.
