In [19]:
!pip install boto3

import boto3
import pandas as pd
import json
import csv
import os
import re 



In [20]:
# Module-level S3 client; list_all_objects and show_csv_content read it as the global `s3`.
s3 = boto3.client('s3')

In [21]:
def list_all_objects(bucket, prefix):
    """Collect the keys of every object S3 lists for ``bucket`` / ``prefix``.

    Calls ``list_objects_v2`` on the module-level ``s3`` client repeatedly,
    feeding each response's ``NextContinuationToken`` back into the next
    request until a response arrives without one.

    Args:
        bucket: Value passed as the ``Bucket`` argument of ``list_objects_v2``.
        prefix: Value passed as the ``Prefix`` argument of ``list_objects_v2``.

    Returns:
        A list of the ``Key`` values from all pages, in the order received.
        Empty when no page carries a ``Contents`` entry.
    """
    keys = []
    request = {'Bucket': bucket, 'Prefix': prefix}

    while True:
        page = s3.list_objects_v2(**request)

        # A page may omit 'Contents' entirely, so only read it when present.
        if 'Contents' in page:
            for entry in page['Contents']:
                keys.append(entry['Key'])

        next_token = page.get('NextContinuationToken')
        if not next_token:
            # No further token: this was the last page.
            return keys

        # Carry the token into the next request; Bucket/Prefix stay unchanged.
        request['ContinuationToken'] = next_token

In [22]:
def get_csv_files(bucket, prefix):
    """Return the listed object keys that end with ``.csv``.

    Delegates the listing to ``list_all_objects(bucket, prefix)`` and keeps
    only keys whose name ends in the lowercase suffix ``'.csv'`` (the match
    is case-sensitive).

    Args:
        bucket: Forwarded unchanged to ``list_all_objects``.
        prefix: Forwarded unchanged to ``list_all_objects``.

    Returns:
        A list of matching keys, in listing order.
    """
    return [key for key in list_all_objects(bucket, prefix) if key.endswith('.csv')]

In [23]:
def show_csv_content(bucket, file_key):
    """Load one CSV object from S3, reformat three columns, and print the frame.

    The object is fetched through the module-level ``s3`` client and parsed
    with ``pandas.read_csv``. Then:

    * ``dob`` is parsed as ``%d/%m/%Y`` and ``month`` as ``%B %Y``; values
      that do not parse become ``NaT`` (``errors='coerce'``).
    * ``invited_date`` has its non-null values cast to ``int`` and turned
      into ordinal strings such as ``'1st'`` or ``'22nd'``; rows that were
      null stay null because the result is assigned back by index.

    Args:
        bucket: Value passed as ``Bucket`` to ``get_object``.
        file_key: Value passed as ``Key`` to ``get_object``.

    Returns:
        None. The resulting DataFrame is written to stdout with ``print``.
    """
    def with_ordinal(day_number):
        # 10..20 (mod 100) always take 'th' (covers 11th, 12th, 13th);
        # everything else is decided by the last digit.
        last_two = day_number % 100
        if last_two < 10 or last_two > 20:
            ending = {1: 'st', 2: 'nd', 3: 'rd'}.get(day_number % 10, 'th')
        else:
            ending = 'th'
        return str(day_number) + ending

    # Fetch the object and parse its body as CSV.
    s3_object = s3.get_object(Bucket=bucket, Key=file_key)
    frame = pd.read_csv(s3_object['Body'])

    # Parse the two date-like columns, each with its own expected format.
    for column, date_format in (('dob', '%d/%m/%Y'), ('month', '%B %Y')):
        frame[column] = pd.to_datetime(frame[column], format=date_format, errors='coerce')

    # Only non-null days can be cast to int; index alignment on assignment
    # leaves the dropped rows as missing values.
    known_days = frame['invited_date'].dropna().astype(int)
    frame['invited_date'] = known_days.apply(with_ordinal)

    print(frame)


# S3 location to scan: the bucket name and the key prefix passed to get_csv_files below.
bucket_name = 'data-402-final-project'
prefix = 'Talent/'

# Get all CSV files: the keys listed for the prefix that end in '.csv'.
# Kept in a notebook-level name because later cells print and index this list.
talent_csv_files = get_csv_files(bucket_name, prefix)

In [24]:
# Print the list of CSV files (the S3 keys returned by get_csv_files) as a quick check of what was found.
print("CSV files found:", talent_csv_files)

CSV files found: ['Talent/April2019Applicants.csv', 'Talent/Aug2019Applicants.csv', 'Talent/Dec2019Applicants.csv', 'Talent/Feb2019Applicants.csv', 'Talent/Jan2019Applicants.csv', 'Talent/July2019Applicants.csv', 'Talent/June2019Applicants.csv', 'Talent/March2019Applicants.csv', 'Talent/May2019Applicants.csv', 'Talent/Nov2019Applicants.csv', 'Talent/Oct2019Applicants.csv', 'Talent/Sept2019Applicants.csv']


In [25]:
# Show the content of one CSV file: only the first key in the listing is previewed.
# The truthiness check keeps talent_csv_files[0] from raising IndexError on an empty list.
if talent_csv_files:
    print("\nContents of the first CSV file:")
    show_csv_content(bucket_name, talent_csv_files[0])
else:
    print("No CSV files found.")


Contents of the first CSV file:
      id                 name  gender        dob  \
0      1       Esme Trusslove  Female 1994-08-04   
1      2      Matthaeus Audas    Male        NaT   
2      3      Cherey Tollfree  Female 1992-12-08   
3      4          Eryn Speers  Female        NaT   
4      5    Theadora Berkelay  Female 1995-11-03   
..   ...                  ...     ...        ...   
374  375  Pembroke Rheubottom    Male 1990-11-24   
375  376         Celle Barlas  Female 1994-11-08   
376  377           Scott Duny    Male 1995-03-19   
377  378         Conny Robson    Male 1993-11-23   
378  379    Boycey Matushenko    Male 1991-04-21   

                            email        city               address postcode  \
0           etrusslove0@google.es     Swindon  22056 Lerdahl Avenue      SN1   
1            maudas1@mapquest.com    Charlton      263 Nelson Trail     OX12   
2         ctollfree2@netvibes.com      Weston      69 Coleman Court     GU32   
3          espeers3@sh