In [1]:
!pip install boto3

import boto3
import pandas as pd
import json
import csv
import os
import re 

# Module-level S3 client used by the S3 helper functions in this notebook.
# No credentials or region are passed here, so boto3 has to pick them up
# from the environment the kernel runs in.
s3 = boto3.client('s3')

def list_all_objects(bucket, prefix):
    """Return the key of every object in ``bucket`` listed under ``prefix``.

    Calls ``list_objects_v2`` on the module-level ``s3`` client repeatedly,
    following ``NextContinuationToken`` until a response no longer carries one.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix passed to the listing call.

    Returns:
        A list of object keys (strings) in the order the pages returned them.
    """
    keys = []
    # One request dict is reused; the continuation token is added after the first page.
    request = {'Bucket': bucket, 'Prefix': prefix}

    while True:
        page = s3.list_objects_v2(**request)

        # A page with no matching objects has no 'Contents' entry at all.
        keys += [entry['Key'] for entry in page.get('Contents', [])]

        next_token = page.get('NextContinuationToken')
        if not next_token:
            return keys

        request['ContinuationToken'] = next_token

def list_txt_files(bucket, prefix):
    """Return the keys under ``prefix`` in ``bucket`` that end with '.txt'.

    The suffix check is case-sensitive, so a key ending in '.TXT' is not included.
    """
    return [key for key in list_all_objects(bucket, prefix) if key.endswith('.txt')]

# S3 bucket and key prefix that hold the Talent text files
bucket_name = 'data-402-final-project'
prefix = 'Talent/'

txt_files = list_txt_files(bucket_name, prefix)

# Print every .txt key found under the prefix, one per line
print("List of .txt file titles:")
for txt_file in txt_files:
    print(txt_file)

def get_txt_file_contents(bucket, key):
    """Fetch one object from S3 and return its body as text.

    Args:
        bucket: Name of the S3 bucket.
        key: Key of the object to download.

    Returns:
        The whole object body decoded as UTF-8.
    """
    # The body is read in one go through the module-level ``s3`` client.
    raw_bytes = s3.get_object(Bucket=bucket, Key=key)['Body'].read()
    return raw_bytes.decode('utf-8')

# Bucket that holds the Talent text files
bucket_name = 'data-402-final-project'
# Key of one sample file to inspect
file_key = 'Talent/Sparta Day 15 October 2019.txt'

# Get the contents of the specified .txt file
file_contents = get_txt_file_contents(bucket_name, file_key)

# Print the contents of the file
print(file_contents)

def parse_txt_content(content):
    """Parse the text of one Sparta Day file into per-candidate score rows.

    After surrounding whitespace is removed from ``content``, its first line is
    taken as the date and its second line as the academy. Each later line of the
    form ``<name> - Psychometrics: <n>/100, Presentation: <n>/32`` produces one
    row; lines that do not match that shape are skipped.

    Args:
        content: Full text of the file.

    Returns:
        A list of ``[date, academy, name, psychometrics, presentation]`` lists.
        The name is title-cased and both scores are kept as digit strings.
        An empty list is returned when the content has fewer than two lines,
        i.e. when there is no date/academy header to attach rows to.
    """
    # Strip every line so Windows line endings ('\r\n') do not leave a stray
    # '\r' on the date and academy values.
    lines = [line.strip() for line in content.strip().split('\n')]

    # Empty or header-less content: nothing to parse (previously an IndexError).
    if len(lines) < 2:
        return []

    date, academy = lines[0], lines[1]

    # Compiled once instead of on every loop iteration.
    score_line = re.compile(
        r'(.+?) -\s+Psychometrics:\s+(\d+)/100,\s+Presentation:\s+(\d+)/32'
    )

    data = []
    for line in lines[2:]:
        match = score_line.match(line)
        if match:
            name = match.group(1).strip().title()
            psychometrics = match.group(2)
            presentation = match.group(3)
            data.append([date, academy, name, psychometrics, presentation])
    return data

def combine_txt_files(bucket, prefix):
    """Load every .txt file under ``prefix`` and stack the parsed rows in one DataFrame.

    Args:
        bucket: Name of the S3 bucket.
        prefix: Key prefix under which the .txt files are listed.

    Returns:
        A DataFrame with the columns ``date``, ``academy``, ``name``,
        ``psychometric_score`` and ``presentation_score``, holding the rows of
        all files in listing order.
    """
    column_names = ['date', 'academy', 'name', 'psychometric_score', 'presentation_score']

    # Download and parse the files one after another, flattening their rows into one list.
    rows = [
        row
        for key in list_txt_files(bucket, prefix)
        for row in parse_txt_content(get_txt_file_contents(bucket, key))
    ]

    return pd.DataFrame(rows, columns=column_names)

# Bucket and key prefix of the Talent text files to combine
bucket_name = 'data-402-final-project'
prefix = 'Talent/'

# Combine all txt files into a single DataFrame
talent_txt_files = combine_txt_files(bucket_name, prefix)

# Display the combined DataFrame
print(talent_txt_files)

# Save the combined DataFrame, as parsed, to a CSV file in the working directory (optional)
talent_txt_files.to_csv('combined_data.csv', index=False)

List of .txt file titles:
Talent/Sparta Day 1 August 2019.txt
Talent/Sparta Day 1 May 2019.txt
Talent/Sparta Day 1 October 2019.txt
Talent/Sparta Day 10 April 2019.txt
Talent/Sparta Day 10 December 2019.txt
Talent/Sparta Day 10 January 2019.txt
Talent/Sparta Day 10 July 2019.txt
Talent/Sparta Day 10 October 2019.txt
Talent/Sparta Day 10 September 2019.txt
Talent/Sparta Day 11 April 2019.txt
Talent/Sparta Day 11 December 2019.txt
Talent/Sparta Day 11 July 2019.txt
Talent/Sparta Day 11 June 2019.txt
Talent/Sparta Day 11 September 2019.txt
Talent/Sparta Day 12 December 2019.txt
Talent/Sparta Day 12 February 2019.txt
Talent/Sparta Day 12 June 2019.txt
Talent/Sparta Day 12 March 2019.txt
Talent/Sparta Day 12 November 2019.txt
Talent/Sparta Day 12 September 2019.txt
Talent/Sparta Day 13 August 2019.txt
Talent/Sparta Day 13 February 2019.txt
Talent/Sparta Day 13 June 2019.txt
Talent/Sparta Day 13 March 2019.txt
Talent/Sparta Day 13 November 2019.txt
Talent/Sparta Day 14 August 2019.txt
Talent

In [2]:
# Tidy the combined frame in one step: trim surrounding whitespace from the
# three text columns and cast the two score columns to integers.
talent_txt_files = talent_txt_files.assign(
    date=talent_txt_files['date'].str.strip(),
    academy=talent_txt_files['academy'].str.strip(),
    name=talent_txt_files['name'].str.strip(),
    psychometric_score=talent_txt_files['psychometric_score'].astype(int),
    presentation_score=talent_txt_files['presentation_score'].astype(int),
)

# Show the cleaned frame
print(talent_txt_files)

# Write the cleaned frame to a CSV file in the working directory (optional)
talent_txt_files.to_csv('talent_txt_files.csv', index=False)

                          date             academy              name  \
0       Thursday 1 August 2019  Birmingham Academy   Hilary Willmore   
1       Thursday 1 August 2019  Birmingham Academy       Orly Lorens   
2       Thursday 1 August 2019  Birmingham Academy    Alvie Bleackly   
3       Thursday 1 August 2019  Birmingham Academy     Deck Itzchaki   
4       Thursday 1 August 2019  Birmingham Academy      Wilt Penritt   
...                        ...                 ...               ...   
4129  Wednesday 9 October 2019      London Academy     Godiva Andrew   
4130  Wednesday 9 October 2019      London Academy    Doralia Gapper   
4131  Wednesday 9 October 2019      London Academy        Doe Eisold   
4132  Wednesday 9 October 2019      London Academy      Judy Finders   
4133  Wednesday 9 October 2019      London Academy  Lorinda O'Crotty   

      psychometric_score  presentation_score  
0                     51                  19  
1                     51                 

In [3]:
# Make pandas show every column and every row of a frame, and allow
# output lines up to 1000 characters wide.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = 1000

In [4]:
# Count the null entries in each column of the combined DataFrame
missing_values = talent_txt_files.isna().sum()

# Report the per-column counts under a heading
print("Missing values in talent_txt_files:", missing_values, sep='\n')

Missing values in talent_txt_files:
date                  0
academy               0
name                  0
psychometric_score    0
presentation_score    0
dtype: int64


In [5]:
# Parse the textual dates and keep only the calendar date. `.dt.date` yields
# Python `date` objects, so the column's dtype is still reported as `object`.
talent_txt_files = talent_txt_files.assign(
    date=pd.to_datetime(talent_txt_files['date']).dt.date
)

# Confirm the resulting column dtypes
print(talent_txt_files.dtypes)


date                  object
academy               object
name                  object
psychometric_score     int32
presentation_score     int32
dtype: object


In [6]:
# Print every row of the DataFrame as one line of text, with the five fields
# separated by two spaces. This emits one output line per row of the frame,
# so the output is as long as the frame itself.
for index, row in talent_txt_files.iterrows():
    print(f"{row['date']}  {row['academy']}  {row['name']}  {row['psychometric_score']}  {row['presentation_score']}")

2019-08-01  Birmingham Academy  Hilary Willmore  51  19
2019-08-01  Birmingham Academy  Orly Lorens  51  19
2019-08-01  Birmingham Academy  Alvie Bleackly  55  16
2019-08-01  Birmingham Academy  Deck Itzchaki  59  21
2019-08-01  Birmingham Academy  Wilt Penritt  66  25
2019-08-01  Birmingham Academy  Gianna Clevely  61  16
2019-08-01  Birmingham Academy  Yvor Phalip  59  18
2019-08-01  Birmingham Academy  Marielle Knivett  52  18
2019-08-01  Birmingham Academy  Florri Loughton  63  28
2019-08-01  Birmingham Academy  Walden Gidden  49  25
2019-08-01  Birmingham Academy  Ingunna Adin  60  20
2019-08-01  Birmingham Academy  Janaya Mawford  52  18
2019-08-01  Birmingham Academy  Cornall Offer  56  26
2019-08-01  Birmingham Academy  Brnaby Bownes  59  26
2019-08-01  Birmingham Academy  Sherline Cudihy  52  16
2019-08-01  Birmingham Academy  Anallise Scathard  62  13
2019-08-01  Birmingham Academy  Benson Labuschagne  64  13
2019-08-01  Birmingham Academy  Norene Fierro  58  22
2019-08-01  B