In [38]:
!pip install boto3



In [103]:
import boto3
import pandas as pd
import json
import csv
import os
import re 

In [40]:
s3 = boto3.client('s3')

In [41]:
def list_objects(bucket, prefix):
    response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
    if 'Contents' in response:
        return [obj['Key'] for obj in response['Contents'] if obj['Key'] != prefix]
    return []

In [42]:
def list_all_objects(bucket, prefix):
    all_objects = []
    continuation_token = None
    
    while True:
        if continuation_token:
            response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, ContinuationToken=continuation_token)
        else:
            response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
        
        if 'Contents' in response:
            all_objects.extend([obj['Key'] for obj in response['Contents']])
        
        if not response.get('NextContinuationToken'):
            break
        
        continuation_token = response['NextContinuationToken']
    
    return all_objects

In [44]:
def get_file_types(bucket, prefix):
    files = list_all_objects(bucket, prefix)
    file_types = {}
    for file in files:
        ext = os.path.splitext(file)[1].lower()
        if ext in file_types:
            file_types[ext] += 1
        else:
            file_types[ext] = 1
    return file_types

In [45]:
def print_file_types(bucket, prefix, name):
    file_types = get_file_types(bucket, prefix)
    print(f"{name}")
    for ext, count in file_types.items():
        print(f"{ext.upper()} files: {count}")
    other_files = sum([count for ext, count in file_types.items() if ext not in ['.csv', '.json', '.txt']])
    print(f"Other file types: {other_files}")
    print()

In [46]:
def load_academy_data(bucket, prefix):
    files = list_all_objects(bucket, prefix)
    data_frames = []
    for file_key in files:
        obj = s3.get_object(Bucket=bucket, Key=file_key)
        df = pd.read_csv(obj['Body'])
        data_frames.append(df)
    return pd.concat(data_frames, ignore_index=True)

In [47]:
def load_talent_data(bucket, prefix):
    files = list_all_objects(bucket, prefix)
    records = []
    for file_key in files:
        obj = s3.get_object(Bucket=bucket, Key=file_key)
        content = obj['Body'].read().decode('utf-8')
        try:
            # Attempt to load the content as JSON
            data = json.loads(content)
            records.append(data)
        except json.JSONDecodeError:
            # If content is not JSON, try to parse as CSV
            try:
                csv_reader = csv.DictReader(content.splitlines())
                # Convert CSV rows to dicts & append to records list
                for row in csv_reader:
                    records.append(row)
            except Exception as e:
                print(f"Error loading file {file_key}: {e}")
                # Handle the error appropriately,-> skip the file or handle it differently
    return pd.DataFrame(records)

In [48]:
def list_columns(bucket, prefix, name):
    files = list_all_objects(bucket, prefix)
    columns = {}
    for file_key in files:
        obj = s3.get_object(Bucket=bucket, Key=file_key)
        ext = os.path.splitext(file_key)[1].lower()
        if ext == '.csv':
            df = pd.read_csv(obj['Body'])
            columns[ext] = list(df.columns)
        elif ext == '.json':
            content = obj['Body'].read().decode('utf-8')
            data = json.loads(content)
            if isinstance(data, list):
                columns[ext] = list(data[0].keys())
            elif isinstance(data, dict):
                columns[ext] = list(data.keys())
        elif ext == '.txt':
            content = obj['Body'].read().decode('utf-8')
            first_line = content.splitlines()[0]
            columns[ext] = first_line.split()
        else:
            columns[ext] = ['no columns']
    
    print(f"{name}")
    for ext, cols in columns.items():
        print(f"{ext.upper()} files columns: {', '.join(cols) if cols else 'no columns'}")
    print()

In [49]:
print_file_types('data-402-final-project', 'Academy/', 'Academy')
print_file_types('data-402-final-project', 'Talent/', 'Talent')

Academy
.CSV files: 36
Other file types: 0

Talent
.JSON files: 3105
.CSV files: 12
.TXT files: 152
Other file types: 0



In [50]:
list_columns('data-402-final-project', 'Academy/', 'Academy')
list_columns('data-402-final-project', 'Talent/', 'Talent')

Academy
.CSV files columns: name, trainer, Analytic_W1, Independent_W1, Determined_W1, Professional_W1, Studious_W1, Imaginative_W1, Analytic_W2, Independent_W2, Determined_W2, Professional_W2, Studious_W2, Imaginative_W2, Analytic_W3, Independent_W3, Determined_W3, Professional_W3, Studious_W3, Imaginative_W3, Analytic_W4, Independent_W4, Determined_W4, Professional_W4, Studious_W4, Imaginative_W4, Analytic_W5, Independent_W5, Determined_W5, Professional_W5, Studious_W5, Imaginative_W5, Analytic_W6, Independent_W6, Determined_W6, Professional_W6, Studious_W6, Imaginative_W6, Analytic_W7, Independent_W7, Determined_W7, Professional_W7, Studious_W7, Imaginative_W7, Analytic_W8, Independent_W8, Determined_W8, Professional_W8, Studious_W8, Imaginative_W8

Talent
.JSON files columns: name, date, tech_self_score, strengths, weaknesses, self_development, geo_flex, financial_support_self, result, course_interest
.CSV files columns: id, name, gender, dob, email, city, address, postcode, phone_

In [51]:
academy_data = load_academy_data('data-402-final-project', 'Academy/')
talent_data = load_talent_data('data-402-final-project', 'Talent/')

In [52]:
# Extract unique names from academy_data and talent_data
academy_names = set(academy_data['name'])
talent_names = set(talent_data['name'])

# Check if each name in academy_data exists in talent_data
common_names = [name for name in academy_names if name in talent_names]

print("Number of names from academy_data found in talent_data:", len(common_names))

Number of names from academy_data found in talent_data: 397


In [53]:
print("Dimensions of Academy Data:")
print(f"Number of rows: {academy_data.shape[0]}")
print(f"Number of columns: {academy_data.shape[1]}")

Dimensions of Academy Data:
Number of rows: 397
Number of columns: 62


In [54]:
print("\nDimensions of Talent Data:")
print(f"Number of rows: {talent_data.shape[0]}")
print(f"Number of columns: {talent_data.shape[1]}")


Dimensions of Talent Data:
Number of rows: 12082
Number of columns: 176


In [55]:
print("Academy Data:")
print(academy_data.head())

Academy Data:
              name       trainer  Analytic_W1  Independent_W1  Determined_W1  \
0  Quintus Penella  Gregor Gomez            1               2              2   
1     Simon Murrey  Gregor Gomez            6               1              1   
2      Gustaf Lude  Gregor Gomez            6               4              1   
3    Yolanda Fosse  Gregor Gomez            2               1              2   
4     Lynnett Swin  Gregor Gomez            2               2              4   

   Professional_W1  Studious_W1  Imaginative_W1  Analytic_W2  Independent_W2  \
0                1            2               2          NaN             NaN   
1                2            4               2          3.0             1.0   
2                1            2               3          1.0             1.0   
3                3            3               3          4.0             2.0   
4                5            1               2          3.0             2.0   

   ...  Determined_W9  P

In [56]:
print("\nMissing values in Academy Data:")
print(academy_data.isnull().sum())



Missing values in Academy Data:
name                  0
trainer               0
Analytic_W1           0
Independent_W1        0
Determined_W1         0
                   ... 
Independent_W10     235
Determined_W10      235
Professional_W10    235
Studious_W10        235
Imaginative_W10     235
Length: 62, dtype: int64


In [57]:
academy_data_filled = academy_data

In [58]:
print("\nMissing values in Talent Data:")
print(talent_data.isnull().sum())


Missing values in Talent Data:
name                         4286
date                         8977
tech_self_score              9032
strengths                    8977
weaknesses                   8977
                            ...  
Tuesday 9 April 2019        12047
Wednesday 9 January 2019    12060
Tuesday 9 July 2019         12042
Thursday 9 May 2019         12057
Wednesday 9 October 2019    12061
Length: 176, dtype: int64


In [59]:
talent_data_filled = talent_data.fillna({
    'tech_self_score': '{}', 
    'strengths': '[]', 
    'weaknesses': '[]',
    'self_development': 'No', 
    'geo_flex': 'No', 
    'financial_support_self': 'No', 
    'course_interest': 'None'
})

In [60]:
print("\nMissing values in Talent Data:")
print(talent_data.isnull().sum())


Missing values in Talent Data:
name                         4286
date                         8977
tech_self_score              9032
strengths                    8977
weaknesses                   8977
                            ...  
Tuesday 9 April 2019        12047
Wednesday 9 January 2019    12060
Tuesday 9 July 2019         12042
Thursday 9 May 2019         12057
Wednesday 9 October 2019    12061
Length: 176, dtype: int64


In [61]:
# Extract unique names from academy_data and talent_data
academy_names = set(academy_data_filled
['name'])
talent_names = set(talent_data_filled
['name'])

# Check if each name in academy_data exists in talent_data
common_names = [name for name in academy_names if name in talent_names]

print("Number of names from academy_data found in talent_data:", len(common_names))

Number of names from academy_data found in talent_data: 397


In [62]:
# Convert dicts and lists to strings in specified columns
talent_data_filled['tech_self_score'] = talent_data_filled['tech_self_score'].apply(json.dumps)
talent_data_filled['strengths'] = talent_data_filled['strengths'].apply(json.dumps)
talent_data_filled['weaknesses'] = talent_data_filled['weaknesses'].apply(json.dumps)

In [63]:
# Extract unique names from academy_data and talent_data
academy_names = set(academy_data_filled['name'])
talent_names = set(talent_data_filled['name'])

# Check if each name in academy_data exists in talent_data
common_names = [name for name in academy_names if name in talent_names]

print("Number of names from academy_data found in talent_data:", len(common_names))

Number of names from academy_data found in talent_data: 397


In [64]:
# Convert back

import ast

talent_data_filled['tech_self_score'] = talent_data_filled['tech_self_score'].apply(ast.literal_eval)
talent_data_filled['strengths'] = talent_data_filled['strengths'].apply(ast.literal_eval)
talent_data_filled['weaknesses'] = talent_data_filled['weaknesses'].apply(ast.literal_eval)

print("Data types after converting JSON strings back to lists/dictionaries:")
print(talent_data_filled.dtypes)

Data types after converting JSON strings back to lists/dictionaries:
name                        object
date                        object
tech_self_score             object
strengths                   object
weaknesses                  object
                             ...  
Tuesday 9 April 2019        object
Wednesday 9 January 2019    object
Tuesday 9 July 2019         object
Thursday 9 May 2019         object
Wednesday 9 October 2019    object
Length: 176, dtype: object


In [65]:
# Extract unique names from academy_data and talent_data
academy_names = set(academy_data_filled['name'])
talent_names = set(talent_data_filled['name'])

In [66]:
# Extract unique names from academy_data and talent_data
academy_names = set(academy_data_filled['name'])
talent_names = set(talent_data_filled['name'])

# Check if each name in academy_data exists in talent_data
common_names = [name for name in academy_names if name in talent_names]

print("Number of names from academy_data found in talent_data:", len(common_names))

Number of names from academy_data found in talent_data: 397


In [67]:
print(academy_data_filled.dtypes)

name                 object
trainer              object
Analytic_W1           int64
Independent_W1        int64
Determined_W1         int64
                     ...   
Independent_W10     float64
Determined_W10      float64
Professional_W10    float64
Studious_W10        float64
Imaginative_W10     float64
Length: 62, dtype: object


In [68]:
print(talent_data_filled.dtypes)

name                        object
date                        object
tech_self_score             object
strengths                   object
weaknesses                  object
                             ...  
Tuesday 9 April 2019        object
Wednesday 9 January 2019    object
Tuesday 9 July 2019         object
Thursday 9 May 2019         object
Wednesday 9 October 2019    object
Length: 176, dtype: object


In [69]:
for column in talent_data_filled.columns:
    print(f"Unique values in column '{column}':")
    try:
        print(talent_data_filled[column].unique())
    except Exception as e:
        print(f"Error occurred while processing column '{column}': {e}")
    print()

Unique values in column 'name':
['Stillmann Castano' 'Hilary Willmore' 'Efrem Whipple' ...
 'Vivianna Letty' 'Mercie Groger' nan]

Unique values in column 'date':
['22/08/2019' '01/08/2019' '28/08/2019' '07/08/2019' '14/08/2019'
 '29/08/2019' '21/08/2019' '08/08/2019' '15/08/2019' '06/08/2019'
 '13/08/2019' '27/08/2019' '20/08/2019' '13//08/2019' '28//08/2019'
 '18/07/2019' '31/07/2019' '10/07/2019' '23/07/2019' '09/07/2019'
 '16/07/2019' '17/07/2019' '24/07/2019' '25/07/2019' '11/07/2019'
 '02/07/2019' '03/07/2019' '30/07/2019' '04/07/2019' '11//07/2019'
 '17//07/2019' '12/07/2019' '20/07/2019' '06/07/2019' '27/07/2019'
 '05/07/2019' '13/07/2019' '26/07/2019' '19/07/2019' '05//07/2019'
 '25//07/2019' '13//07/2019' '05//12/2019' '10/12/2019' '17/12/2019'
 '18/12/2019' '05/12/2019' '11/12/2019' '04/12/2019' '12/12/2019'
 '19/12/2019' '11//12/2019' '03/12/2019' '18//12/2019' '28/02/2019'
 '13/02/2019' '19/02/2019' '06/02/2019' '12/02/2019' '27/02/2019'
 '20/02/2019' '07/02/2019' '14/02/2

In [70]:
print("\nSummary statistics for Academy Data:")
print(academy_data_filled.describe())


Summary statistics for Academy Data:
       Analytic_W1  Independent_W1  Determined_W1  Professional_W1  \
count   397.000000      397.000000     397.000000       397.000000   
mean      3.120907        3.173804       2.992443         2.957179   
std       1.696845        1.597826       1.548033         1.605131   
min       1.000000        1.000000       1.000000         1.000000   
25%       2.000000        2.000000       2.000000         2.000000   
50%       3.000000        3.000000       3.000000         3.000000   
75%       4.000000        4.000000       4.000000         4.000000   
max       8.000000        8.000000       8.000000         8.000000   

       Studious_W1  Imaginative_W1  Analytic_W2  Independent_W2  \
count   397.000000      397.000000   387.000000      387.000000   
mean      2.921914        3.128463     2.974160        3.038760   
std       1.491203        1.625591     1.594795        1.720359   
min       1.000000        1.000000     1.000000        1.000000

In [71]:
print("\nSummary statistics for Talent Data:")
print(talent_data_filled.describe())


Summary statistics for Talent Data:
                 name        date tech_self_score strengths weaknesses  \
count            7796        3105           12082     12082      12082   
unique           4836         172            2573      1532       1447   
top     Aggi Shilling  18/07/2019              {}        []         []   
freq                3          70            9032      8977       8977   

       self_development geo_flex financial_support_self result  \
count             12082    12082                  12082   3105   
unique                2        2                      2      2   
top                  No       No                     No   Pass   
freq               9282     9304                   9323   2001   

       course_interest  ... Thursday 7 November 2019 Thursday 8 August 2019  \
count            12082  ...                       30                     35   
unique               4  ...                       30                     35   
top               None  

In [72]:
# Merge academy_data_filled and talent_data_filled on the 'name' column
merged_data = pd.merge(academy_data_filled, talent_data_filled, on='name', how='inner')

In [73]:
print("\nMerged Data:")
print(merged_data.head())


Merged Data:
              name       trainer  Analytic_W1  Independent_W1  Determined_W1  \
0  Quintus Penella  Gregor Gomez            1               2              2   
1  Quintus Penella  Gregor Gomez            1               2              2   
2     Simon Murrey  Gregor Gomez            6               1              1   
3     Simon Murrey  Gregor Gomez            6               1              1   
4      Gustaf Lude  Gregor Gomez            6               4              1   

   Professional_W1  Studious_W1  Imaginative_W1  Analytic_W2  Independent_W2  \
0                1            2               2          NaN             NaN   
1                1            2               2          NaN             NaN   
2                2            4               2          3.0             1.0   
3                2            4               2          3.0             1.0   
4                1            2               3          1.0             1.0   

   ...  Thursday 7 Novem

In [76]:
# Extract column names from both academy_data and talent_data
academy_columns = list(academy_data.columns)
talent_columns = list(talent_data.columns)

In [77]:
# Test search for a user
simon_murrey_records = merged_data[merged_data['name'] == 'Simon Murrey']
simon_murrey_records_all_columns = merged_data.loc[merged_data['name'] == 'Simon Murrey']
# Filtered records for Simon Murrey
print("Filtered records for Simon Murrey:")
print(simon_murrey_records)

# All columns for Simon Murrey
print("\nAll columns for Simon Murrey:")
print(simon_murrey_records_all_columns)

Filtered records for Simon Murrey:
           name       trainer  Analytic_W1  Independent_W1  Determined_W1  \
2  Simon Murrey  Gregor Gomez            6               1              1   
3  Simon Murrey  Gregor Gomez            6               1              1   

   Professional_W1  Studious_W1  Imaginative_W1  Analytic_W2  Independent_W2  \
2                2            4               2          3.0             1.0   
3                2            4               2          3.0             1.0   

   ...  Thursday 7 November 2019  Thursday 8 August 2019  \
2  ...                       NaN                     NaN   
3  ...                       NaN                     NaN   

   Tuesday 8 January 2019  Wednesday 8 May 2019  Tuesday 8 October 2019  \
2                     NaN                   NaN                     NaN   
3                     NaN                   NaN                     NaN   

   Tuesday 9 April 2019  Wednesday 9 January 2019  Tuesday 9 July 2019  \
2          

In [78]:
# Test search for a user
gregor_gomez_records = merged_data[merged_data['name'] == 'Gregor Gomez']
gregor_gomez_records_all_columns = merged_data.loc[merged_data['name'] == 'Gregor Gomez']
# Filtered records for Gregor Gomez
print("Filtered records for Gregor Gomez:")
print(gregor_gomez_records)

# All columns for Gregor Gomez
print("\nAll columns for Gregor Gomez:")
print(gregor_gomez_records_all_columns)

# Will return empty df as this is a trainer who will not be in some of the other tables 

Filtered records for Gregor Gomez:
Empty DataFrame
Columns: [name, trainer, Analytic_W1, Independent_W1, Determined_W1, Professional_W1, Studious_W1, Imaginative_W1, Analytic_W2, Independent_W2, Determined_W2, Professional_W2, Studious_W2, Imaginative_W2, Analytic_W3, Independent_W3, Determined_W3, Professional_W3, Studious_W3, Imaginative_W3, Analytic_W4, Independent_W4, Determined_W4, Professional_W4, Studious_W4, Imaginative_W4, Analytic_W5, Independent_W5, Determined_W5, Professional_W5, Studious_W5, Imaginative_W5, Analytic_W6, Independent_W6, Determined_W6, Professional_W6, Studious_W6, Imaginative_W6, Analytic_W7, Independent_W7, Determined_W7, Professional_W7, Studious_W7, Imaginative_W7, Analytic_W8, Independent_W8, Determined_W8, Professional_W8, Studious_W8, Imaginative_W8, Analytic_W9, Independent_W9, Determined_W9, Professional_W9, Studious_W9, Imaginative_W9, Analytic_W10, Independent_W10, Determined_W10, Professional_W10, Studious_W10, Imaginative_W10, date, tech_self_sc

In [91]:
# Test search for a user
luke_able_records = merged_data[merged_data['name'] == 'LUKE ABLE']
luke_able_records_all_columns = merged_data.loc[merged_data['name'] == 'LUKE ABLE']
# Filtered records for LUKE ABLE
print("Filtered records for LUKE ABLE:")
print(luke_able_records)

# All columns for LUKE ABLE
print("\nAll columns for LUKE ABLE:")
print(luke_able_records_all_columns)

# Will return empty df as this is a trainer who will not be in some of the other tables 


#################################################################
# Test search for a user
luke_able_records = merged_data[merged_data['name'] == 'Luke Able']
luke_able_records_all_columns = merged_data.loc[merged_data['name'] == 'Luke Able']
# Filtered records for LUKE ABLE
print("Filtered records for Luke Able:")
print(luke_able_records)

# All columns for LUKE ABLE
print("\nAll columns for Luke Able:")
print(luke_able_records_all_columns)

# Will return empty df as this is a trainer who will not be in some of the other tables 

Filtered records for LUKE ABLE:
Empty DataFrame
Columns: [name, trainer, Analytic_W1, Independent_W1, Determined_W1, Professional_W1, Studious_W1, Imaginative_W1, Analytic_W2, Independent_W2, Determined_W2, Professional_W2, Studious_W2, Imaginative_W2, Analytic_W3, Independent_W3, Determined_W3, Professional_W3, Studious_W3, Imaginative_W3, Analytic_W4, Independent_W4, Determined_W4, Professional_W4, Studious_W4, Imaginative_W4, Analytic_W5, Independent_W5, Determined_W5, Professional_W5, Studious_W5, Imaginative_W5, Analytic_W6, Independent_W6, Determined_W6, Professional_W6, Studious_W6, Imaginative_W6, Analytic_W7, Independent_W7, Determined_W7, Professional_W7, Studious_W7, Imaginative_W7, Analytic_W8, Independent_W8, Determined_W8, Professional_W8, Studious_W8, Imaginative_W8, Analytic_W9, Independent_W9, Determined_W9, Professional_W9, Studious_W9, Imaginative_W9, Analytic_W10, Independent_W10, Determined_W10, Professional_W10, Studious_W10, Imaginative_W10, date, tech_self_score

In [79]:
print(merged_data.dtypes)

name                        object
trainer                     object
Analytic_W1                  int64
Independent_W1               int64
Determined_W1                int64
                             ...  
Tuesday 9 April 2019        object
Wednesday 9 January 2019    object
Tuesday 9 July 2019         object
Thursday 9 May 2019         object
Wednesday 9 October 2019    object
Length: 237, dtype: object


In [80]:
# List all column names 
print("Column names in Merged Data:")
print(merged_data.columns.tolist())

Column names in Merged Data:
['name', 'trainer', 'Analytic_W1', 'Independent_W1', 'Determined_W1', 'Professional_W1', 'Studious_W1', 'Imaginative_W1', 'Analytic_W2', 'Independent_W2', 'Determined_W2', 'Professional_W2', 'Studious_W2', 'Imaginative_W2', 'Analytic_W3', 'Independent_W3', 'Determined_W3', 'Professional_W3', 'Studious_W3', 'Imaginative_W3', 'Analytic_W4', 'Independent_W4', 'Determined_W4', 'Professional_W4', 'Studious_W4', 'Imaginative_W4', 'Analytic_W5', 'Independent_W5', 'Determined_W5', 'Professional_W5', 'Studious_W5', 'Imaginative_W5', 'Analytic_W6', 'Independent_W6', 'Determined_W6', 'Professional_W6', 'Studious_W6', 'Imaginative_W6', 'Analytic_W7', 'Independent_W7', 'Determined_W7', 'Professional_W7', 'Studious_W7', 'Imaginative_W7', 'Analytic_W8', 'Independent_W8', 'Determined_W8', 'Professional_W8', 'Studious_W8', 'Imaginative_W8', 'Analytic_W9', 'Independent_W9', 'Determined_W9', 'Professional_W9', 'Studious_W9', 'Imaginative_W9', 'Analytic_W10', 'Independent_W10'

In [81]:
# Check for missing values
print("Missing values in each column:")
print(merged_data.isnull().sum())

Missing values in each column:
name                          0
trainer                       0
Analytic_W1                   0
Independent_W1                0
Determined_W1                 0
                           ... 
Tuesday 9 April 2019        775
Wednesday 9 January 2019    775
Tuesday 9 July 2019         775
Thursday 9 May 2019         775
Wednesday 9 October 2019    775
Length: 237, dtype: int64


In [82]:
# Check data types
print("Data types of each column:")
print(merged_data.dtypes)

Data types of each column:
name                        object
trainer                     object
Analytic_W1                  int64
Independent_W1               int64
Determined_W1                int64
                             ...  
Tuesday 9 April 2019        object
Wednesday 9 January 2019    object
Tuesday 9 July 2019         object
Thursday 9 May 2019         object
Wednesday 9 October 2019    object
Length: 237, dtype: object


In [83]:
def check_merged_columns(merged_data, academy_columns, talent_columns):
    all_columns = set(academy_columns).union(set(talent_columns))
    missing_columns = all_columns.difference(set(merged_data.columns))
    
    if missing_columns:
        print("The following columns are missing from the merged data:")
        print(", ".join(missing_columns))
    else:
        print("All columns are present in the merged data.")

In [84]:
# Check if all columns are present in the merged dataframe
check_merged_columns(merged_data, academy_columns, talent_columns)

All columns are present in the merged data.


In [92]:
wednesday_oct_9_2019_data = merged_data.loc[:, 'Wednesday 9 October 2019']
print(wednesday_oct_9_2019_data)

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
      ... 
770    NaN
771    NaN
772    NaN
773    NaN
774    NaN
Name: Wednesday 9 October 2019, Length: 775, dtype: object


In [94]:
print(wednesday_oct_9_2019_data.to_frame().columns)


Index(['Wednesday 9 October 2019'], dtype='object')


In [97]:
# Boolean indexing to filter rows where 'Wednesday 9 October 2019' is not NaN
rows_with_values = merged_data[merged_data['Wednesday 9 October 2019'].notna()]

# Display the filtered rows
print(rows_with_values)


Empty DataFrame
Columns: [name, trainer, Analytic_W1, Independent_W1, Determined_W1, Professional_W1, Studious_W1, Imaginative_W1, Analytic_W2, Independent_W2, Determined_W2, Professional_W2, Studious_W2, Imaginative_W2, Analytic_W3, Independent_W3, Determined_W3, Professional_W3, Studious_W3, Imaginative_W3, Analytic_W4, Independent_W4, Determined_W4, Professional_W4, Studious_W4, Imaginative_W4, Analytic_W5, Independent_W5, Determined_W5, Professional_W5, Studious_W5, Imaginative_W5, Analytic_W6, Independent_W6, Determined_W6, Professional_W6, Studious_W6, Imaginative_W6, Analytic_W7, Independent_W7, Determined_W7, Professional_W7, Studious_W7, Imaginative_W7, Analytic_W8, Independent_W8, Determined_W8, Professional_W8, Studious_W8, Imaginative_W8, Analytic_W9, Independent_W9, Determined_W9, Professional_W9, Studious_W9, Imaginative_W9, Analytic_W10, Independent_W10, Determined_W10, Professional_W10, Studious_W10, Imaginative_W10, date, tech_self_score, strengths, weaknesses, self_de

In [98]:
thursday_nov_7_2019_data = merged_data.dropna(subset=['Thursday 7 November 2019'])
print(thursday_nov_7_2019_data)


Empty DataFrame
Columns: [name, trainer, Analytic_W1, Independent_W1, Determined_W1, Professional_W1, Studious_W1, Imaginative_W1, Analytic_W2, Independent_W2, Determined_W2, Professional_W2, Studious_W2, Imaginative_W2, Analytic_W3, Independent_W3, Determined_W3, Professional_W3, Studious_W3, Imaginative_W3, Analytic_W4, Independent_W4, Determined_W4, Professional_W4, Studious_W4, Imaginative_W4, Analytic_W5, Independent_W5, Determined_W5, Professional_W5, Studious_W5, Imaginative_W5, Analytic_W6, Independent_W6, Determined_W6, Professional_W6, Studious_W6, Imaginative_W6, Analytic_W7, Independent_W7, Determined_W7, Professional_W7, Studious_W7, Imaginative_W7, Analytic_W8, Independent_W8, Determined_W8, Professional_W8, Studious_W8, Imaginative_W8, Analytic_W9, Independent_W9, Determined_W9, Professional_W9, Studious_W9, Imaginative_W9, Analytic_W10, Independent_W10, Determined_W10, Professional_W10, Studious_W10, Imaginative_W10, date, tech_self_score, strengths, weaknesses, self_de

In [99]:
print("test")

test


# TXT FILE WORK

In [None]:
## TXT FILE EXPLORATION #############################################

In [100]:
def list_txt_files(bucket, prefix):
    all_files = list_all_objects(bucket, prefix)
    txt_files = [file for file in all_files if file.endswith('.txt')]
    return txt_files

# Replace 'your-bucket-name' and 'your-prefix' with your bucket and prefix
bucket_name = 'data-402-final-project'
prefix = 'Talent/'

txt_files = list_txt_files(bucket_name, prefix)

print("List of .txt file titles:")
for txt_file in txt_files:
    print(txt_file)

List of .txt file titles:
Talent/Sparta Day 1 August 2019.txt
Talent/Sparta Day 1 May 2019.txt
Talent/Sparta Day 1 October 2019.txt
Talent/Sparta Day 10 April 2019.txt
Talent/Sparta Day 10 December 2019.txt
Talent/Sparta Day 10 January 2019.txt
Talent/Sparta Day 10 July 2019.txt
Talent/Sparta Day 10 October 2019.txt
Talent/Sparta Day 10 September 2019.txt
Talent/Sparta Day 11 April 2019.txt
Talent/Sparta Day 11 December 2019.txt
Talent/Sparta Day 11 July 2019.txt
Talent/Sparta Day 11 June 2019.txt
Talent/Sparta Day 11 September 2019.txt
Talent/Sparta Day 12 December 2019.txt
Talent/Sparta Day 12 February 2019.txt
Talent/Sparta Day 12 June 2019.txt
Talent/Sparta Day 12 March 2019.txt
Talent/Sparta Day 12 November 2019.txt
Talent/Sparta Day 12 September 2019.txt
Talent/Sparta Day 13 August 2019.txt
Talent/Sparta Day 13 February 2019.txt
Talent/Sparta Day 13 June 2019.txt
Talent/Sparta Day 13 March 2019.txt
Talent/Sparta Day 13 November 2019.txt
Talent/Sparta Day 14 August 2019.txt
Talent

In [101]:
def get_txt_file_contents(bucket, key):
    # Retrieve the object from S3
    response = s3.get_object(Bucket=bucket, Key=key)
    # Read the content of the file
    content = response['Body'].read().decode('utf-8')
    return content

# Replace 'your-bucket-name' with the name of your bucket
bucket_name = 'data-402-final-project'
# Replace 'your-file-key.txt' with the key of the file you want to read
file_key = 'Talent/Sparta Day 15 October 2019.txt'

# Get the contents of the specified .txt file
file_contents = get_txt_file_contents(bucket_name, file_key)

# Print the contents of the file
print(file_contents)


Tuesday 15 October 2019
London Academy

IVOR QUOGAN -  Psychometrics: 39/100, Presentation: 16/32
THIBAUT EDBROOKE -  Psychometrics: 49/100, Presentation: 13/32
CHANCEY BITTEN -  Psychometrics: 50/100, Presentation: 18/32
NOLAND ORIGIN -  Psychometrics: 55/100, Presentation: 20/32
CORRENA SUTHEREL -  Psychometrics: 51/100, Presentation: 26/32
DORETTA HALLOR -  Psychometrics: 54/100, Presentation: 20/32
ADDI LARTER -  Psychometrics: 43/100, Presentation: 20/32
TIERNEY TWATT -  Psychometrics: 51/100, Presentation: 19/32
WILEEN GARTENFELD -  Psychometrics: 54/100, Presentation: 20/32
ILEANE MARGARSON -  Psychometrics: 63/100, Presentation: 23/32
MATEO HANBURRY -  Psychometrics: 58/100, Presentation: 21/32
HOGAN RAFFAN -  Psychometrics: 53/100, Presentation: 27/32
CREIGHTON ESTRELLA -  Psychometrics: 59/100, Presentation: 21/32
BRYNA DELLE -  Psychometrics: 51/100, Presentation: 20/32
CRISTY RUTLEDGE -  Psychometrics: 51/100, Presentation: 23/32
CODI KEAR -  Psychometrics: 64/100, Presenta

In [105]:
def parse_txt_content(content):
    lines = content.strip().split('\n')
    date = lines[0]
    academy = lines[1]
    data = []
    for line in lines[2:]:
        match = re.match(r'(.+?) -\s+Psychometrics:\s+(\d+)/100,\s+Presentation:\s+(\d+)/32', line)
        if match:
            name = match.group(1).strip().title()
            psychometrics = match.group(2).strip()
            presentation = match.group(3).strip()
            data.append([date, academy, name, psychometrics, presentation])
    return data

def combine_txt_files(bucket, prefix):
    txt_files = list_txt_files(bucket, prefix)
    all_data = []
    for txt_file in txt_files:
        content = get_txt_file_contents(bucket, txt_file)
        file_data = parse_txt_content(content)
        all_data.extend(file_data)
    
    df = pd.DataFrame(all_data, columns=['date', 'academy', 'name', 'psychometric_score', 'presentation_score'])
    return df

# Replace 'your-bucket-name' with the name of your bucket
bucket_name = 'data-402-final-project'
prefix = 'Talent/'

# Combine all txt files into a single DataFrame
combined_df = combine_txt_files(bucket_name, prefix)

# Display the combined DataFrame
print(combined_df)

# Save the combined DataFrame to a CSV file (optional)
combined_df.to_csv('combined_data.csv', index=False)


                            date               academy              name  \
0       Thursday 1 August 2019\r  Birmingham Academy\r   Hilary Willmore   
1       Thursday 1 August 2019\r  Birmingham Academy\r       Orly Lorens   
2       Thursday 1 August 2019\r  Birmingham Academy\r    Alvie Bleackly   
3       Thursday 1 August 2019\r  Birmingham Academy\r     Deck Itzchaki   
4       Thursday 1 August 2019\r  Birmingham Academy\r      Wilt Penritt   
...                          ...                   ...               ...   
4129  Wednesday 9 October 2019\r      London Academy\r     Godiva Andrew   
4130  Wednesday 9 October 2019\r      London Academy\r    Doralia Gapper   
4131  Wednesday 9 October 2019\r      London Academy\r        Doe Eisold   
4132  Wednesday 9 October 2019\r      London Academy\r      Judy Finders   
4133  Wednesday 9 October 2019\r      London Academy\r  Lorinda O'Crotty   

     psychometric_score presentation_score  
0                    51                 19

In [106]:
# Filter the combined DataFrame for rows where the 'name' column is 'Doralia Gapper'
doralia_gapper_df = combined_df[combined_df['name'] == 'Doralia Gapper']

# Display the filtered DataFrame
print(doralia_gapper_df)


                            date           academy            name  \
4130  Wednesday 9 October 2019\r  London Academy\r  Doralia Gapper   

     psychometric_score presentation_score  
4130                 55                 22  


In [107]:
# Clean the combined DataFrame
combined_df['date'] = combined_df['date'].str.strip()
combined_df['academy'] = combined_df['academy'].str.strip()
combined_df['name'] = combined_df['name'].str.strip()
combined_df['psychometric_score'] = combined_df['psychometric_score'].astype(int)
combined_df['presentation_score'] = combined_df['presentation_score'].astype(int)

# Display the combined DataFrame
print(combined_df)

# Save the combined DataFrame to a CSV file (optional)
combined_df.to_csv('combined_data.csv', index=False)

# Filter the DataFrame for rows where the 'name' column is 'Doralia Gapper'
doralia_gapper_df = combined_df[combined_df['name'].str.lower() == 'doralia gapper'.lower()]

# Display the filtered DataFrame
print(doralia_gapper_df)

                          date             academy              name  \
0       Thursday 1 August 2019  Birmingham Academy   Hilary Willmore   
1       Thursday 1 August 2019  Birmingham Academy       Orly Lorens   
2       Thursday 1 August 2019  Birmingham Academy    Alvie Bleackly   
3       Thursday 1 August 2019  Birmingham Academy     Deck Itzchaki   
4       Thursday 1 August 2019  Birmingham Academy      Wilt Penritt   
...                        ...                 ...               ...   
4129  Wednesday 9 October 2019      London Academy     Godiva Andrew   
4130  Wednesday 9 October 2019      London Academy    Doralia Gapper   
4131  Wednesday 9 October 2019      London Academy        Doe Eisold   
4132  Wednesday 9 October 2019      London Academy      Judy Finders   
4133  Wednesday 9 October 2019      London Academy  Lorinda O'Crotty   

      psychometric_score  presentation_score  
0                     51                  19  
1                     51                 

In [108]:
# Set pandas display options to show more columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)

# Display the filtered DataFrame for 'Doralia Gapper'
print(doralia_gapper_df)


                          date         academy            name  psychometric_score  presentation_score
4130  Wednesday 9 October 2019  London Academy  Doralia Gapper                  55                  22


In [109]:
# Iterate through the DataFrame and print each row
for index, row in combined_df.iterrows():
    print(f"{row['date']}  {row['academy']}  {row['name']}  {row['psychometric_score']}  {row['presentation_score']}")


Thursday 1 August 2019  Birmingham Academy  Hilary Willmore  51  19
Thursday 1 August 2019  Birmingham Academy  Orly Lorens  51  19
Thursday 1 August 2019  Birmingham Academy  Alvie Bleackly  55  16
Thursday 1 August 2019  Birmingham Academy  Deck Itzchaki  59  21
Thursday 1 August 2019  Birmingham Academy  Wilt Penritt  66  25
Thursday 1 August 2019  Birmingham Academy  Gianna Clevely  61  16
Thursday 1 August 2019  Birmingham Academy  Yvor Phalip  59  18
Thursday 1 August 2019  Birmingham Academy  Marielle Knivett  52  18
Thursday 1 August 2019  Birmingham Academy  Florri Loughton  63  28
Thursday 1 August 2019  Birmingham Academy  Walden Gidden  49  25
Thursday 1 August 2019  Birmingham Academy  Ingunna Adin  60  20
Thursday 1 August 2019  Birmingham Academy  Janaya Mawford  52  18
Thursday 1 August 2019  Birmingham Academy  Cornall Offer  56  26
Thursday 1 August 2019  Birmingham Academy  Brnaby Bownes  59  26
Thursday 1 August 2019  Birmingham Academy  Sherline Cudihy  52  16
Thur

In [110]:
# Iterate through each row in the DataFrame and print it
for index, row in combined_df.iterrows():
    print(row)


date                  Thursday 1 August 2019
academy                   Birmingham Academy
name                         Hilary Willmore
psychometric_score                        51
presentation_score                        19
Name: 0, dtype: object
date                  Thursday 1 August 2019
academy                   Birmingham Academy
name                             Orly Lorens
psychometric_score                        51
presentation_score                        19
Name: 1, dtype: object
date                  Thursday 1 August 2019
academy                   Birmingham Academy
name                          Alvie Bleackly
psychometric_score                        55
presentation_score                        16
Name: 2, dtype: object
date                  Thursday 1 August 2019
academy                   Birmingham Academy
name                           Deck Itzchaki
psychometric_score                        59
presentation_score                        21
Name: 3, dtype: object
date    

In [111]:
# Check for missing values in the combined DataFrame
missing_values = combined_df.isnull().sum()

# Print the missing values
print("Missing values in combined_df:")
print(missing_values)


Missing values in combined_df:
date                  0
academy               0
name                  0
psychometric_score    0
presentation_score    0
dtype: int64


In [114]:
# Convert the 'date' column to date format
combined_df['date'] = pd.to_datetime(combined_df['date']).dt.date

# Verify the changes
print(combined_df.dtypes)



date                  object
academy               object
name                  object
psychometric_score     int32
presentation_score     int32
dtype: object


In [115]:
# Iterate through the DataFrame and print each row
for index, row in combined_df.iterrows():
    print(f"{row['date']}  {row['academy']}  {row['name']}  {row['psychometric_score']}  {row['presentation_score']}")

2019-08-01  Birmingham Academy  Hilary Willmore  51  19
2019-08-01  Birmingham Academy  Orly Lorens  51  19
2019-08-01  Birmingham Academy  Alvie Bleackly  55  16
2019-08-01  Birmingham Academy  Deck Itzchaki  59  21
2019-08-01  Birmingham Academy  Wilt Penritt  66  25
2019-08-01  Birmingham Academy  Gianna Clevely  61  16
2019-08-01  Birmingham Academy  Yvor Phalip  59  18
2019-08-01  Birmingham Academy  Marielle Knivett  52  18
2019-08-01  Birmingham Academy  Florri Loughton  63  28
2019-08-01  Birmingham Academy  Walden Gidden  49  25
2019-08-01  Birmingham Academy  Ingunna Adin  60  20
2019-08-01  Birmingham Academy  Janaya Mawford  52  18
2019-08-01  Birmingham Academy  Cornall Offer  56  26
2019-08-01  Birmingham Academy  Brnaby Bownes  59  26
2019-08-01  Birmingham Academy  Sherline Cudihy  52  16
2019-08-01  Birmingham Academy  Anallise Scathard  62  13
2019-08-01  Birmingham Academy  Benson Labuschagne  64  13
2019-08-01  Birmingham Academy  Norene Fierro  58  22
2019-08-01  B