In [98]:
!pip install boto3



In [99]:
import boto3
import pandas as pd
import json
import csv
import os

In [100]:
s3 = boto3.client('s3')

In [101]:
def list_objects(bucket, prefix):
    response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
    if 'Contents' in response:
        return [obj['Key'] for obj in response['Contents'] if obj['Key'] != prefix]
    return []

In [102]:
def list_all_objects(bucket, prefix):
    all_objects = []
    continuation_token = None
    
    while True:
        if continuation_token:
            response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, ContinuationToken=continuation_token)
        else:
            response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
        
        if 'Contents' in response:
            all_objects.extend([obj['Key'] for obj in response['Contents']])
        
        if not response.get('NextContinuationToken'):
            break
        
        continuation_token = response['NextContinuationToken']
    
    return all_objects


In [103]:
def get_file_types(bucket, prefix):
    files = list_all_objects(bucket, prefix)
    file_types = {}
    for file in files:
        ext = os.path.splitext(file)[1].lower()
        if ext in file_types:
            file_types[ext] += 1
        else:
            file_types[ext] = 1
    return file_types

In [104]:
def print_file_types(bucket, prefix, name):
    file_types = get_file_types(bucket, prefix)
    print(f"{name}")
    for ext, count in file_types.items():
        print(f"{ext.upper()} files: {count}")
    other_files = sum([count for ext, count in file_types.items() if ext not in ['.csv', '.json', '.txt']])
    print(f"Other file types: {other_files}")
    print()


In [105]:
def load_academy_data(bucket, prefix):
    files = list_all_objects(bucket, prefix)
    data_frames = []
    for file_key in files:
        obj = s3.get_object(Bucket=bucket, Key=file_key)
        df = pd.read_csv(obj['Body'])
        data_frames.append(df)
    return pd.concat(data_frames, ignore_index=True)

In [106]:
def load_talent_data(bucket, prefix):
    files = list_all_objects(bucket, prefix)
    records = []
    for file_key in files:
        obj = s3.get_object(Bucket=bucket, Key=file_key)
        content = obj['Body'].read().decode('utf-8')
        try:
            # Attempt to load the content as JSON
            data = json.loads(content)
            records.append(data)
        except json.JSONDecodeError:
            # If content is not JSON, try to parse as CSV
            try:
                csv_reader = csv.DictReader(content.splitlines())
                # Convert CSV rows to dicts & append to records list
                for row in csv_reader:
                    records.append(row)
            except Exception as e:
                print(f"Error loading file {file_key}: {e}")
                # Handle the error appropriately,-> skip the file or handle it differently
    return pd.DataFrame(records)

In [107]:
def list_columns(bucket, prefix, name):
    files = list_all_objects(bucket, prefix)
    columns = {}
    for file_key in files:
        obj = s3.get_object(Bucket=bucket, Key=file_key)
        ext = os.path.splitext(file_key)[1].lower()
        if ext == '.csv':
            df = pd.read_csv(obj['Body'])
            columns[ext] = list(df.columns)
        elif ext == '.json':
            content = obj['Body'].read().decode('utf-8')
            data = json.loads(content)
            if isinstance(data, list):
                columns[ext] = list(data[0].keys())
            elif isinstance(data, dict):
                columns[ext] = list(data.keys())
        elif ext == '.txt':
            content = obj['Body'].read().decode('utf-8')
            first_line = content.splitlines()[0]
            columns[ext] = first_line.split()
        else:
            columns[ext] = ['no columns']
    
    print(f"{name}")
    for ext, cols in columns.items():
        print(f"{ext.upper()} files columns: {', '.join(cols) if cols else 'no columns'}")
    print()

In [108]:
def check_merged_columns(merged_data, academy_columns, talent_columns):
    all_columns = set(academy_columns).union(set(talent_columns))
    missing_columns = all_columns.difference(set(merged_data.columns))
    
    if missing_columns:
        print("The following columns are missing from the merged data:")
        print(", ".join(missing_columns))
    else:
        print("All columns are present in the merged data.")

In [109]:
print_file_types('data-402-final-project', 'Academy/', 'Academy')
print_file_types('data-402-final-project', 'Talent/', 'Talent')

Academy
.CSV files: 36
Other file types: 0

Talent
.JSON files: 3105
.CSV files: 12
.TXT files: 152
Other file types: 0



In [110]:
list_columns('data-402-final-project', 'Academy/', 'Academy')
list_columns('data-402-final-project', 'Talent/', 'Talent')

Academy
.CSV files columns: name, trainer, Analytic_W1, Independent_W1, Determined_W1, Professional_W1, Studious_W1, Imaginative_W1, Analytic_W2, Independent_W2, Determined_W2, Professional_W2, Studious_W2, Imaginative_W2, Analytic_W3, Independent_W3, Determined_W3, Professional_W3, Studious_W3, Imaginative_W3, Analytic_W4, Independent_W4, Determined_W4, Professional_W4, Studious_W4, Imaginative_W4, Analytic_W5, Independent_W5, Determined_W5, Professional_W5, Studious_W5, Imaginative_W5, Analytic_W6, Independent_W6, Determined_W6, Professional_W6, Studious_W6, Imaginative_W6, Analytic_W7, Independent_W7, Determined_W7, Professional_W7, Studious_W7, Imaginative_W7, Analytic_W8, Independent_W8, Determined_W8, Professional_W8, Studious_W8, Imaginative_W8

Talent
.JSON files columns: name, date, tech_self_score, strengths, weaknesses, self_development, geo_flex, financial_support_self, result, course_interest
.CSV files columns: id, name, gender, dob, email, city, address, postcode, phone_

In [111]:
academy_data = load_academy_data('data-402-final-project', 'Academy/')
talent_data = load_talent_data('data-402-final-project', 'Talent/')

In [112]:
# Extract unique names from academy_data and talent_data
academy_names = set(academy_data['name'])
talent_names = set(talent_data['name'])

In [113]:
# Check if each name in academy_data exists in talent_data
common_names = [name for name in academy_names if name in talent_names]

In [114]:
print("Number of names from academy_data found in talent_data:", len(common_names))

Number of names from academy_data found in talent_data: 397


In [115]:
print("Dimensions of Academy Data:")
print(f"Number of rows: {academy_data.shape[0]}")
print(f"Number of columns: {academy_data.shape[1]}")

Dimensions of Academy Data:
Number of rows: 397
Number of columns: 62


In [116]:
print("\nDimensions of Talent Data:")
print(f"Number of rows: {talent_data.shape[0]}")
print(f"Number of columns: {talent_data.shape[1]}")


Dimensions of Talent Data:
Number of rows: 12082
Number of columns: 176


In [117]:
print("Academy Data:")
print(academy_data.head())

Academy Data:
              name       trainer  Analytic_W1  Independent_W1  Determined_W1  \
0  Quintus Penella  Gregor Gomez            1               2              2   
1     Simon Murrey  Gregor Gomez            6               1              1   
2      Gustaf Lude  Gregor Gomez            6               4              1   
3    Yolanda Fosse  Gregor Gomez            2               1              2   
4     Lynnett Swin  Gregor Gomez            2               2              4   

   Professional_W1  Studious_W1  Imaginative_W1  Analytic_W2  Independent_W2  \
0                1            2               2          NaN             NaN   
1                2            4               2          3.0             1.0   
2                1            2               3          1.0             1.0   
3                3            3               3          4.0             2.0   
4                5            1               2          3.0             2.0   

   ...  Determined_W9  P

In [118]:
print("\nTalent Data:")
print(talent_data.head())


Talent Data:
                name        date  \
0  Stillmann Castano  22/08/2019   
1    Hilary Willmore  01/08/2019   
2      Efrem Whipple  22/08/2019   
3        Sydel Fenne  28/08/2019   
4    Michel Lebarree  07/08/2019   

                                     tech_self_score  \
0      {'C#': 6, 'Java': 5, 'R': 2, 'JavaScript': 2}   
1        {'Python': 1, 'C#': 4, 'Java': 2, 'C++': 4}   
2                              {'Ruby': 4, 'C++': 4}   
3                             {'Java': 3, 'SPSS': 4}   
4  {'Python': 3, 'Java': 4, 'Ruby': 1, 'R': 2, 'P...   

                             strengths                            weaknesses  \
0                           [Charisma]  [Distracted, Impulsive, Introverted]   
1  [Patient, Curious, Problem Solving]    [Overbearing, Chatty, Indifferent]   
2    [Courteous, Independent, Patient]     [Introverted, Impulsive, Anxious]   
3                         [Passionate]            [Perfectionist, Sensitive]   
4                          [Vers

In [119]:
print("\nMissing values in Academy Data:")
print(academy_data.isnull().sum())



Missing values in Academy Data:
name                  0
trainer               0
Analytic_W1           0
Independent_W1        0
Determined_W1         0
                   ... 
Independent_W10     235
Determined_W10      235
Professional_W10    235
Studious_W10        235
Imaginative_W10     235
Length: 62, dtype: int64


In [120]:
academy_data_filled = academy_data.fillna(0)

In [121]:
print("\nMissing values in Academy Data:")
print(academy_data.isnull().sum())


Missing values in Academy Data:
name                  0
trainer               0
Analytic_W1           0
Independent_W1        0
Determined_W1         0
                   ... 
Independent_W10     235
Determined_W10      235
Professional_W10    235
Studious_W10        235
Imaginative_W10     235
Length: 62, dtype: int64


In [122]:
print("\nMissing values in Talent Data:")
print(talent_data.isnull().sum())


Missing values in Talent Data:
name                         4286
date                         8977
tech_self_score              9032
strengths                    8977
weaknesses                   8977
                            ...  
Tuesday 9 April 2019        12047
Wednesday 9 January 2019    12060
Tuesday 9 July 2019         12042
Thursday 9 May 2019         12057
Wednesday 9 October 2019    12061
Length: 176, dtype: int64


In [123]:
talent_data_filled = talent_data.fillna({
    'tech_self_score': '{}', 
    'strengths': '[]', 
    'weaknesses': '[]',
    'self_development': 'No', 
    'geo_flex': 'No', 
    'financial_support_self': 'No', 
    'course_interest': 'None'
})

In [124]:
print("\nMissing values in Talent Data:")
print(talent_data.isnull().sum())


Missing values in Talent Data:
name                         4286
date                         8977
tech_self_score              9032
strengths                    8977
weaknesses                   8977
                            ...  
Tuesday 9 April 2019        12047
Wednesday 9 January 2019    12060
Tuesday 9 July 2019         12042
Thursday 9 May 2019         12057
Wednesday 9 October 2019    12061
Length: 176, dtype: int64


In [125]:
# Extract unique names from academy_data and talent_data
academy_names = set(academy_data_filled['name'])
talent_names = set(talent_data_filled['name'])

In [126]:
# Check if each name in academy_data exists in talent_data
common_names = [name for name in academy_names if name in talent_names]

In [127]:
print("Number of names from academy_data found in talent_data:", len(common_names))

Number of names from academy_data found in talent_data: 397


In [128]:
# Convert dicts and lists to strings in specified columns
talent_data_filled['tech_self_score'] = talent_data_filled['tech_self_score'].apply(json.dumps)
talent_data_filled['strengths'] = talent_data_filled['strengths'].apply(json.dumps)
talent_data_filled['weaknesses'] = talent_data_filled['weaknesses'].apply(json.dumps)

In [129]:
############ THIS IS A TESTTTR!!!!!!!!!!

# Extract unique names from academy_data and talent_data
academy_names = set(academy_data_filled['name'])
talent_names = set(talent_data_filled['name'])

In [130]:
# Check if each name in academy_data exists in talent_data
common_names = [name for name in academy_names if name in talent_names]

In [131]:
print("Number of names from academy_data found in talent_data:", len(common_names))

Number of names from academy_data found in talent_data: 397


In [132]:
import ast

talent_data_filled['tech_self_score'] = talent_data_filled['tech_self_score'].apply(ast.literal_eval)
talent_data_filled['strengths'] = talent_data_filled['strengths'].apply(ast.literal_eval)
talent_data_filled['weaknesses'] = talent_data_filled['weaknesses'].apply(ast.literal_eval)

print("Data types after converting JSON strings back to lists/dictionaries:")
print(talent_data_filled.dtypes)

Data types after converting JSON strings back to lists/dictionaries:
name                        object
date                        object
tech_self_score             object
strengths                   object
weaknesses                  object
                             ...  
Tuesday 9 April 2019        object
Wednesday 9 January 2019    object
Tuesday 9 July 2019         object
Thursday 9 May 2019         object
Wednesday 9 October 2019    object
Length: 176, dtype: object


In [133]:
############ THIS IS A TESTTTR!!!!!!!!!!

# Extract unique names from academy_data and talent_data
academy_names = set(academy_data_filled['name'])
talent_names = set(talent_data_filled['name'])

In [134]:
# Check if each name in academy_data exists in talent_data
common_names = [name for name in academy_names if name in talent_names]

In [135]:
print("Number of names from academy_data found in talent_data:", len(common_names))

Number of names from academy_data found in talent_data: 397


In [136]:
# Merge academy_data_filled and talent_data_filled on the 'name' column
merged_data = pd.merge(academy_data_filled, talent_data_filled, on='name', how='inner')

In [137]:
print("\nMerged Data:")
print(merged_data.head())



Merged Data:
              name       trainer  Analytic_W1  Independent_W1  Determined_W1  \
0  Quintus Penella  Gregor Gomez            1               2              2   
1  Quintus Penella  Gregor Gomez            1               2              2   
2     Simon Murrey  Gregor Gomez            6               1              1   
3     Simon Murrey  Gregor Gomez            6               1              1   
4      Gustaf Lude  Gregor Gomez            6               4              1   

   Professional_W1  Studious_W1  Imaginative_W1  Analytic_W2  Independent_W2  \
0                1            2               2          0.0             0.0   
1                1            2               2          0.0             0.0   
2                2            4               2          3.0             1.0   
3                2            4               2          3.0             1.0   
4                1            2               3          1.0             1.0   

   ...  Thursday 7 Novem

In [138]:
# Extract column names from both academy_data and talent_data
academy_columns = list(academy_data.columns)
talent_columns = list(talent_data.columns)

In [139]:
# Check if all columns are present in the merged dataframe
check_merged_columns(merged_data, academy_columns, talent_columns)

All columns are present in the merged data.


In [140]:
# Test search for a user
simon_murrey_records = merged_data[merged_data['name'] == 'Simon Murrey']
simon_murrey_records_all_columns = merged_data.loc[merged_data['name'] == 'Simon Murrey']
# Filtered records for Simon Murrey
print("Filtered records for Simon Murrey:")
print(simon_murrey_records)

# All columns for Simon Murrey
print("\nAll columns for Simon Murrey:")
print(simon_murrey_records_all_columns)

Filtered records for Simon Murrey:
           name       trainer  Analytic_W1  Independent_W1  Determined_W1  \
2  Simon Murrey  Gregor Gomez            6               1              1   
3  Simon Murrey  Gregor Gomez            6               1              1   

   Professional_W1  Studious_W1  Imaginative_W1  Analytic_W2  Independent_W2  \
2                2            4               2          3.0             1.0   
3                2            4               2          3.0             1.0   

   ...  Thursday 7 November 2019  Thursday 8 August 2019  \
2  ...                       NaN                     NaN   
3  ...                       NaN                     NaN   

   Tuesday 8 January 2019  Wednesday 8 May 2019  Tuesday 8 October 2019  \
2                     NaN                   NaN                     NaN   
3                     NaN                   NaN                     NaN   

   Tuesday 9 April 2019  Wednesday 9 January 2019  Tuesday 9 July 2019  \
2          

In [145]:
# Test search for a user
gregor_gomez_records = merged_data[merged_data['name'] == 'Gregor Gomez']
gregor_gomez_records_all_columns = merged_data.loc[merged_data['name'] == 'Gregor Gomez']
# Filtered records for Gregor Gomez
print("Filtered records for Gregor Gomez:")
print(gregor_gomez_records)

# All columns for Gregor Gomez
print("\nAll columns for Gregor Gomez:")
print(gregor_gomez_records_all_columns)

# Will return empty df as this is a trainer who will not be in some of the other tables 

Filtered records for Gregor Gomez:
Empty DataFrame
Columns: [name, trainer, Analytic_W1, Independent_W1, Determined_W1, Professional_W1, Studious_W1, Imaginative_W1, Analytic_W2, Independent_W2, Determined_W2, Professional_W2, Studious_W2, Imaginative_W2, Analytic_W3, Independent_W3, Determined_W3, Professional_W3, Studious_W3, Imaginative_W3, Analytic_W4, Independent_W4, Determined_W4, Professional_W4, Studious_W4, Imaginative_W4, Analytic_W5, Independent_W5, Determined_W5, Professional_W5, Studious_W5, Imaginative_W5, Analytic_W6, Independent_W6, Determined_W6, Professional_W6, Studious_W6, Imaginative_W6, Analytic_W7, Independent_W7, Determined_W7, Professional_W7, Studious_W7, Imaginative_W7, Analytic_W8, Independent_W8, Determined_W8, Professional_W8, Studious_W8, Imaginative_W8, Analytic_W9, Independent_W9, Determined_W9, Professional_W9, Studious_W9, Imaginative_W9, Analytic_W10, Independent_W10, Determined_W10, Professional_W10, Studious_W10, Imaginative_W10, date, tech_self_sc

In [142]:
# Fill all NaN values in the entire dataframe with 0
#merged_data_filled = merged_data.fillna(0)

# Print the first few rows to verify
#print("Merged Data with NaN values filled with 0:")
#print(merged_data_filled.head())


In [143]:
# Fill all NaN values in the entire dataframe with 0
#merged_data_filled = merged_data.fillna(0)

# Test search for a user
#gregor_gomez_records = merged_data_filled[merged_data_filled['name'] == 'Gregor Gomez']
#gregor_gomez_records_all_columns = merged_data_filled.loc[merged_data_filled['name'] == 'Gregor Gomez']

# Filtered records for Gregor Gomez
#print("Filtered records for Gregor Gomez:")
#print(gregor_gomez_records)

# All columns for Gregor Gomez
#print("\nAll columns for Gregor Gomez:")
#print(gregor_gomez_records_all_columns)


In [144]:
#print("Unique names in the dataset:")
#print(merged_data_filled['name'].unique())
