# Ingestion and Cleaning of the academy csv files
In the academy bucket, there is data stored in csv files. This data describes each candidate's assessed scores across their time in the academy.

In [82]:
import os, json, csv, boto3, datetime
import pandas as pd

In [83]:
# Create the S3 client; the listing and loading functions in this notebook
# reach it through the module-level name `s3`.
s3 = boto3.client('s3')

### We create a function to list all objects within a bucket

In [84]:
def list_objects(bucket, prefix):
    """Return the keys found under *prefix* in *bucket*, minus the prefix key.

    Issues a single ``list_objects_v2`` request through the module-level
    ``s3`` client, so only the keys contained in that one response are
    returned. A response without a ``'Contents'`` entry yields ``[]``.
    """
    response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
    if 'Contents' not in response:
        return []

    keys = []
    for entry in response['Contents']:
        key = entry['Key']
        # The prefix itself can be listed as a key; leave it out.
        if key != prefix:
            keys.append(key)
    return keys

In [85]:
def list_all_objects(bucket, prefix):
    """Return every key under *prefix* in *bucket*, following pagination.

    Calls ``list_objects_v2`` on the module-level ``s3`` client repeatedly,
    feeding each response's ``NextContinuationToken`` back in as
    ``ContinuationToken`` until a response carries no such token.
    """
    keys = []
    request = {'Bucket': bucket, 'Prefix': prefix}

    while True:
        page = s3.list_objects_v2(**request)

        if 'Contents' in page:
            keys += [entry['Key'] for entry in page['Contents']]

        next_token = page.get('NextContinuationToken')
        if not next_token:
            # No further pages to fetch.
            return keys

        request['ContinuationToken'] = next_token

## We load in the academy data.
Since we know that the academy bucket only contains these csv files, we don't need to perform any checks.

In [86]:
def load_academy_data(bucket, prefix):
    """Load every CSV object under *prefix* in *bucket* into one DataFrame.

    Each object is read with ``pd.read_csv`` and gains a leading ``filename``
    column holding the object's base name (last path component with its
    final extension removed), so per-file metadata can be derived from it.

    Args:
        bucket: Name of the S3 bucket to read from.
        prefix: Key prefix selecting the objects to load.

    Returns:
        A single DataFrame containing the rows of all loaded files, with a
        fresh 0-based index.

    Raises:
        ValueError: If there is nothing to load under the prefix.
    """
    data_frames = []
    for file_key in list_all_objects(bucket, prefix):
        # Keys ending in "/" are folder placeholders (e.g. the prefix itself)
        # and have no CSV content to parse.
        if file_key.endswith("/"):
            continue
        obj = s3.get_object(Bucket=bucket, Key=file_key)
        df = pd.read_csv(obj['Body'])
        # Take the LAST path component and strip only the final extension, so
        # nested prefixes and dotted file names still give the right name.
        base_name = file_key.rsplit("/", 1)[-1]
        file_name = base_name.rsplit(".", 1)[0]
        df.insert(0, 'filename', file_name, True)
        data_frames.append(df)

    if not data_frames:
        # pd.concat would raise ValueError here anyway; say what went wrong.
        raise ValueError(f"No objects to load under s3://{bucket}/{prefix}")
    return pd.concat(data_frames, ignore_index=True)

In [94]:
# Load every object under the 'Academy/' prefix of the project bucket into a
# single DataFrame.
academy_data = load_academy_data('data-402-final-project', 'Academy/')

In [119]:
def clean_whitespace(text):
    """Return *text* with leading and trailing whitespace removed.

    Values that have no ``strip`` method (numbers, ``None``, NaN, dates, ...)
    are returned unchanged, so the function can be applied to columns of any
    dtype.
    """
    try:
        return text.strip()
    except AttributeError:
        # Not a string-like value; a bare ``except`` here would also have
        # swallowed KeyboardInterrupt and genuine bugs.
        return text

In [121]:
# Strip leading/trailing whitespace from the values of every column;
# clean_whitespace hands back values it cannot strip unchanged.
cols = list(academy_data.columns.values)
for col in cols:
    academy_data[col] = academy_data[col].apply(clean_whitespace)

['Category', 'Stream', 'Date', 'name', 'trainer', 'Analytic_W1', 'Independent_W1', 'Determined_W1', 'Professional_W1', 'Studious_W1', 'Imaginative_W1', 'Analytic_W2', 'Independent_W2', 'Determined_W2', 'Professional_W2', 'Studious_W2', 'Imaginative_W2', 'Analytic_W3', 'Independent_W3', 'Determined_W3', 'Professional_W3', 'Studious_W3', 'Imaginative_W3', 'Analytic_W4', 'Independent_W4', 'Determined_W4', 'Professional_W4', 'Studious_W4', 'Imaginative_W4', 'Analytic_W5', 'Independent_W5', 'Determined_W5', 'Professional_W5', 'Studious_W5', 'Imaginative_W5', 'Analytic_W6', 'Independent_W6', 'Determined_W6', 'Professional_W6', 'Studious_W6', 'Imaginative_W6', 'Analytic_W7', 'Independent_W7', 'Determined_W7', 'Professional_W7', 'Studious_W7', 'Imaginative_W7', 'Analytic_W8', 'Independent_W8', 'Determined_W8', 'Professional_W8', 'Studious_W8', 'Imaginative_W8', 'Analytic_W9', 'Independent_W9', 'Determined_W9', 'Professional_W9', 'Studious_W9', 'Imaginative_W9', 'Analytic_W10', 'Independent_W10

### Next we need to extract the category, stream name, and start date from the file name

In [95]:
def get_category(filename):
    """Return the category: the text before the first underscore in *filename*.

    When *filename* contains no underscore, the whole string is returned.
    """
    category, _, _ = filename.partition("_")
    return category

In [96]:
# Derive the Category column from each row's source file name.
academy_data['Category'] = academy_data['filename'].apply(get_category)

In [97]:
def get_stream(filename):
    """Return the stream name built from *filename*.

    The first two underscore-separated parts are concatenated with no
    separator; if there is only one part, that part is returned alone.
    """
    leading_parts = filename.split("_", 2)[:2]
    return "".join(leading_parts)

In [98]:
# Derive the Stream column from each row's source file name.
academy_data['Stream'] = academy_data['filename'].apply(get_stream)

In [99]:
def get_date(filename):
    """Return the date held in the third underscore-separated part of *filename*.

    That part is parsed with the ``%Y-%m-%d`` format and returned as a
    ``datetime.date``.
    """
    date_text = filename.split("_")[2]
    parsed = datetime.datetime.strptime(date_text, "%Y-%m-%d")
    return parsed.date()

In [100]:
# Derive the Date column from each row's source file name.
academy_data['Date'] = academy_data['filename'].apply(get_date)

In [101]:
# Put the derived Category/Stream/Date columns first, keep the remaining
# columns in their existing order, and leave out the helper 'filename' column.
academy_data = academy_data[
    ['Category', 'Stream', 'Date']
    + [
        column
        for column in academy_data.columns
        if column not in ('Category', 'Stream', 'Date', 'filename')
    ]
]

#### Next we check for any duplicated values

In [103]:
# Count the rows flagged by DataFrame.duplicated(), i.e. rows that repeat an
# earlier row in every column.
academy_data.duplicated().sum()

0

There are no duplicate values in our table
### Finally, we look to see where there are null values
We don't want any null values in Category, Stream, Date, name, and trainer.

In [105]:
# Count the null values in each column.
academy_data.isnull().sum()

Category              0
Stream                0
Date                  0
name                  0
trainer               0
                   ... 
Independent_W10     235
Determined_W10      235
Professional_W10    235
Studious_W10        235
Imaginative_W10     235
Length: 65, dtype: int64

In [116]:
# Cast the Category column to string dtype.
academy_data['Category'] = academy_data['Category'].astype('str')

In [None]:
# Show the resulting DataFrame.
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter built-in — confirm if this is ever run as a plain script.
display(academy_data)