In [42]:
import pandas as pd
import numpy as np
import datetime
import os, glob, json, csv
import boto3
import re

In [2]:
# Shared S3 client; the listing and loading helpers in this notebook read it
# through the module-level name `s3`.
s3 = boto3.client('s3')

In [3]:
def list_objects(bucket, prefix):
    """Return the object keys found under ``prefix`` in ``bucket``.

    Issues a single ``list_objects_v2`` call through the module-level ``s3``
    client, so only the first page of results is returned (no continuation
    token is followed).  The key that equals ``prefix`` itself is skipped.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix to filter on.

    Returns:
        A list of key strings; empty when the response has no ``Contents``.
    """
    listing = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
    keys = []
    for entry in listing.get('Contents', []):
        key = entry['Key']
        if key != prefix:
            keys.append(key)
    return keys

In [4]:
def list_all_objects(bucket, prefix):
    """Return every object key under ``prefix`` in ``bucket``, across all pages.

    Calls ``list_objects_v2`` on the module-level ``s3`` client repeatedly,
    feeding each response's ``NextContinuationToken`` into the next request,
    until a response carries no such token.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix to filter on.

    Returns:
        A list of key strings in the order the pages returned them.
    """
    keys = []
    request = {'Bucket': bucket, 'Prefix': prefix}

    while True:
        page = s3.list_objects_v2(**request)
        for entry in page.get('Contents', []):
            keys.append(entry['Key'])

        next_token = page.get('NextContinuationToken')
        if not next_token:
            return keys

        # Resume the listing where this page stopped.
        request['ContinuationToken'] = next_token

In [5]:
def load_talent_data(bucket, prefix):
    """Load every CSV object under ``prefix`` into one concatenated DataFrame.

    Each CSV is read with ``pd.read_csv`` and gets a ``filename`` column
    (inserted at position 1) derived from the object's base name: the part
    before the first dot, minus its last 10 characters.

    Args:
        bucket: Name of the S3 bucket to read from.
        prefix: Key prefix under which the CSV objects live.

    Returns:
        A DataFrame with the rows of all CSVs and a fresh 0..n-1 index.

    Raises:
        ValueError: If no ``.csv`` object exists under ``prefix``.
    """
    # NOTE(review): presumably every file name ends in a fixed 10-character
    # suffix (the rendered output shows values such as 'April2019') - confirm.
    suffix_length = 10

    data_frames = []
    for file_key in list_all_objects(bucket, prefix):
        if not file_key.endswith(".csv"):
            continue
        obj = s3.get_object(Bucket=bucket, Key=file_key)
        df = pd.read_csv(obj['Body'])
        # Use the last path segment so nested prefixes (or keys without any
        # '/') still yield the file's own name rather than a folder name.
        base_name = file_key.rsplit("/", 1)[-1]
        file_name = base_name.split(".")[0][:-suffix_length]
        df.insert(1, 'filename', file_name, True)
        data_frames.append(df)

    if not data_frames:
        # pd.concat([]) would also raise ValueError, but without saying where
        # the data was expected.
        raise ValueError(f"No CSV objects found under s3://{bucket}/{prefix}")
    return pd.concat(data_frames, ignore_index=True)

In [6]:
# Load every CSV found under the 'Talent/' prefix of the project bucket into one frame.
talent_data = load_talent_data('data-402-final-project', 'Talent/')

In [7]:
# Preview the freshly loaded frame before any cleaning is applied.
talent_data

Unnamed: 0,id,filename,name,gender,dob,email,city,address,postcode,phone_number,uni,degree,invited_date,month,invited_by
0,1,April2019,Esme Trusslove,Female,04/08/1994,etrusslove0@google.es,Swindon,22056 Lerdahl Avenue,SN1,+44-295-783-0228,"Saint George's Hospital Medical School, Univer...",2:1,10.0,April 2019,Bruno Bellbrook
1,2,April2019,Matthaeus Audas,Male,,maudas1@mapquest.com,Charlton,263 Nelson Trail,OX12,+44-957-728-0155,Keele University,2:1,30.0,April 2019,Doris Bellasis
2,3,April2019,Cherey Tollfree,Female,08/12/1992,ctollfree2@netvibes.com,Weston,69 Coleman Court,GU32,+44-588-749-6002,"King's College London, University of London",2:1,25.0,April 2019,Gismo Tilling
3,4,April2019,Eryn Speers,Female,,espeers3@shinystat.com,Sutton,0166 Daystar Drive,CT15,+44 148 787 0613,University of Edinburgh,2:1,,,
4,5,April2019,Theadora Berkelay,Female,03/11/1995,tberkelay4@godaddy.com,Upton,6 Mandrake Crossing,WF9,+44 (841) 468-3619,University of Leicester,2:1,2.0,April 2019,Stacey Broad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4686,416,Sept2019,Clyve Gillhespy,Male,15/12/1995,cgillhespybj@buzzfeed.com,Birmingham,33 Almo Avenue,B40,+44-904-343-2218,University of Liverpool,2:1,26.0,SEPT 2019,Bruno Bellbrook
4687,417,Sept2019,Vaclav Pietesch,Male,09/11/1994,vpieteschbk@mac.com,Whitwell,6476 Hoffman Terrace,DL10,+44 455 631 6125,Sheffield Hallam University,2:2,12.0,SEPT 2019,Stacey Broad
4688,418,Sept2019,Kassi Lucio,Female,24/04/1994,kluciobl@exblog.jp,Normanton,06 Fulton Center,LE15,+44-834-342-9323,University of Buckingham,2:1,3.0,SEPT 2019,Fifi Eton
4689,419,Sept2019,Vivianna Letty,Female,,vlettybm@google.com.hk,Sheffield,78314 Fisk Plaza,S1,+44-534-758-3140,Leeds Metropolitan University,1st,19.0,SEPT 2019,Bruno Bellbrook


In [8]:
#talent_data.isna().sum()

In [9]:
#talent_data[talent_data["invited_by"].isna()]

In [10]:
talent_data.dtypes  # column dtypes as loaded, before any cleaning

id                int64
filename         object
name             object
gender           object
dob              object
email            object
city             object
address          object
postcode         object
phone_number     object
uni              object
degree           object
invited_date    float64
month            object
invited_by       object
dtype: object

## Phone Numbers Column Cleaning

In [11]:
def clean_phone_numbers(phone_no):
    """Strip formatting characters from a phone number.

    Removes every hyphen, space and parenthesis, e.g.
    ``'+44 (841) 468-3619'`` becomes ``'+448414683619'``.

    Args:
        phone_no: The raw phone number; may be a non-string such as NaN.

    Returns:
        The cleaned string, or ``phone_no`` unchanged when it is not a string.
    """
    # Explicit type check instead of a bare ``except`` so unrelated errors
    # (and KeyboardInterrupt) are never silently swallowed.
    if not isinstance(phone_no, str):
        return phone_no
    for item in ("-", " ", "(", ")"):
        phone_no = phone_no.replace(item, "")
    return phone_no

In [12]:
# Normalise every phone number element-wise; missing values pass through unchanged.
talent_data['phone_number'] = talent_data['phone_number'].map(clean_phone_numbers)

In [13]:
#clean_phone_numbers(talent_data["phone_number"][0])

In [14]:
# Count nulls per column among the rows that have no inviter recorded.
talent_data.loc[talent_data["invited_by"].isna()].isna().sum()

id                0
filename          0
name              0
gender           18
dob              22
email            34
city              9
address           9
postcode          9
phone_number     48
uni              16
degree           16
invited_date    557
month           557
invited_by      557
dtype: int64

In [15]:
#talent_data["month"].unique()

In [16]:
#talent_data["filename"].unique()

In [17]:
def clean_month(month_name):
    """Normalise a '<month> <year>' label to '<Month>-<year>'.

    The last four characters are taken as the year and everything before
    them as the month name, which is capitalised; the abbreviation 'Sept'
    is expanded to 'September'.  For example ``'SEPT 2019'`` becomes
    ``'September-2019'``.

    Args:
        month_name: The raw label; may be a non-string such as NaN or None.

    Returns:
        The normalised label, or ``month_name`` unchanged when it is not a
        string or contains only whitespace.
    """
    # NaN is truthy, so a plain truthiness test would let it reach the
    # slicing below and raise TypeError.
    if not isinstance(month_name, str):
        return month_name
    # Strip first so trailing whitespace cannot shift the 4-character year.
    text = month_name.strip()
    if not text:
        return month_name
    month_part = text[:-4].strip().capitalize()
    year = text[-4:].strip()
    if month_part == "Sept":
        month_part = "September"
    return month_part + '-' + year

In [18]:
# Normalise populated month labels; missing values are passed through untouched.
talent_data['month'] = talent_data['month'].apply(
    lambda raw_month: raw_month if pd.isnull(raw_month) else clean_month(raw_month)
)

In [19]:
# Confirm that a row without an invitation still holds a missing month.
# pd.isna tests the value itself (NaN, None or NaT), whereas `is np.nan` only
# matches the np.nan singleton; .loc also avoids chained indexing.
pd.isna(talent_data.loc[3, "month"])

True

In [20]:
#talent_data

## Sparta Day Column

In [21]:
def combine_date_and_month(invited_day, month):
    """Build a date from a day-of-month number and a '<Month>-<year>' label.

    Args:
        invited_day: Day of the month; anything ``int()`` accepts
            (the column holds floats such as ``10.0``).
        month: Label such as ``'April-2019'`` (full month name, 4-digit year).

    Returns:
        A ``datetime.date`` when both parts combine into a valid date;
        otherwise ``month`` is returned unchanged (e.g. NaN day, NaN month,
        or an impossible date such as 31 April).
    """
    try:
        invited_date = str(int(invited_day)) + '-' + month
        # %B matches the full month name in the current locale.
        return datetime.datetime.strptime(invited_date, '%d-%B-%Y').date()
    except (TypeError, ValueError, OverflowError):
        # int(NaN) / bad date text -> ValueError, non-string month or
        # None day -> TypeError, int(inf) -> OverflowError.
        return month

In [22]:
# Pair each row's day number with its month label and combine them into one value.
talent_data['sparta_day_date'] = [
    combine_date_and_month(day, month_label)
    for day, month_label in zip(talent_data['invited_date'], talent_data['month'])
]

In [23]:
# Preview the new sparta_day_date column next to the invited_date and month columns it was built from.
talent_data

Unnamed: 0,id,filename,name,gender,dob,email,city,address,postcode,phone_number,uni,degree,invited_date,month,invited_by,sparta_day_date
0,1,April2019,Esme Trusslove,Female,04/08/1994,etrusslove0@google.es,Swindon,22056 Lerdahl Avenue,SN1,+442957830228,"Saint George's Hospital Medical School, Univer...",2:1,10.0,April-2019,Bruno Bellbrook,2019-04-10
1,2,April2019,Matthaeus Audas,Male,,maudas1@mapquest.com,Charlton,263 Nelson Trail,OX12,+449577280155,Keele University,2:1,30.0,April-2019,Doris Bellasis,2019-04-30
2,3,April2019,Cherey Tollfree,Female,08/12/1992,ctollfree2@netvibes.com,Weston,69 Coleman Court,GU32,+445887496002,"King's College London, University of London",2:1,25.0,April-2019,Gismo Tilling,2019-04-25
3,4,April2019,Eryn Speers,Female,,espeers3@shinystat.com,Sutton,0166 Daystar Drive,CT15,+441487870613,University of Edinburgh,2:1,,,,
4,5,April2019,Theadora Berkelay,Female,03/11/1995,tberkelay4@godaddy.com,Upton,6 Mandrake Crossing,WF9,+448414683619,University of Leicester,2:1,2.0,April-2019,Stacey Broad,2019-04-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4686,416,Sept2019,Clyve Gillhespy,Male,15/12/1995,cgillhespybj@buzzfeed.com,Birmingham,33 Almo Avenue,B40,+449043432218,University of Liverpool,2:1,26.0,September-2019,Bruno Bellbrook,2019-09-26
4687,417,Sept2019,Vaclav Pietesch,Male,09/11/1994,vpieteschbk@mac.com,Whitwell,6476 Hoffman Terrace,DL10,+444556316125,Sheffield Hallam University,2:2,12.0,September-2019,Stacey Broad,2019-09-12
4688,418,Sept2019,Kassi Lucio,Female,24/04/1994,kluciobl@exblog.jp,Normanton,06 Fulton Center,LE15,+448343429323,University of Buckingham,2:1,3.0,September-2019,Fifi Eton,2019-09-03
4689,419,Sept2019,Vivianna Letty,Female,,vlettybm@google.com.hk,Sheffield,78314 Fisk Plaza,S1,+445347583140,Leeds Metropolitan University,1st,19.0,September-2019,Bruno Bellbrook,2019-09-19


In [24]:
# The day and month parts are now merged into sparta_day_date, so drop them.
# errors='ignore' keeps this cell re-runnable once the columns are gone.
talent_data = talent_data.drop(columns=['invited_date', 'month'], errors='ignore')

In [25]:
# for item in talent_data["dob"]:
#     print(item, type(item))

In [26]:
def dobs_to_datetime(date_ofb):
    """Parse a 'dd/mm/YYYY' date-of-birth string into a ``datetime.date``.

    Args:
        date_ofb: The raw value; may be a non-string such as NaN.

    Returns:
        The parsed ``datetime.date``, or ``date_ofb`` unchanged when it is
        not a string or does not match the 'dd/mm/YYYY' format.
    """
    try:
        return datetime.datetime.strptime(date_ofb, '%d/%m/%Y').date()
    except (TypeError, ValueError):
        # TypeError: not a string (e.g. NaN); ValueError: wrong format.
        return date_ofb

In [27]:
# Convert each date-of-birth string to a date object; unparseable values stay as they are.
talent_data["dob"] = talent_data["dob"].map(dobs_to_datetime)

In [28]:
# List the distinct values present in the degree column.
talent_data["degree"].unique()

array(['2:1', '1st', '3rd', nan, '2:2'], dtype=object)

## Capitalising the names

In [29]:
def capital_names(name_str):
    """Title-case a name, e.g. ``'esme trusslove'`` -> ``'Esme Trusslove'``.

    Args:
        name_str: The raw name; may be a non-string such as NaN.

    Returns:
        ``name_str.title()`` for strings; any other value unchanged.
    """
    # Explicit type check instead of a bare ``except`` that would hide
    # unrelated failures.
    if not isinstance(name_str, str):
        return name_str
    return name_str.title()

In [30]:
# Title-case every candidate name element-wise.
talent_data['name'] = talent_data['name'].map(capital_names)

## Capitalising the street addresses

In [31]:
def capital_addresses(address_str):
    """Title-case a street address, e.g. ``'6 mandrake crossing'`` -> ``'6 Mandrake Crossing'``.

    Args:
        address_str: The raw address; may be a non-string such as NaN.

    Returns:
        ``address_str.title()`` for strings; any other value unchanged.
    """
    # Explicit type check instead of a bare ``except`` that would hide
    # unrelated failures.
    if not isinstance(address_str, str):
        return address_str
    return address_str.title()

In [32]:
# Title-case every street address element-wise.
talent_data['address'] = talent_data['address'].map(capital_addresses)

In [33]:
# Preview the frame after date parsing and title-casing of names and addresses.
talent_data

Unnamed: 0,id,filename,name,gender,dob,email,city,address,postcode,phone_number,uni,degree,invited_by,sparta_day_date
0,1,April2019,Esme Trusslove,Female,1994-08-04,etrusslove0@google.es,Swindon,22056 Lerdahl Avenue,SN1,+442957830228,"Saint George's Hospital Medical School, Univer...",2:1,Bruno Bellbrook,2019-04-10
1,2,April2019,Matthaeus Audas,Male,,maudas1@mapquest.com,Charlton,263 Nelson Trail,OX12,+449577280155,Keele University,2:1,Doris Bellasis,2019-04-30
2,3,April2019,Cherey Tollfree,Female,1992-12-08,ctollfree2@netvibes.com,Weston,69 Coleman Court,GU32,+445887496002,"King's College London, University of London",2:1,Gismo Tilling,2019-04-25
3,4,April2019,Eryn Speers,Female,,espeers3@shinystat.com,Sutton,0166 Daystar Drive,CT15,+441487870613,University of Edinburgh,2:1,,
4,5,April2019,Theadora Berkelay,Female,1995-11-03,tberkelay4@godaddy.com,Upton,6 Mandrake Crossing,WF9,+448414683619,University of Leicester,2:1,Stacey Broad,2019-04-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4686,416,Sept2019,Clyve Gillhespy,Male,1995-12-15,cgillhespybj@buzzfeed.com,Birmingham,33 Almo Avenue,B40,+449043432218,University of Liverpool,2:1,Bruno Bellbrook,2019-09-26
4687,417,Sept2019,Vaclav Pietesch,Male,1994-11-09,vpieteschbk@mac.com,Whitwell,6476 Hoffman Terrace,DL10,+444556316125,Sheffield Hallam University,2:2,Stacey Broad,2019-09-12
4688,418,Sept2019,Kassi Lucio,Female,1994-04-24,kluciobl@exblog.jp,Normanton,06 Fulton Center,LE15,+448343429323,University of Buckingham,2:1,Fifi Eton,2019-09-03
4689,419,Sept2019,Vivianna Letty,Female,,vlettybm@google.com.hk,Sheffield,78314 Fisk Plaza,S1,+445347583140,Leeds Metropolitan University,1st,Bruno Bellbrook,2019-09-19


In [34]:
# Re-check the column dtypes now that invited_date/month have been replaced by sparta_day_date.
talent_data.dtypes

id                  int64
filename           object
name               object
gender             object
dob                object
email              object
city               object
address            object
postcode           object
phone_number       object
uni                object
degree             object
invited_by         object
sparta_day_date    object
dtype: object

## Removing the whitespace

In [35]:
def whitespace_delete(column):
    """Strip leading and trailing whitespace from a single cell value.

    Despite the parameter name, this is applied to one value at a time.

    Args:
        column: A cell value; may be a non-string (number, date, NaN).

    Returns:
        ``column.strip()`` for strings; any other value unchanged.
    """
    # Explicit type check instead of a bare ``except`` that would hide
    # unrelated failures.
    if not isinstance(column, str):
        return column
    return column.strip()

In [36]:
# Trim surrounding whitespace from every cell, one column at a time;
# whitespace_delete leaves non-string values as they are.
for column in talent_data.columns:
    talent_data[column] = talent_data[column].map(whitespace_delete)

In [37]:
# Preview the frame after whitespace has been trimmed from every column.
talent_data

Unnamed: 0,id,filename,name,gender,dob,email,city,address,postcode,phone_number,uni,degree,invited_by,sparta_day_date
0,1,April2019,Esme Trusslove,Female,1994-08-04,etrusslove0@google.es,Swindon,22056 Lerdahl Avenue,SN1,+442957830228,"Saint George's Hospital Medical School, Univer...",2:1,Bruno Bellbrook,2019-04-10
1,2,April2019,Matthaeus Audas,Male,,maudas1@mapquest.com,Charlton,263 Nelson Trail,OX12,+449577280155,Keele University,2:1,Doris Bellasis,2019-04-30
2,3,April2019,Cherey Tollfree,Female,1992-12-08,ctollfree2@netvibes.com,Weston,69 Coleman Court,GU32,+445887496002,"King's College London, University of London",2:1,Gismo Tilling,2019-04-25
3,4,April2019,Eryn Speers,Female,,espeers3@shinystat.com,Sutton,0166 Daystar Drive,CT15,+441487870613,University of Edinburgh,2:1,,
4,5,April2019,Theadora Berkelay,Female,1995-11-03,tberkelay4@godaddy.com,Upton,6 Mandrake Crossing,WF9,+448414683619,University of Leicester,2:1,Stacey Broad,2019-04-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4686,416,Sept2019,Clyve Gillhespy,Male,1995-12-15,cgillhespybj@buzzfeed.com,Birmingham,33 Almo Avenue,B40,+449043432218,University of Liverpool,2:1,Bruno Bellbrook,2019-09-26
4687,417,Sept2019,Vaclav Pietesch,Male,1994-11-09,vpieteschbk@mac.com,Whitwell,6476 Hoffman Terrace,DL10,+444556316125,Sheffield Hallam University,2:2,Stacey Broad,2019-09-12
4688,418,Sept2019,Kassi Lucio,Female,1994-04-24,kluciobl@exblog.jp,Normanton,06 Fulton Center,LE15,+448343429323,University of Buckingham,2:1,Fifi Eton,2019-09-03
4689,419,Sept2019,Vivianna Letty,Female,,vlettybm@google.com.hk,Sheffield,78314 Fisk Plaza,S1,+445347583140,Leeds Metropolitan University,1st,Bruno Bellbrook,2019-09-19


In [38]:
# Display a copy sorted alphabetically by name; talent_data itself is not reassigned here.
talent_data.sort_values(by=['name'])

Unnamed: 0,id,filename,name,gender,dob,email,city,address,postcode,phone_number,uni,degree,invited_by,sparta_day_date
3344,224,May2019,Aarika Preist,Female,1994-09-29,apreist67@webmd.com,Sutton,75 Meadow Vale Trail,CT15,+449499251536,"Heythrop College, University of London",1st,Bruno Bellbrook,2019-05-08
3615,104,Nov2019,Aaron Bowers,Male,2000-05-08,abowers2v@umn.edu,Kinloch,01676 Mockingbird Street,PH43,+448575875195,"Institute of Germanic Studies, University of L...",2:2,,
935,104,Dec2019,Ab Macphail,Male,1998-06-06,amacphail2v@umn.edu,Aston,2461 Clemons Hill,TF6,+443663561499,University of Lancaster,2:1,,
1311,195,Feb2019,Abbe Ballard,Female,1993-02-23,,Newport,50 Calypso Hill,NR29,+441526940109,University of Birmingham,2:1,Sunny Sladefield,2019-02-21
2753,1,March2019,Abbe Hanny,Female,1999-11-07,,Glasgow,5 Kim Way,G4,+442683228947,University of Birmingham,2:2,Bruno Bellbrook,2019-03-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2984,232,March2019,Zonda Babbidge,Female,1993-01-07,zbabbidge6f@theguardian.com,Eaton,60905 Shelley Trail,DN22,+441347305492,University of Wolverhampton,2:2,Sunny Sladefield,2019-03-07
4513,243,Sept2019,Zondra Fippe,Female,1991-05-11,zfippe6q@netvibes.com,Weston,51 Kinsman Avenue,GU32,+448681530051,Stratford College London,2:1,Fifi Etton,2019-09-17
4187,316,Oct2019,Zondra Lindgren,Female,1997-08-04,zlindgren8r@redcross.org,,,,+447724672913,London College of Science & Technology,3rd,Fifi Etton,2019-10-16
4307,37,Sept2019,Zorah Grasser,Female,1993-03-16,zgrasser10@hao123.com,Swindon,39 Myrtle Way,SN1,+444233329884,University of Worcester,2:2,Doris Bellasis,2019-09-19


In [39]:
def capitalise_names(name):
    """Title-case a name, e.g. ``'aaron bowers'`` -> ``'Aaron Bowers'``.

    Args:
        name: The raw name; may be a non-string such as NaN.

    Returns:
        ``name.title()`` for strings; any other value unchanged, so a
        missing name no longer raises AttributeError.
    """
    if not isinstance(name, str):
        return name
    return name.title()

In [40]:
# Title-case the names again with capitalise_names (the column was already
# title-cased earlier in the notebook via capital_names).
talent_data['name'] = talent_data['name'].map(capitalise_names)

In [46]:
def remove_wildcards(name):
    """Remove stray characters (digits, punctuation) from a name.

    Keeps letters, apostrophes, spaces and hyphens; everything else is
    dropped, e.g. ``'Keen Bentham3'`` -> ``'Keen Bentham'``.  Letters are
    recognised with ``str.isalpha`` so accented characters such as 'ë'
    survive; an ASCII-only ``[A-Za-z]`` class would delete them.

    Args:
        name: The raw name; may be a non-string such as NaN.

    Returns:
        The filtered string, or ``name`` unchanged when it is not a string.
    """
    if not isinstance(name, str):
        return name
    allowed_punctuation = "' -"
    return "".join(ch for ch in name if ch.isalpha() or ch in allowed_punctuation)

In [47]:
# Spot-check remove_wildcards on names containing a trailing dot, a digit and a semicolon.
for name in (
    'Chrisse Santostefano.',
    'Keen Bentham3',
    'L;Urette Daveley',
):
    cleaned = remove_wildcards(name)
    print(cleaned)

Chrisse Santostefano
Keen Bentham
LUrette Daveley


In [48]:
# Strip stray digits and punctuation from every name element-wise.
talent_data['name'] = talent_data['name'].map(remove_wildcards)

In [49]:
# Preview the frame after stray characters have been removed from the name column.
talent_data

Unnamed: 0,id,filename,name,gender,dob,email,city,address,postcode,phone_number,uni,degree,invited_by,sparta_day_date
0,1,April2019,Esme Trusslove,Female,1994-08-04,etrusslove0@google.es,Swindon,22056 Lerdahl Avenue,SN1,+442957830228,"Saint George's Hospital Medical School, Univer...",2:1,Bruno Bellbrook,2019-04-10
1,2,April2019,Matthaeus Audas,Male,,maudas1@mapquest.com,Charlton,263 Nelson Trail,OX12,+449577280155,Keele University,2:1,Doris Bellasis,2019-04-30
2,3,April2019,Cherey Tollfree,Female,1992-12-08,ctollfree2@netvibes.com,Weston,69 Coleman Court,GU32,+445887496002,"King's College London, University of London",2:1,Gismo Tilling,2019-04-25
3,4,April2019,Eryn Speers,Female,,espeers3@shinystat.com,Sutton,0166 Daystar Drive,CT15,+441487870613,University of Edinburgh,2:1,,
4,5,April2019,Theadora Berkelay,Female,1995-11-03,tberkelay4@godaddy.com,Upton,6 Mandrake Crossing,WF9,+448414683619,University of Leicester,2:1,Stacey Broad,2019-04-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4686,416,Sept2019,Clyve Gillhespy,Male,1995-12-15,cgillhespybj@buzzfeed.com,Birmingham,33 Almo Avenue,B40,+449043432218,University of Liverpool,2:1,Bruno Bellbrook,2019-09-26
4687,417,Sept2019,Vaclav Pietesch,Male,1994-11-09,vpieteschbk@mac.com,Whitwell,6476 Hoffman Terrace,DL10,+444556316125,Sheffield Hallam University,2:2,Stacey Broad,2019-09-12
4688,418,Sept2019,Kassi Lucio,Female,1994-04-24,kluciobl@exblog.jp,Normanton,06 Fulton Center,LE15,+448343429323,University of Buckingham,2:1,Fifi Eton,2019-09-03
4689,419,Sept2019,Vivianna Letty,Female,,vlettybm@google.com.hk,Sheffield,78314 Fisk Plaza,S1,+445347583140,Leeds Metropolitan University,1st,Bruno Bellbrook,2019-09-19
