In [1]:
import pandas as pd
from utils import generate_student_id

Lets load the 

In [2]:
df = pd.read_csv("data/Raw_Camp_Data.csv")

Remove unecessary rows

In [3]:
df = df.drop(df.index[34:])

The column names are quite long, lets change that by mapping the orignal column name to our desired column name using a dictionary. We then use the *rename* method in pandas to change the dataframes column names

In [4]:
column_mapping = {
    "Name of Participant (First Name, Last Name)": "Full Name",
    "Nick Name": "Nickname",
    "Payment": "Payment Received",
    "School Attending": "School",
    "Main Contact e-mail address": "Main Contact E-Mail",
    "Main Contact phone number": "Main Contact Number",
    "Optional Second e-mail address": "Secondary E-Mail",
    "Choose your group": "Age Group",
    "Optional Second phone number": "Secondary Phone Number",
    "Thank your for signing up, please add any comments you would like us to know about.": "Additional Comments",
    "I, the parent or guardian of the player named above, acknowledge that when my child is playing/participating/performing basketball activities s/he may suffer injury. I release Top Flight Basketball Co. Ltd from any liability concerning any injury or harm suffered by my child during or as a consequence of participation in the activities.": "Injury Liability Waver",
    "I, the parent or legal guardian of the child named above grant Top Flight basketball Co. Ltd my permission to use the photographs taken at basketball sessions for any legal use, including but not limited to: publicity, copyright purposes, illustration, advertising, and web content. Furthermore, I understand that no royalty, fee or other compensation shall become payable to me by reason of such use.": "Photograph Release Agreement",
    "Once booking made payments can be made to the following bank account: Top Flight Basketball Company Limited - HSBC – 023-697444-838. Please send us proof of payment to INFO@TOPFLIGHTHONGKONG.COM with your child's name indicated.": "Payment Instruction Acknowledgement"
}

df.rename(columns=column_mapping, inplace=True)

In [5]:
# df.head()

Lets remove anything that would give away the identity of our customers or columns we dont need. This would include columns such as "Email Address", "Nickname", "Main Contact Number", "Secondary Phone Number", "Main Contact E-Mail", "Age Group" and  "Secondary E-Mail". *We will not include "Full Name" as we will need it for the student id.*

In [6]:
id_cols_remove = ["Email Address", "Main Contact Number", "Main Contact E-Mail", "Secondary E-Mail", "Nickname", "Secondary Phone Number", "Age Group"]
[df.drop(x, axis=1, inplace=True) for x in id_cols_remove]

[None, None, None, None, None, None, None]

In [7]:
# df

After Skimming, I noticed some missing information. Two names are missing, lets fill them in row 32 and 33. They also attended every class, and are male. There are also four students missing information, lets add that information.

In [8]:
# Change misspelling in row 31 and Fill in Names at row 32 and 33 (Names are made up here)
df.at[31,'Full Name'] = 'Callum Wong'
df.at[32, 'Full Name'] = 'Anson Amande'
df.at[33, 'Full Name'] = 'Atlas Amande'
# Parent filled in two names, but only one of them attended
df.at[15, 'Full Name'] = 'George Tilton'
# Fill in Attendance at row 32 and 33 from Aug 19 - Aug 23
df.iloc[32, 2:7] = 'TRUE'
df.iloc[33, 2:7] = 'TRUE'
# Fill in Gender at row 32 and 33.
df.at[32, 'Gender'] = 'Male'
df.at[33, 'Gender'] = 'Male'
# Fill in missing Date of Birth for row 30:34
df.at[30,'Date of Birth'] = '1/1/2017'
df.at[31,'Date of Birth'] = '1/1/2017'
df.at[32,'Date of Birth'] = '10/14/2009'
df.at[33,'Date of Birth'] = '5/31/2008'
# Fill in missing Ages for row 30:34
df.at[30,'Age'] = '9'
df.at[31,'Age'] = '14'
df.at[32,'Age'] = '14'
df.at[33,'Age'] = '16'
# Fill in missing Schools
df.at[30,'School'] = 'Unknown'
df.at[31,'School'] = 'Canadian International School'
df.at[32,'School'] = 'DSC'
df.at[33,'School'] = 'DSC'

# df

In [9]:
school_mappings = {
    "DSC İnternational School": "DSC International School",
    "DSC": "DSC International School",
    "Dsc International School": "DSC International School",
    "ESF Island School": "ESF Island School",
    "Esf South Island School": "ESF South Island School",
    "South Island School": "ESF South Island School",
    "HKIS": "Hong Kong International School",
    "Hkis": "Hong Kong International School",
    "IMS": "International Montessori School",
    "Silvermine bay school": "Silvermine Bay School",
    "French International School": "French International School",
    "Victoria Shanghai Academy": "Victoria Shanghai Academy",
    "Hong Kong Harrow International School": "Harrow International School",
    "The ISF School": "ISF Academy",
    "CDNIS": "Canadian International School",
    "YMCA Christian Academy": "YMCA Christian Academy",
    "Chinese International School": "Chinese International School",
    "CIS": "Chinese International School",
    "Kellett": "Kellett School",
    "ESF SIS": "ESF South Island School",
    "Harrow School": "Harrow International School",
    "SJPS": "St. Joseph's Primary School",
    "AISHK": "Australian International School",
    "Australian International School": "Australian International School"
}

In [10]:
def standardize_school_name(school_name):
    return school_mappings.get(school_name.strip(), school_name)

df['School'] = df['School'].apply(standardize_school_name)

In [11]:
df['School'] = df['School'].apply(lambda x: ' '.join(word if word.upper() == 'ESF' else word.title() for word in x.split()))

In [12]:
# df

Lets create the student ID. We will use the Initials of First, Middle and Last Names followed by a hyphen ("-"), then the Year and Month of their date of birth (YYYYMM), and their Gender (Female = 0 and Male = 1) For example: Scott Matthew Summers 1977/09/22 Male = SMS-197709-01

In [13]:
student_id_cols = df[["Full Name", "Date of Birth", "Gender"]]
# student_id_cols

I will create a function *create_student_id* in my *utils.py* script

In [14]:
student_id_cols["Student ID"] = df.apply(generate_student_id, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  student_id_cols["Student ID"] = df.apply(generate_student_id, axis=1)


In [15]:
student_id_cols.drop(["Full Name"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  student_id_cols.drop(["Full Name"], axis=1, inplace=True)


In [16]:
student_id_cols

Unnamed: 0,Date of Birth,Gender,Student ID
0,12/22/2011,Female,DK-201112-0
1,4/26/2012,Male,DC-201204-1
2,9/12/2012,Male,JC-201209-1
3,4/10/2012,Male,EC-201204-1
4,10/27/2014,Male,EC-201410-1
5,8/22/2015,Male,LM-201508-1
6,10/17/2014,Male,AL-201410-1
7,6/7/2013,Male,AJ-201306-1
8,4/3/2012,Male,RJL-201204-1
9,9/19/2017,Male,LVH-201709-1


In [17]:
df["Student ID"] = student_id_cols["Student ID"]
# df

We want Student ID in the 2nd column, lets change that

In [18]:
cols = list(df.columns)
cols.insert(1, cols.pop(cols.index('Student ID')))
df = df[cols]

Now lets remove the Full Name column as we do not need it anymore

In [19]:
df.drop("Full Name",axis=1, inplace=True)
df.drop("Additional Comments", axis=1, inplace=True)

In [20]:
df.to_csv('data/Anonymized_Data.csv', encoding="utf-8", index=False)