In [271]:
# Data Staging for graduates Data

In [272]:
## Importing Libraries and Reading the Data

In [273]:
import pandas as pd

df = pd.read_csv('data/raw_graduates.csv')

In [274]:
## Previewing the Dataset

In [275]:
df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Institution type,Program type,Credential type,Field of study,Gender,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1992,Canada,2016A000011124,"Total, institution type","Total, program type","Total, credential type","Total, field of study","Total, gender",Number,223,units,0,v79575621,1.1.1.1.1.1,235932.0,,,,0
1,1992,Canada,2016A000011124,"Total, institution type","Total, program type","Total, credential type","Total, field of study",Man,Number,223,units,0,v79575622,1.1.1.1.1.2,101733.0,,,,0
2,1992,Canada,2016A000011124,"Total, institution type","Total, program type","Total, credential type","Total, field of study",Woman,Number,223,units,0,v79575623,1.1.1.1.1.3,134196.0,,,,0
3,1992,Canada,2016A000011124,"Total, institution type","Total, program type","Total, credential type","Total, field of study",Gender unknown,Number,223,units,0,v79575624,1.1.1.1.1.4,,..,,,0
4,1992,Canada,2016A000011124,"Total, institution type","Total, program type","Total, credential type",Personal improvement and leisure [0],"Total, gender",Number,223,units,0,v79575625,1.1.1.1.2.1,12.0,,,,0


## Unique Values in Each Column

In [276]:
unique_values = df[['GEO', 'DGUID', 'UOM','Institution type', 'Program type', 'Credential type', 'Field of study', 'Gender', 'UOM_ID','SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR' , 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS']].apply(lambda x: x.unique())
unique_values

GEO                 [Canada, Newfoundland and Labrador, Prince Edw...
DGUID               [2016A000011124, 2016A000210, 2016A000211, 201...
UOM                                                          [Number]
Institution type       [Total, institution type, University, College]
Program type        [Total, program type, Basic education and skil...
Credential type     [Total, credential type, General Equivalency D...
Field of study      [Total, field of study, Personal improvement a...
Gender                    [Total, gender, Man, Woman, Gender unknown]
UOM_ID                                                          [223]
SCALAR_FACTOR                                                 [units]
SCALAR_ID                                                         [0]
VECTOR              [v79575621, v79575622, v79575623, v79575624, v...
COORDINATE          [1.1.1.1.1.1, 1.1.1.1.1.2, 1.1.1.1.1.3, 1.1.1....
VALUE               [235932.0, 101733.0, 134196.0, nan, 12.0, 0.0,...
STATUS              

## Dropping Unnecessary Columns

In [277]:
df = df.drop(columns=['GEO', 'DGUID', 'UOM', 'UOM_ID', 'SCALAR_FACTOR',  'SCALAR_ID', 'VECTOR', 'COORDINATE', 'STATUS', 'SYMBOL', 'SYMBOL', 'TERMINATED', 'DECIMALS'])

In [278]:
df.head()

Unnamed: 0,REF_DATE,Institution type,Program type,Credential type,Field of study,Gender,VALUE
0,1992,"Total, institution type","Total, program type","Total, credential type","Total, field of study","Total, gender",235932.0
1,1992,"Total, institution type","Total, program type","Total, credential type","Total, field of study",Man,101733.0
2,1992,"Total, institution type","Total, program type","Total, credential type","Total, field of study",Woman,134196.0
3,1992,"Total, institution type","Total, program type","Total, credential type","Total, field of study",Gender unknown,
4,1992,"Total, institution type","Total, program type","Total, credential type",Personal improvement and leisure [0],"Total, gender",12.0


## Renaming Columns for Clarity and Simplicity

In [279]:
df = df.rename(columns={'REF_DATE': 'Date', 'VALUE': 'Value', 'Institution type': 'Institution Type', 'Program type': 'Program Type', 'Credential type': 'Credential Type', 'Field of study': 'Field Of Study'})

In [280]:
df.dtypes

Date                  int64
Institution Type     object
Program Type         object
Credential Type      object
Field Of Study       object
Gender               object
Value               float64
dtype: object

## Remove any totals to be as detailed as possible (or unnecessary rows for consistency through all datasets)

In [281]:
# Function to check if a record contains any totals
def check_total(record):
    return record.split(', ')[0] == 'Total'

In [282]:
# Remove total gender and gender unknown
df = df[df['Gender'] != 'Gender unknown']
updated_gender_df = df.drop(df[df['Gender'].apply(check_total)].index)

# Remove total credential type
updated_credential_df = updated_gender_df.drop(updated_gender_df[updated_gender_df['Credential Type'].apply(check_total)].index)

# Remove total program type
updated_program_df = updated_credential_df.drop(updated_credential_df[updated_credential_df['Program Type'].apply(check_total)].index)

# Remove total institution type
updated_institution_df = updated_program_df.drop(updated_program_df[updated_program_df['Institution Type'].apply(check_total)].index)

# Remove total field of study
updated_field_df = updated_institution_df.drop(updated_institution_df[updated_institution_df['Field Of Study'].apply(check_total)].index)


updated_field_df.reset_index(drop=True, inplace=True)

updated_field_df

Unnamed: 0,Date,Institution Type,Program Type,Credential Type,Field Of Study,Gender,Value
0,1992,University,Basic education and skills program,General Equivalency Diploma/high school diploma,Other [12],Man,
1,1992,University,Basic education and skills program,General Equivalency Diploma/high school diploma,Other [12],Woman,
2,1992,University,Basic education and skills program,Certificate,Other [12],Man,
3,1992,University,Basic education and skills program,Certificate,Other [12],Woman,
4,1992,University,Basic education and skills program,Attestation and other short program credentials,Personal improvement and leisure [0],Man,
...,...,...,...,...,...,...,...
114374,2021,College,Undergraduate program,Diploma,"Agriculture, natural resources and conservatio...",Woman,0.0
114375,2021,College,Other programs,Certificate,Education [1],Man,3.0
114376,2021,College,Other programs,Certificate,Education [1],Woman,0.0
114377,2021,College,Other programs,Diploma,"Business, management and public administration...",Man,3.0


## Filtering Date to Start from 2000 and end at 2020

In [283]:
df = updated_field_df[updated_field_df['Date'] >= 2000]
df = df[df['Date'] <= 2020]
df.reset_index(drop=True, inplace=True)

In [284]:
df

Unnamed: 0,Date,Institution Type,Program Type,Credential Type,Field Of Study,Gender,Value
0,2000,University,Basic education and skills program,General Equivalency Diploma/high school diploma,Other [12],Man,
1,2000,University,Basic education and skills program,General Equivalency Diploma/high school diploma,Other [12],Woman,
2,2000,University,Basic education and skills program,Certificate,Other [12],Man,
3,2000,University,Basic education and skills program,Certificate,Other [12],Woman,
4,2000,University,Basic education and skills program,Attestation and other short program credentials,Personal improvement and leisure [0],Man,
...,...,...,...,...,...,...,...
79336,2020,College,Undergraduate program,Degree (includes applied degree),"Business, management and public administration...",Man,0.0
79337,2020,College,Undergraduate program,Degree (includes applied degree),"Business, management and public administration...",Woman,9.0
79338,2020,College,Undergraduate program,Degree (includes applied degree),Physical and life sciences and technologies [6],Man,3.0
79339,2020,College,Undergraduate program,Degree (includes applied degree),Physical and life sciences and technologies [6],Woman,6.0


## Checking for Missing Values

In [285]:
df['Date'].isnull().unique()

array([False])

In [286]:
df['Institution Type'].isnull().unique()

array([False])

In [287]:
df['Program Type'].isnull().unique()

array([False])

In [288]:
df['Credential Type'].isnull().unique()

array([False])

In [289]:
df['Field Of Study'].isnull().unique()

array([False])

In [290]:
df['Gender'].isnull().unique()

array([False])

In [291]:
df['Value'].isna().sum()

27706

In [292]:
df[df['Value'].isna()]

Unnamed: 0,Date,Institution Type,Program Type,Credential Type,Field Of Study,Gender,Value
0,2000,University,Basic education and skills program,General Equivalency Diploma/high school diploma,Other [12],Man,
1,2000,University,Basic education and skills program,General Equivalency Diploma/high school diploma,Other [12],Woman,
2,2000,University,Basic education and skills program,Certificate,Other [12],Man,
3,2000,University,Basic education and skills program,Certificate,Other [12],Woman,
4,2000,University,Basic education and skills program,Attestation and other short program credentials,Personal improvement and leisure [0],Man,
...,...,...,...,...,...,...,...
77328,2020,University,Graduate program (second cycle),Degree (includes applied degree),Humanities [3],Woman,
79067,2020,University,Graduate program (second cycle),Diploma,"Agriculture, natural resources and conservatio...",Man,
79068,2020,University,Graduate program (second cycle),Diploma,"Agriculture, natural resources and conservatio...",Woman,
79235,2020,College,Undergraduate program,Degree (includes applied degree),Physical and life sciences and technologies [6],Man,


## Rename gender values into 'Male' and 'Female' for consistency

In [293]:
df.loc[df["Gender"] == "Males", "Gender"] = 'Male'
df.loc[df["Gender"] == "Females", "Gender"] = 'Female'

In [294]:
(df['Value'].isna()).sum()

27706

In [295]:
df

Unnamed: 0,Date,Institution Type,Program Type,Credential Type,Field Of Study,Gender,Value
0,2000,University,Basic education and skills program,General Equivalency Diploma/high school diploma,Other [12],Man,
1,2000,University,Basic education and skills program,General Equivalency Diploma/high school diploma,Other [12],Woman,
2,2000,University,Basic education and skills program,Certificate,Other [12],Man,
3,2000,University,Basic education and skills program,Certificate,Other [12],Woman,
4,2000,University,Basic education and skills program,Attestation and other short program credentials,Personal improvement and leisure [0],Man,
...,...,...,...,...,...,...,...
79336,2020,College,Undergraduate program,Degree (includes applied degree),"Business, management and public administration...",Man,0.0
79337,2020,College,Undergraduate program,Degree (includes applied degree),"Business, management and public administration...",Woman,9.0
79338,2020,College,Undergraduate program,Degree (includes applied degree),Physical and life sciences and technologies [6],Man,3.0
79339,2020,College,Undergraduate program,Degree (includes applied degree),Physical and life sciences and technologies [6],Woman,6.0


In [296]:
df['Field Of Study'].unique()

array(['Other  [12]', 'Personal improvement and leisure [0]',
       'Visual and performing arts, and communications technologies [2]',
       'Humanities [3]', 'Social and behavioural sciences and law [4]',
       'Business, management and public administration [5]',
       'Agriculture, natural resources and conservation [9]',
       'Health and related fields [10]',
       'Personal, protective and transportation services [11]',
       'Education [1]', 'Physical and life sciences and technologies [6]',
       'Mathematics, computer and information sciences [7]',
       'Architecture, engineering and related technologies [8]',
       'Unclassified'], dtype=object)

In [297]:
df['Institution Type'].unique()

array(['University', 'College'], dtype=object)

In [298]:
df['Program Type'].unique()

array(['Basic education and skills program',
       'Career, technical or professional training program',
       'Post career, technical or professional training program',
       'Pre-university program', 'Undergraduate qualifying program',
       'Undergraduate program', 'Post-baccalaureate non-graduate program',
       'Health-related residency program',
       'Graduate program (second cycle)',
       'Graduate program (third cycle)',
       'Graduate program (above the third cycle)', 'Other programs',
       'Qualifying program for career, technical or pre-university',
       'Graduate qualifying program (second cycle)'], dtype=object)

In [299]:
df['Credential Type'].unique()

array(['General Equivalency Diploma/high school diploma', 'Certificate',
       'Attestation and other short program credentials',
       'Other type of credential associated with a program', 'Diploma',
       'Degree (includes applied degree)', 'Associate degree'],
      dtype=object)

## Aggregated over Field Of Study, Date, and Gender (Which eliminates NaN values)

In [304]:
grouped = df.groupby(['Date', 'Field Of Study', 'Gender'])['Value'].sum().reset_index()

In [305]:
grouped

Unnamed: 0,Date,Field Of Study,Gender,Value
0,2000,"Agriculture, natural resources and conservatio...",Man,7818.0
1,2000,"Agriculture, natural resources and conservatio...",Woman,6390.0
2,2000,"Architecture, engineering and related technolo...",Man,52596.0
3,2000,"Architecture, engineering and related technolo...",Woman,12987.0
4,2000,"Business, management and public administration...",Man,49608.0
...,...,...,...,...
583,2020,Social and behavioural sciences and law [4],Woman,118131.0
584,2020,Unclassified,Man,5154.0
585,2020,Unclassified,Woman,3426.0
586,2020,"Visual and performing arts, and communications...",Man,16872.0


## Export the updated dataframe to CSV

In [306]:
df.to_csv('data/cleaned_graduates.csv', index=False)