In [130]:
# Data Staging for Graduates Data

In [131]:
## Importing Libraries and Reading the Data

In [132]:
import pandas as pd

# Load the raw graduates extract into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
df = pd.read_csv('data/raw_graduates.csv')

In [133]:
## Previewing the Dataset

In [134]:
# Show the first few rows to see which columns the raw file contains.
df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Institution type,Program type,Credential type,Field of study,Gender,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1992,Canada,2016A000011124,"Total, institution type","Total, program type","Total, credential type","Total, field of study","Total, gender",Number,223,units,0,v79575621,1.1.1.1.1.1,235932.0,,,,0
1,1992,Canada,2016A000011124,"Total, institution type","Total, program type","Total, credential type","Total, field of study",Man,Number,223,units,0,v79575622,1.1.1.1.1.2,101733.0,,,,0
2,1992,Canada,2016A000011124,"Total, institution type","Total, program type","Total, credential type","Total, field of study",Woman,Number,223,units,0,v79575623,1.1.1.1.1.3,134196.0,,,,0
3,1992,Canada,2016A000011124,"Total, institution type","Total, program type","Total, credential type","Total, field of study",Gender unknown,Number,223,units,0,v79575624,1.1.1.1.1.4,,..,,,0
4,1992,Canada,2016A000011124,"Total, institution type","Total, program type","Total, credential type",Personal improvement and leisure [0],"Total, gender",Number,223,units,0,v79575625,1.1.1.1.2.1,12.0,,,,0


## Unique Values in Each Column

In [135]:
# Columns whose distinct values are listed below (REF_DATE is not included).
inspected_columns = [
    'GEO', 'DGUID', 'UOM',
    'Institution type', 'Program type', 'Credential type',
    'Field of study', 'Gender',
    'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE',
    'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS',
]

# One array of distinct values per column, in the order the columns are listed.
unique_values = df[inspected_columns].apply(lambda column: column.unique())
unique_values

GEO                 [Canada, Newfoundland and Labrador, Prince Edw...
DGUID               [2016A000011124, 2016A000210, 2016A000211, 201...
UOM                                                          [Number]
Institution type       [Total, institution type, University, College]
Program type        [Total, program type, Basic education and skil...
Credential type     [Total, credential type, General Equivalency D...
Field of study      [Total, field of study, Personal improvement a...
Gender                    [Total, gender, Man, Woman, Gender unknown]
UOM_ID                                                          [223]
SCALAR_FACTOR                                                 [units]
SCALAR_ID                                                         [0]
VECTOR              [v79575621, v79575622, v79575623, v79575624, v...
COORDINATE          [1.1.1.1.1.1, 1.1.1.1.1.2, 1.1.1.1.1.3, 1.1.1....
VALUE               [235932.0, 101733.0, 134196.0, nan, 12.0, 0.0,...
STATUS              

## Dropping Unnecessary Columns

In [136]:
# Columns removed from the staged dataset; what remains is REF_DATE, the five
# category columns and VALUE. Each label appears exactly once.
columns_to_drop = [
    'GEO', 'DGUID',
    'UOM', 'UOM_ID',
    'SCALAR_FACTOR', 'SCALAR_ID',
    'VECTOR', 'COORDINATE',
    'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS',
]

# NOTE: `df` is rebound here, so re-running this cell without reloading the data
# raises KeyError (the labels are already gone).
df = df.drop(columns=columns_to_drop)

In [137]:
# Preview the frame again to confirm which columns are left after the drop.
df.head()

Unnamed: 0,REF_DATE,Institution type,Program type,Credential type,Field of study,Gender,VALUE
0,1992,"Total, institution type","Total, program type","Total, credential type","Total, field of study","Total, gender",235932.0
1,1992,"Total, institution type","Total, program type","Total, credential type","Total, field of study",Man,101733.0
2,1992,"Total, institution type","Total, program type","Total, credential type","Total, field of study",Woman,134196.0
3,1992,"Total, institution type","Total, program type","Total, credential type","Total, field of study",Gender unknown,
4,1992,"Total, institution type","Total, program type","Total, credential type",Personal improvement and leisure [0],"Total, gender",12.0


## Renaming Columns for Clarity and Simplicity

In [138]:
# Rename REF_DATE to the shorter 'Date'; all other column names are kept as-is.
df = df.rename(columns={'REF_DATE': 'Date'})

In [139]:
# Inspect the dtype of each remaining column.
df.dtypes

Date                  int64
Institution type     object
Program type         object
Credential type      object
Field of study       object
Gender               object
VALUE               float64
dtype: object

## Removing Aggregate "Total" Rows to Keep Only the Most Detailed Records

In [140]:
def check_total(record):
    """Return True when ``record`` is an aggregate "Total" label.

    A label counts as a total when the text before its first ``', '`` is
    exactly ``'Total'`` (e.g. ``'Total, gender'``), or when the whole label
    is ``'Total'``.

    Parameters
    ----------
    record : object
        A single cell value, normally a string.

    Returns
    -------
    bool
        True for "Total" labels, False for anything else, including
        non-string values.
    """
    # Non-string cells (e.g. the float NaN pandas uses for missing values, or
    # None) have no .split(); they cannot be a "Total" label, so report False
    # instead of raising AttributeError.
    if not isinstance(record, str):
        return False
    return record.split(', ')[0] == 'Total'

In [141]:
# Filter out the aggregate "Total, ..." rows one dimension at a time. Each step
# keeps the rows whose label is NOT flagged by check_total and feeds the next step.

# Gender
updated_gender_df = df.loc[
    lambda frame: ~frame['Gender'].apply(check_total)
]

# Credential type
updated_credential_df = updated_gender_df.loc[
    lambda frame: ~frame['Credential type'].apply(check_total)
]

# Program type
updated_program_df = updated_credential_df.loc[
    lambda frame: ~frame['Program type'].apply(check_total)
]

# Institution type
updated_institution_df = updated_program_df.loc[
    lambda frame: ~frame['Institution type'].apply(check_total)
]

# Field of study, then renumber the surviving rows from 0
updated_field_df = (
    updated_institution_df
    .loc[lambda frame: ~frame['Field of study'].apply(check_total)]
    .reset_index(drop=True)
)

updated_field_df

Unnamed: 0,Date,Institution type,Program type,Credential type,Field of study,Gender,VALUE
0,1992,University,Basic education and skills program,General Equivalency Diploma/high school diploma,Other [12],Man,
1,1992,University,Basic education and skills program,General Equivalency Diploma/high school diploma,Other [12],Woman,
2,1992,University,Basic education and skills program,Certificate,Other [12],Man,
3,1992,University,Basic education and skills program,Certificate,Other [12],Woman,
4,1992,University,Basic education and skills program,Attestation and other short program credentials,Personal improvement and leisure [0],Man,
...,...,...,...,...,...,...,...
130103,2021,College,Undergraduate program,Diploma,"Agriculture, natural resources and conservatio...",Woman,0.0
130104,2021,College,Other programs,Certificate,Education [1],Man,3.0
130105,2021,College,Other programs,Certificate,Education [1],Woman,0.0
130106,2021,College,Other programs,Diploma,"Business, management and public administration...",Man,3.0


## Filtering Date to Start from 2000

In [142]:
# Keep only the rows whose Date is 2000 or later, then renumber them from 0.
from_2000_onward = updated_field_df['Date'] >= 2000
df = updated_field_df.loc[from_2000_onward].reset_index(drop=True)

In [143]:
# Display the filtered frame (the rendered output elides the middle rows).
df

Unnamed: 0,Date,Institution type,Program type,Credential type,Field of study,Gender,VALUE
0,2000,University,Basic education and skills program,General Equivalency Diploma/high school diploma,Other [12],Man,
1,2000,University,Basic education and skills program,General Equivalency Diploma/high school diploma,Other [12],Woman,
2,2000,University,Basic education and skills program,Certificate,Other [12],Man,
3,2000,University,Basic education and skills program,Certificate,Other [12],Woman,
4,2000,University,Basic education and skills program,Attestation and other short program credentials,Personal improvement and leisure [0],Man,
...,...,...,...,...,...,...,...
93473,2021,College,Undergraduate program,Diploma,"Agriculture, natural resources and conservatio...",Woman,0.0
93474,2021,College,Other programs,Certificate,Education [1],Man,3.0
93475,2021,College,Other programs,Certificate,Education [1],Woman,0.0
93476,2021,College,Other programs,Diploma,"Business, management and public administration...",Man,3.0


## Checking for Missing Values

In [144]:
# A result containing only False means 'Date' has no missing entries.
df['Date'].isna().unique()

array([False])

In [145]:
# A result containing only False means 'Institution type' has no missing entries.
df['Institution type'].isna().unique()

array([False])

In [146]:
# A result containing only False means 'Program type' has no missing entries.
df['Program type'].isna().unique()

array([False])

In [147]:
# A result containing only False means 'Credential type' has no missing entries.
df['Credential type'].isna().unique()

array([False])

In [148]:
# A result containing only False means 'Field of study' has no missing entries.
df['Field of study'].isna().unique()

array([False])

In [149]:
# A result containing only False means 'Gender' has no missing entries.
df['Gender'].isna().unique()

array([False])

In [160]:
# Count how many VALUE entries are missing (NaN) in the filtered frame.
df['VALUE'].isna().sum()

34452

## 

## Exporting the Updated DataFrame to CSV

In [161]:
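# Export is currently commented out. The cleaned frame produced by the cells above
# is `df` (no `updated_df` is assigned in them), so use that name when re-enabling:
# df.to_csv('graduates.csv', index=False)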