In [1]:
import pandas as pd

In [2]:
from sqlalchemy import create_engine
from config import database
from config import username
from config import password
from config import hostname

In [3]:
# Read in csv
table = "finaldata/Table_1.2.csv"
table_df = pd.read_csv(table)
table_df.head()

Unnamed: 0,Chapter,Group,Sub-group,Sub-group disaggregation,Topic,Topic disaggregation,Current/Trend,Year,Numerator,Denominator,Per cent
0,Overview and demographics,Maternal age,Maternal age,20,Patient election status,Public,Current,2018,5557,203484,2.7
1,Overview and demographics,Maternal age,Maternal age,20,Patient election status,Private,Current,2018,191,83317,0.2
2,Overview and demographics,Maternal age,Maternal age,20,Patient election status,Not stated,Current,2018,13,788,1.6
3,Overview and demographics,Maternal age,Maternal age,24,Patient election status,Public,Current,2018,29944,203484,14.7
4,Overview and demographics,Maternal age,Maternal age,24,Patient election status,Private,Current,2018,2368,83317,2.8


In [4]:
# View all columns for df to decide what to keep
table_df.columns

Index(['Chapter', 'Group', 'Sub-group', 'Sub-group disaggregation', 'Topic',
       'Topic disaggregation', 'Current/Trend', 'Year', 'Numerator',
       'Denominator', 'Per cent'],
      dtype='object')

In [5]:
# Print 
table_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1559 entries, 0 to 1558
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Chapter                   1559 non-null   object 
 1   Group                     1559 non-null   object 
 2   Sub-group                 1559 non-null   object 
 3   Sub-group disaggregation  1559 non-null   int64  
 4   Topic                     1559 non-null   object 
 5   Topic disaggregation      1559 non-null   object 
 6   Current/Trend             1559 non-null   object 
 7   Year                      1559 non-null   int64  
 8   Numerator                 1559 non-null   int64  
 9   Denominator               1559 non-null   int64  
 10  Per cent                  1559 non-null   float64
dtypes: float64(1), int64(4), object(6)
memory usage: 134.1+ KB


In [6]:
# Remove any unnecessary columns from df
columns_list = ['Sub-group disaggregation', 'Topic','Topic disaggregation', 'Year', 'Numerator','Denominator', 'Per cent']


mothers = table_df[columns_list].copy()
mothers


Unnamed: 0,Sub-group disaggregation,Topic,Topic disaggregation,Year,Numerator,Denominator,Per cent
0,20,Patient election status,Public,2018,5557,203484,2.7
1,20,Patient election status,Private,2018,191,83317,0.2
2,20,Patient election status,Not stated,2018,13,788,1.6
3,24,Patient election status,Public,2018,29944,203484,14.7
4,24,Patient election status,Private,2018,2368,83317,2.8
...,...,...,...,...,...,...,...
1554,0,State and territory of birth,WA,2018,0,32974,0.0
1555,0,State and territory of birth,SA,2018,0,19044,0.0
1556,0,State and territory of birth,TAS,2018,0,5436,0.0
1557,0,State and territory of birth,ACT,2018,0,5929,0.0


In [7]:
# Rename columns

mothers_df = mothers.rename(columns={"Sub-group disaggregation" : "age_group", "Topic" : "topic", "Topic disaggregation" : "topic_disaggregation", "Current/Trend" : "current/trend", "Year" : "year", "Numerator" : "count", "Denominator" : "year_total", "Per cent" : "percent_total"})

mothers_df.head(20)

Unnamed: 0,age_group,topic,topic_disaggregation,year,count,year_total,percent_total
0,20,Patient election status,Public,2018,5557,203484,2.7
1,20,Patient election status,Private,2018,191,83317,0.2
2,20,Patient election status,Not stated,2018,13,788,1.6
3,24,Patient election status,Public,2018,29944,203484,14.7
4,24,Patient election status,Private,2018,2368,83317,2.8
5,24,Patient election status,Not stated,2018,78,788,9.9
6,29,Patient election status,Public,2018,59992,203484,29.5
7,29,Patient election status,Private,2018,15739,83317,18.9
8,29,Patient election status,Not stated,2018,207,788,26.3
9,34,Patient election status,Public,2018,67181,203484,33.0


In [8]:
# Check and change data types if required
mothers_df.dtypes

age_group                 int64
topic                    object
topic_disaggregation     object
year                      int64
count                     int64
year_total                int64
percent_total           float64
dtype: object

In [9]:
# Count the total number of woman who gave birth between 2010 and 2018
total_births = mothers_df["count"].sum()
total_births_df = pd.DataFrame ({"Number of Woman who Gave Birth":[total_births]})

# Display the data frame
total_births_df

Unnamed: 0,Number of Woman who Gave Birth
0,14938091


In [10]:
# Group data by age
age = mothers_df.groupby("age_group")

# Find the total count of each age
age_count = age["count"].sum()

# Find the percentage of each age
age_percent = age_count/total_births*100

# # Create a new dataframe to hold calculations for percentage and count of each gender
age_df = pd.DataFrame({"Number of Woman who Gave Birth": age_count, "Percentage of Each Age Group 2010 - 2018": age_percent})

# # Sort values in descedning order 
age_df.sort_values(by=["Number of Woman who Gave Birth"], inplace = True, ascending = False)

# # Convert Percentage of births to float, then format
age_df["Percentage of Each Age Group 2010 - 2018"] = age_df["Percentage of Each Age Group 2010 - 2018"].astype(float).map("{:,.2f}%".format)

# Add index name and sort by Age
age_df.index.name = "age_group"
age_df.sort_values(by=["age_group"], inplace = True, ascending = True)

# # Display the data frame
age_df

Unnamed: 0_level_0,Number of Woman who Gave Birth,Percentage of Each Age Group 2010 - 2018
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2140,0.01%
20,431622,2.89%
24,1897586,12.70%
29,4072689,27.26%
34,5106212,34.18%
39,2791813,18.69%
40,636029,4.26%


In [11]:
# Add Data in to show per year

# Group data by age
age = mothers_df.groupby(["year", "age_group"])

# Find the total count of each age
age_count = age["count"].sum()
age_count
# Find the total count of births per year
year = mothers_df.groupby("year")
year_count = year["count"].sum()
# year_count

# Find the percentage of each age
age_percent = age_count/year_count*100

# # # Create a new dataframe to hold calculations for percentage and count of each gender
age_df = pd.DataFrame({"Number of Woman who Gave Birth": age_count, "Percentage per Year": age_percent})

# # # Sort values in descedning order 
age_df.sort_values(by=["Number of Woman who Gave Birth"], inplace = True, ascending = False)

# # Convert Percentage of births to float, then format
age_df["Percentage per Year"] = age_df["Percentage per Year"].astype(float).map("{:,.2f}%".format)

# Add index name and sort by year then age_group
age_df.sort_values(by=["year", "age_group"], inplace = True, ascending = True)

# # Display the data frame
age_df.tail(60)

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of Woman who Gave Birth,Percentage per Year
year,age_group,Unnamed: 2_level_1,Unnamed: 3_level_1
2010,29,402192,27.57%
2010,34,457336,31.35%
2010,39,276113,18.93%
2010,40,59864,4.10%
2011,0,537,0.04%
2011,20,54584,3.72%
2011,24,203216,13.84%
2011,29,410497,27.96%
2011,34,466796,31.79%
2011,39,269533,18.36%


In [12]:
# Average ages per year - REMOVE 0 Values first - 0 was not known information

# Drop any 0's in the age_group column
droped = mothers_df.loc[mothers_df["age_group"]!= "0"]

# Group data by year
year = droped.groupby(["year"])

# Find the average age
age_count = year["age_group"].mean()

# Create a new dataframe to hold calculations for percentage and count of each gender
avg_age = pd.DataFrame({"Average Age": age_count})

# Add index name and sort by year then age_group
avg_age.sort_values(by=["year"], inplace = True, ascending = True)

# # Display the data frame
avg_age

Unnamed: 0_level_0,Average Age
year,Unnamed: 1_level_1
2010,26.571429
2011,27.013333
2012,26.993631
2013,26.484076
2014,26.484076
2015,26.526316
2016,26.484076
2017,26.571429
2018,26.484076


In [13]:
# Total Births Per Year 

# Group data by year
year = mothers_df.groupby("year")

# Find the total sum of births for each year
totalperyear = year["count"].sum()

# Find the percentage of each age
total_percent = totalperyear/total_births*100

# Find the average age
age_count = year["age_group"].mean()

# # Create a new dataframe to hold calculations for percentage and count of each gender
year_df = pd.DataFrame({"Number of Woman who Gave Birth": totalperyear, "Percentage of Each Year": total_percent, "Average Age of Mothers at Birth" : age_count})

# # Sort values in descedning order 
year_df.sort_values(by=["Number of Woman who Gave Birth"], inplace = True, ascending = False)

# # Convert Percentage of births each year to float, and format. Format average age
year_df["Percentage of Each Year"] = year_df["Percentage of Each Year"].astype(float).map("{:,.2f}%".format)
year_df["Average Age of Mothers at Birth"] = year_df["Average Age of Mothers at Birth"].map("{:,.2f}".format)

# Add index name and sort by year then age_group
year_df.sort_values(by=["year"], inplace = True, ascending = True)

# # Display the data frame
year_df

Unnamed: 0_level_0,Number of Woman who Gave Birth,Percentage of Each Year,Average Age of Mothers at Birth
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010,1458840,9.77%,26.57
2011,1468363,9.83%,27.01
2012,1518763,10.17%,26.99
2013,1506039,10.08%,26.48
2014,1522611,10.19%,26.48
2015,1504623,10.07%,26.53
2016,1534145,10.27%,26.48
2017,1485389,9.94%,26.57
2018,2939318,19.68%,26.48


In [14]:
# Drop any 0's in the age_group column
droped = mothers_df.loc[mothers_df["age_group"]!= "0"]

# Group data by year
year = droped.groupby(["year"])

# Find the average age
age_count = year["age_group"].mean()

# Create a new dataframe to hold calculations for percentage and count of each gender
avg_age = pd.DataFrame({"Average Age": age_count})

# Add index name and sort by year then age_group
avg_age.sort_values(by=["year"], inplace = True, ascending = True)

# # Display the data frame
avg_age

Unnamed: 0_level_0,Average Age
year,Unnamed: 1_level_1
2010,26.571429
2011,27.013333
2012,26.993631
2013,26.484076
2014,26.484076
2015,26.526316
2016,26.484076
2017,26.571429
2018,26.484076


In [15]:
# Count the total number of woman who gave birth between 2010 and 2018 in either private or public hospitals
dropped = mothers_df.loc[mothers_df["topic"]== "Hospital sector"]
dropped = dropped.loc[dropped["topic_disaggregation"]!= "Not stated"]

# Group by private or public
birth_locations = dropped.groupby(["topic_disaggregation"])

birth_locations_count = birth_locations["count"].sum()

birth_locations_df = pd.DataFrame ({"Number of Woman Who Gave Birth":birth_locations_count})

# Display the data frame
birth_locations_df

Unnamed: 0_level_0,Number of Woman Who Gave Birth
topic_disaggregation,Unnamed: 1_level_1
Private,800544
Public,2134144


In [16]:
# Count the total number of woman who gave birth between 2010 and 2018 each year in either private or public hospitals
dropped = mothers_df.loc[mothers_df["topic"]== "Hospital sector"]
dropped = dropped.loc[dropped["topic_disaggregation"]!= "Not stated"]

# Group by private or public
birth_locations = dropped.groupby(["year", "topic_disaggregation"])

birth_locations_count = birth_locations["count"].sum()

birth_locations_df = pd.DataFrame ({"Number of Woman Who Gave Birth":birth_locations_count})

# Display the data frame
birth_locations_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of Woman Who Gave Birth
year,topic_disaggregation,Unnamed: 2_level_1
2010,Private,85404
2010,Public,200833
2011,Private,83699
2011,Public,204481
2012,Private,86471
2012,Public,211612
2013,Private,83516
2013,Public,213095
2014,Private,82366
2014,Public,217916


In [17]:
# Count the total number of indigenous woman who gave birth between 2010 and 2018
drop = mothers_df.loc[mothers_df["topic"]== "Indigenous status (mother)"]
# dropped = dropped.loc[dropped["topic_disaggregation"]!= "Not stated"]

# Group by Indigenous Status
indigenous = drop.groupby(["topic_disaggregation"])

indigenous_status = indigenous["count"].sum()

indigenous_df = pd.DataFrame ({"Number of Indigenous Woman Who Gave Birth":indigenous_status})

# Display the data frame
indigenous_df

Unnamed: 0_level_0,Number of Indigenous Woman Who Gave Birth
topic_disaggregation,Unnamed: 1_level_1
Indigenous,128386
Non-Indigenous,2890536
Not stated,6937


In [18]:
# Count the total number of indigenous woman who gave birth between 2010 and 2018 per year
drop = mothers_df.loc[mothers_df["topic"]== "Indigenous status (mother)"]

# Group by year and Indigenous Status
indigenous_year = drop.groupby(["year", "topic_disaggregation"])

indigenous_status_year = indigenous_year["count"].sum()

indigenous_year_df = pd.DataFrame ({"Number of Indigenous Woman Who Gave Birth":indigenous_status_year})

# Display the data frame
indigenous_year_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of Indigenous Woman Who Gave Birth
year,topic_disaggregation,Unnamed: 2_level_1
2010,Indigenous,11488
2010,Non-Indigenous,283172
2010,Not stated,795
2011,Indigenous,11731
2011,Non-Indigenous,284909
2011,Not stated,703
2012,Indigenous,12286
2012,Non-Indigenous,294619
2012,Not stated,665
2013,Indigenous,12380


In [19]:
# Count the total number of woman who gave birth in each state between 2010 and 2018
statedropped = mothers_df.loc[mothers_df["topic"]== "State and territory of birth"]
# dropped = dropped.loc[dropped["topic_disaggregation"]!= "Not stated"]

# Group by state
birth_state = statedropped.groupby(["topic_disaggregation"])

birth_state_count = birth_state["count"].sum()

birth_state_df = pd.DataFrame ({"Number of Woman Who Gave Birth":birth_state_count})

# Display the data frame
birth_state_df

Unnamed: 0_level_0,Number of Woman Who Gave Birth
topic_disaggregation,Unnamed: 1_level_1
ACT,60761
NSW,955446
NT,38800
QLD,611176
SA,197279
TAS,57653
VIC,770304
WA,334440


In [20]:
# Count the total number of woman who gave birth in each state between 2010 and 2018
statedropped = mothers_df.loc[mothers_df["topic"]== "State and territory of birth"]
# dropped = dropped.loc[dropped["topic_disaggregation"]!= "Not stated"]

# Group by year and state
birth_state = statedropped.groupby(["year", "topic_disaggregation"])

birth_state_count = birth_state["count"].sum()

birth_state_df = pd.DataFrame ({"Number of Woman Who Gave Birth":birth_state_count})

# Display the data frame
birth_state_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Number of Woman Who Gave Birth
year,topic_disaggregation,Unnamed: 2_level_1
2010,ACT,5826
2010,NSW,94993
2010,NT,3830
2010,QLD,61020
2010,SA,19666
...,...,...
2018,QLD,119248
2018,SA,38088
2018,TAS,10872
2018,VIC,155466


In [21]:
# Read in csv for Mothers Birth Country
mothersbirth = "finaldata/Table_4.1.csv"
mothersbirth_df = pd.read_csv(mothersbirth)
mothersbirth_df.head()

Unnamed: 0,babies_term,topic,topic_disaggregation,year,count,year_total,percent_total
0,Pre-term,Admission to SCN/NICU,Admitted,2018,11839,31838,37.2
1,Pre-term,Admission to SCN/NICU,Not admitted,2018,2945,139691,2.1
2,Pre-term,Admission to SCN/NICU,Not stated,2018,33,1227,2.7
3,Term,Admission to SCN/NICU,Admitted,2018,19917,31838,62.6
4,Term,Admission to SCN/NICU,Not admitted,2018,136360,139691,97.6


In [22]:
# View all columns for df to decide what to keep
mothersbirth_df.columns

Index(['babies_term', 'topic', 'topic_disaggregation', 'year', 'count',
       'year_total', 'percent_total'],
      dtype='object')

In [23]:
# Print datatypes
mothersbirth_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2646 entries, 0 to 2645
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   babies_term           2646 non-null   object 
 1   topic                 2646 non-null   object 
 2   topic_disaggregation  2646 non-null   object 
 3   year                  2646 non-null   int64  
 4   count                 2646 non-null   int64  
 5   year_total            2646 non-null   int64  
 6   percent_total         2646 non-null   float64
dtypes: float64(1), int64(3), object(3)
memory usage: 144.8+ KB


In [24]:
# Drop any values's in the topic column that don't relate to the mothers birth country
dropped = mothersbirth_df.loc[mothersbirth_df["topic"]== "Country of birth (mother)"]

# Group data by maternal country of birth
countryofbirth = dropped.groupby("topic_disaggregation")

# Find the total count of each mother per country
countofmothers = countryofbirth["count"].sum()

# Create a new dataframe to hold calculations for percentage and count of each gender
countofmothers_df = pd.DataFrame({"number_women_gave_birth": countofmothers})


# Add index name and sort by Age
countofmothers_df.index.name = "maternal_birth_country"


# Display the data frame
countofmothers_df

Unnamed: 0_level_0,number_women_gave_birth
maternal_birth_country,Unnamed: 1_level_1
Born in Australia,2061860
Born overseas,994962
Not stated,15132


In [25]:
# Drop any values's in the topic column that don't relate to the mothers birth country
dropped = mothersbirth_df.loc[mothersbirth_df["topic"]== "Country of birth (mother)"]

# Group data by maternal country of birth and year
countryofbirth = dropped.groupby(["year", "topic_disaggregation"])

# Find the total count of each mother per country
countofmothers = countryofbirth["count"].sum()

# Create a new dataframe to hold calculations for percentage and count of each gender
countofmothers_df = pd.DataFrame({"number_women_gave_birth": countofmothers})

# Add index name and sort by Age
countofmothers_df.index.name = "maternal_birth_country"


# Display the data frame
countofmothers_df

Unnamed: 0_level_0,Unnamed: 1_level_0,number_women_gave_birth
year,topic_disaggregation,Unnamed: 2_level_1
2010,Born in Australia,214457
2010,Born overseas,83996
2010,Not stated,1762
2011,Born in Australia,212860
2011,Born overseas,87697
2011,Not stated,1468
2012,Born in Australia,215009
2012,Born overseas,95541
2012,Not stated,1701
2013,Born in Australia,211905


In [26]:
# Read in csv for birth location
birthlocation = "finaldata/Table_3.1.csv"
birthlocation_df = pd.read_csv(birthlocation)
birthlocation_df.head()

Unnamed: 0,place_of_birth,topic,topic_disaggregation,year,count,year_total,percent_total
0,Hospital,Country of birth (mother),Born in Australia,2018,184638,192172,96.1
1,Hospital,Country of birth (mother),Born overseas,2018,102141,105383,96.9
2,Hospital,Country of birth (mother),Not stated,2018,810,1075,75.3
3,Birth centre,Country of birth (mother),Born in Australia,2018,5631,192172,2.9
4,Birth centre,Country of birth (mother),Born overseas,2018,2552,105383,2.4


In [27]:
# Print datatypes
birthlocation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1730 entries, 0 to 1729
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   place_of_birth        1730 non-null   object 
 1   topic                 1730 non-null   object 
 2   topic_disaggregation  1730 non-null   object 
 3   year                  1730 non-null   int64  
 4   count                 1730 non-null   int64  
 5   year_total            1730 non-null   int64  
 6   percent_total         1730 non-null   float64
dtypes: float64(1), int64(3), object(3)
memory usage: 94.7+ KB


In [28]:
# Isolate one set of data, as data is replicated across multiple topics 
placeofbirthfilter = birthlocation_df.loc[birthlocation_df["topic"]== "Country of birth (mother)"]

# Find the total count of each mother that gave birth in different locations
totalcount = placeofbirthfilter["count"].sum()
totalcount

3025859

In [29]:
# Group data by place of birth
placeofbirth = placeofbirthfilter.groupby("place_of_birth")

# Find the total count of each mother per place of birth
countofbirthplace = placeofbirth["count"].sum()

# # Create a new dataframe to hold the number of women who gave birth per place of birth
countofbirthplace_df = pd.DataFrame({"number_of_women_per_birthplace": countofbirthplace})

# Display the data frame
countofbirthplace_df

Unnamed: 0_level_0,number_of_women_per_birthplace
place_of_birth,Unnamed: 1_level_1
Birth centre,66344
Home,10071
Hospital,2934691
Not stated,751
Other,14002


In [30]:
# Group data by year
yearfilter = placeofbirthfilter.groupby(["year"])

# Find the total count of each mother that gave birth in different locations each year
totalyearcount = yearfilter["count"].sum()

# Create a new dataframe to hold the number of women who gave birth per place of birth
countofyear_df = pd.DataFrame({"number_of_women_per_year": totalyearcount})
countofyear_df

Unnamed: 0_level_0,number_of_women_per_year
year,Unnamed: 1_level_1
2010,295455
2011,297343
2012,307570
2013,304777
2014,307844
2015,304268
2016,310247
2017,301095
2018,597260


In [31]:
# Group data by place of birth and year
placeofbirth = placeofbirthfilter.groupby(["year", "place_of_birth"])

# Find the total count of each mother per place of birth
countofbirthplace = placeofbirth["count"].sum()

# # Create a new dataframe to hold the number of women who gave birth per place of birth
countofbirthplace_df = pd.DataFrame({"number_of_women_per_birthplace": countofbirthplace})

# Display the data frame
countofbirthplace_df

Unnamed: 0_level_0,Unnamed: 1_level_0,number_of_women_per_birthplace
year,place_of_birth,Unnamed: 2_level_1
2010,Birth centre,6361
2010,Home,1281
2010,Hospital,286238
2010,Not stated,288
2010,Other,1287
2011,Birth centre,6614
2011,Home,1223
2011,Hospital,288182
2011,Not stated,202
2011,Other,1122


In [32]:
# Connect to postgres database
connection_string = (f'{username}:{password}@{hostname}:5432/mothersandbabies')
engine = create_engine(f'postgresql://{connection_string}')

In [33]:
# Check available tables in postgres database
engine.table_names()

['mothers', 'mothersbirthcountry', 'birthlocation']

In [34]:
# Load panda's dataframe to postgres sql table
mothers_df.to_sql(name='mothers', con=engine, if_exists='append', index=True)
mothersbirth_df.to_sql(name='mothersbirthcountry', con=engine, if_exists='append', index=True)
birthlocation_df.to_sql(name='birthlocation', con=engine, if_exists='append', index=True)