In [1]:
import pandas as pd

In [2]:
from sqlalchemy import create_engine
from config import database
from config import username
from config import password
from config import hostname

In [3]:
# Read in csv
babies = "finaldata/Table_4.1.csv"
babies_df = pd.read_csv(babies)
babies_df.head()

Unnamed: 0,babies_term,topic,topic_disaggregation,year,count,year_total,percent_total
0,Pre-term,Admission to SCN/NICU,Admitted,2018,11839,31838,37.2
1,Pre-term,Admission to SCN/NICU,Not admitted,2018,2945,139691,2.1
2,Pre-term,Admission to SCN/NICU,Not stated,2018,33,1227,2.7
3,Term,Admission to SCN/NICU,Admitted,2018,19917,31838,62.6
4,Term,Admission to SCN/NICU,Not admitted,2018,136360,139691,97.6


In [4]:
# View all columns for df to decide what to keep
babies_df.columns

Index(['babies_term', 'topic', 'topic_disaggregation', 'year', 'count',
       'year_total', 'percent_total'],
      dtype='object')

In [5]:
# Print 
babies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2646 entries, 0 to 2645
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   babies_term           2646 non-null   object 
 1   topic                 2646 non-null   object 
 2   topic_disaggregation  2646 non-null   object 
 3   year                  2646 non-null   int64  
 4   count                 2646 non-null   int64  
 5   year_total            2646 non-null   int64  
 6   percent_total         2646 non-null   float64
dtypes: float64(1), int64(3), object(3)
memory usage: 144.8+ KB


In [6]:
# Check and change data types if required
babies_df.dtypes

babies_term              object
topic                    object
topic_disaggregation     object
year                      int64
count                     int64
year_total                int64
percent_total           float64
dtype: object

In [8]:
# Connect to postgres database
connection_string = (f'{username}:{password}@{hostname}:5432/mothersandbabies')
engine = create_engine(f'postgresql://{connection_string}')

In [14]:
# Check available tables in postgres database
engine.table_names()

['babies',
 'numberchildrenadopted',
 'adoptionsbytype',
 'mothers',
 'mothersbirthcountry',
 'birthlocation',
 'ageofbirthmother',
 'adoptionbyageandgender',
 'parentrelationship',
 'adoptiveparentrelationship',
 'typeofadoption',
 'babiessexcount',
 'intercountry_bycountryoforigin',
 'intercountry_byagegroup',
 'intercountry_bysiblinggroup',
 'adoptionprocesstime',
 'yearcount',
 'averageage',
 'birthstate',
 'termbabiescount']

In [15]:
# Load panda's dataframe to postgres sql table
babies_df.to_sql(name='babies', con=engine, if_exists='append', index=True)

In [10]:
# Count the total number of babies born per year based on term and sex
dropped = babies_df.loc[babies_df["topic"]== "Sex"]
dropped = dropped.loc[dropped["topic_disaggregation"]!= "Indeterminate/Not stated"]

birth_term = dropped.groupby(["year", "babies_term", "topic_disaggregation"])

babies_count = birth_term["count"].sum()

babies_count_df = pd.DataFrame ({"babies_count":babies_count})

# Display the data frame
babies_count_df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,babies_count
year,babies_term,topic_disaggregation,Unnamed: 3_level_1
2010,Not stated,Female,20
2010,Not stated,Male,43
2010,Post-term,Female,1125
2010,Post-term,Male,1235
2010,Pre-term,Female,11455
2010,Pre-term,Male,13273
2010,Term,Female,133861
2010,Term,Male,138999
2011,Not stated,Female,31
2011,Not stated,Male,36


In [12]:
# Count the total number of babies born per year based on term and sex
dropped = babies_df.loc[babies_df["topic"]== "Sex"]
dropped = dropped.loc[dropped["topic_disaggregation"]!= "Indeterminate/Not stated"]

birth_term = dropped.groupby(["topic_disaggregation", "babies_term"])

babies_count = birth_term["count"].sum()

babies_count_df2 = pd.DataFrame ({"babies_count":babies_count})

# Display the data frame
babies_count_df2.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,babies_count
topic_disaggregation,babies_term,Unnamed: 2_level_1
Female,Not stated,500
Female,Post-term,7924
Female,Pre-term,121207
Female,Term,1362081
Male,Not stated,552
Male,Post-term,8586
Male,Pre-term,140954
Male,Term,1428777


In [8]:
# Count the total number of babies born per year
peryear = babies_df.groupby(["year"])

babies_yearcount = peryear["count"].sum()

babies_yearcount_df = pd.DataFrame ({"babies_count":babies_yearcount})

# Display the data frame
babies_yearcount_df.head(20)

Unnamed: 0_level_0,babies_count
year,Unnamed: 1_level_1
2010,3760519
2011,3780238
2012,5045193
2013,5010371
2014,5062028
2015,5001371
2016,5096228
2017,4933955
2018,9862276


In [14]:
# Count the total number of babies born per year based on plurality
plural = babies_df.loc[babies_df["topic"]== "Plurality"]
pluraldropped = plural.loc[plural["topic_disaggregation"]!= "Not stated"]

byyear = pluraldropped.groupby(["year", "topic_disaggregation"])

plural_count = byyear["count"].sum()

plural_count_df = pd.DataFrame ({"babies_count":plural_count})

# Display the data frame
plural_count_df.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,babies_count
year,topic_disaggregation,Unnamed: 2_level_1
2010,Other multiples,222
2010,Singleton,290773
2010,Twins,9220
2011,Other multiples,228
2011,Singleton,292736
2011,Twins,9060
2012,Other multiples,191
2012,Singleton,301025
2012,Twins,9062
2013,Other multiples,249


In [13]:
# Export file as a CSV, without the Pandas index, with the header
plural_count_df.to_csv("../machine_learning/termbabiescount.csv", index=False, header=True)

In [15]:
# Load panda's dataframe to postgres sql table
babies_count_df2.to_sql(name='babiessexcount', con=engine, if_exists='append', index=True)

In [29]:
# Load panda's dataframe to postgres sql table
babies_count_df.to_sql(name='termbabiescount', con=engine, if_exists='append', index=True)

In [30]:
# Export file as a CSV, without the Pandas index, with the header
babies_count_df.to_csv("../machine_learning/termbabiescount.csv", index=False, header=True)

In [None]:
# # Count the total number of woman who gave birth between 2010 and 2018
# total_births = mothers_df["count"].sum()
# total_births_df = pd.DataFrame ({"Number of Woman who Gave Birth":[total_births]})

# # Display the data frame
# total_births_df

In [None]:
# # Group data by age
# age = mothers_df.groupby("age_group")

# # Find the total count of each age
# age_count = age["count"].sum()

# # Find the percentage of each age
# age_percent = age_count/total_births*100

# # # Create a new dataframe to hold calculations for percentage and count of each gender
# age_df = pd.DataFrame({"Number of Woman who Gave Birth": age_count, "Percentage of Each Age Group 2010 - 2018": age_percent})

# # # Sort values in descedning order 
# age_df.sort_values(by=["Number of Woman who Gave Birth"], inplace = True, ascending = False)

# # # Convert Percentage of births to float, then format
# age_df["Percentage of Each Age Group 2010 - 2018"] = age_df["Percentage of Each Age Group 2010 - 2018"].astype(float).map("{:,.2f}%".format)

# # Add index name and sort by Age
# age_df.index.name = "age_group"
# age_df.sort_values(by=["age_group"], inplace = True, ascending = True)

# # # Display the data frame
# age_df

In [None]:
# # Add Data in to show per year

# # Group data by age
# age = mothers_df.groupby(["year", "age_group"])

# # Find the total count of each age
# age_count = age["count"].sum()
# age_count
# # Find the total count of births per year
# year = mothers_df.groupby("year")
# year_count = year["count"].sum()
# # year_count

# # Find the percentage of each age
# age_percent = age_count/year_count*100

# # # # Create a new dataframe to hold calculations for percentage and count of each gender
# age_df = pd.DataFrame({"Number of Woman who Gave Birth": age_count, "Percentage per Year": age_percent})

# # # # Sort values in descedning order 
# age_df.sort_values(by=["Number of Woman who Gave Birth"], inplace = True, ascending = False)

# # # Convert Percentage of births to float, then format
# age_df["Percentage per Year"] = age_df["Percentage per Year"].astype(float).map("{:,.2f}%".format)

# # Add index name and sort by year then age_group
# age_df.sort_values(by=["year", "age_group"], inplace = True, ascending = True)

# # # Display the data frame
# age_df.tail(60)

In [None]:
# # Average ages per year - REMOVE 0 Values first - 0 was not known information

# # Drop any 0's in the age_group column
# droped = mothers_df.loc[mothers_df["age_group"]!= "0"]

# # Group data by year
# year = droped.groupby(["year"])

# # Find the average age
# age_count = year["age_group"].mean()

# # Create a new dataframe to hold calculations for percentage and count of each gender
# avg_age = pd.DataFrame({"Average Age": age_count})

# # Add index name and sort by year then age_group
# avg_age.sort_values(by=["year"], inplace = True, ascending = True)

# # # Display the data frame
# avg_age

In [None]:
# # Total Births Per Year 

# # Group data by year
# year = mothers_df.groupby("year")

# # Find the total sum of births for each year
# totalperyear = year["count"].sum()

# # Find the percentage of each age
# total_percent = totalperyear/total_births*100

# # Find the average age
# age_count = year["age_group"].mean()

# # # Create a new dataframe to hold calculations for percentage and count of each gender
# year_df = pd.DataFrame({"Number of Woman who Gave Birth": totalperyear, "Percentage of Each Year": total_percent, "Average Age of Mothers at Birth" : age_count})

# # # Sort values in descedning order 
# year_df.sort_values(by=["Number of Woman who Gave Birth"], inplace = True, ascending = False)

# # # Convert Percentage of births each year to float, and format. Format average age
# year_df["Percentage of Each Year"] = year_df["Percentage of Each Year"].astype(float).map("{:,.2f}%".format)
# year_df["Average Age of Mothers at Birth"] = year_df["Average Age of Mothers at Birth"].map("{:,.2f}".format)

# # Add index name and sort by year then age_group
# year_df.sort_values(by=["year"], inplace = True, ascending = True)

# # # Display the data frame
# year_df

In [None]:
# # Drop any 0's in the age_group column
# droped = mothers_df.loc[mothers_df["age_group"]!= "0"]

# # Group data by year
# year = droped.groupby(["year"])

# # Find the average age
# age_count = year["age_group"].mean()

# # Create a new dataframe to hold calculations for percentage and count of each gender
# avg_age = pd.DataFrame({"Average Age": age_count})

# # Add index name and sort by year then age_group
# avg_age.sort_values(by=["year"], inplace = True, ascending = True)

# # # Display the data frame
# avg_age

In [None]:
# # Count the total number of woman who gave birth between 2010 and 2018 in either private or public hospitals
# dropped = mothers_df.loc[mothers_df["topic"]== "Hospital sector"]
# dropped = dropped.loc[dropped["topic_disaggregation"]!= "Not stated"]

# # Group by private or public
# birth_locations = dropped.groupby(["topic_disaggregation"])

# birth_locations_count = birth_locations["count"].sum()

# birth_locations_df = pd.DataFrame ({"Number of Woman Who Gave Birth":birth_locations_count})

# # Display the data frame
# birth_locations_df

In [None]:
# # Count the total number of woman who gave birth between 2010 and 2018 each year in either private or public hospitals
# dropped = mothers_df.loc[mothers_df["topic"]== "Hospital sector"]
# dropped = dropped.loc[dropped["topic_disaggregation"]!= "Not stated"]

# # Group by private or public
# birth_locations = dropped.groupby(["year", "topic_disaggregation"])

# birth_locations_count = birth_locations["count"].sum()

# birth_locations_df = pd.DataFrame ({"Number of Woman Who Gave Birth":birth_locations_count})

# # Display the data frame
# birth_locations_df

In [None]:
# # Count the total number of indigenous woman who gave birth between 2010 and 2018
# drop = mothers_df.loc[mothers_df["topic"]== "Indigenous status (mother)"]
# # dropped = dropped.loc[dropped["topic_disaggregation"]!= "Not stated"]

# # Group by Indigenous Status
# indigenous = drop.groupby(["topic_disaggregation"])

# indigenous_status = indigenous["count"].sum()

# indigenous_df = pd.DataFrame ({"Number of Indigenous Woman Who Gave Birth":indigenous_status})

# # Display the data frame
# indigenous_df

In [None]:
# # Count the total number of indigenous woman who gave birth between 2010 and 2018 per year
# drop = mothers_df.loc[mothers_df["topic"]== "Indigenous status (mother)"]

# # Group by year and Indigenous Status
# indigenous_year = drop.groupby(["year", "topic_disaggregation"])

# indigenous_status_year = indigenous_year["count"].sum()

# indigenous_year_df = pd.DataFrame ({"Number of Indigenous Woman Who Gave Birth":indigenous_status_year})

# # Display the data frame
# indigenous_year_df

In [None]:
# # Count the total number of woman who gave birth in each state between 2010 and 2018
# statedropped = mothers_df.loc[mothers_df["topic"]== "State and territory of birth"]
# # dropped = dropped.loc[dropped["topic_disaggregation"]!= "Not stated"]

# # Group by state
# birth_state = statedropped.groupby(["topic_disaggregation"])

# birth_state_count = birth_state["count"].sum()

# birth_state_df = pd.DataFrame ({"Number of Woman Who Gave Birth":birth_state_count})

# # Display the data frame
# birth_state_df

In [None]:
# # Count the total number of woman who gave birth in each state between 2010 and 2018
# statedropped = mothers_df.loc[mothers_df["topic"]== "State and territory of birth"]
# # dropped = dropped.loc[dropped["topic_disaggregation"]!= "Not stated"]

# # Group by year and state
# birth_state = statedropped.groupby(["year", "topic_disaggregation"])

# birth_state_count = birth_state["count"].sum()

# birth_state_df = pd.DataFrame ({"Number of Woman Who Gave Birth":birth_state_count})

# # Display the data frame
# birth_state_df

In [None]:
# # Read in csv for Mothers Birth Country
# mothersbirth = "finaldata/Table_4.1.csv"
# mothersbirth_df = pd.read_csv(mothersbirth)
# mothersbirth_df.head()

In [None]:
# # View all columns for df to decide what to keep
# mothersbirth_df.columns

In [None]:
# # Print datatypes
# mothersbirth_df.info()

In [None]:
# # Drop any values's in the topic column that don't relate to the mothers birth country
# dropped = mothersbirth_df.loc[mothersbirth_df["topic"]== "Country of birth (mother)"]

# # Group data by maternal country of birth
# countryofbirth = dropped.groupby("topic_disaggregation")

# # Find the total count of each mother per country
# countofmothers = countryofbirth["count"].sum()

# # Create a new dataframe to hold calculations for percentage and count of each gender
# countofmothers_df = pd.DataFrame({"number_women_gave_birth": countofmothers})


# # Add index name and sort by Age
# countofmothers_df.index.name = "maternal_birth_country"


# # Display the data frame
# countofmothers_df

In [None]:
# # Drop any values's in the topic column that don't relate to the mothers birth country
# dropped = mothersbirth_df.loc[mothersbirth_df["topic"]== "Country of birth (mother)"]

# # Group data by maternal country of birth and year
# countryofbirth = dropped.groupby(["year", "topic_disaggregation"])

# # Find the total count of each mother per country
# countofmothers = countryofbirth["count"].sum()

# # Create a new dataframe to hold calculations for percentage and count of each gender
# countofmothers_df = pd.DataFrame({"number_women_gave_birth": countofmothers})

# # Add index name and sort by Age
# countofmothers_df.index.name = "maternal_birth_country"


# # Display the data frame
# countofmothers_df

In [None]:
# # Read in csv for birth location
# birthlocation = "finaldata/Table_3.1.csv"
# birthlocation_df = pd.read_csv(birthlocation)
# birthlocation_df.head()

In [None]:
# # Print datatypes
# birthlocation_df.info()

In [None]:
# # Isolate one set of data, as data is replicated across multiple topics 
# placeofbirthfilter = birthlocation_df.loc[birthlocation_df["topic"]== "Country of birth (mother)"]

# # Find the total count of each mother that gave birth in different locations
# totalcount = placeofbirthfilter["count"].sum()
# totalcount

In [None]:
# # Group data by place of birth
# placeofbirth = placeofbirthfilter.groupby("place_of_birth")

# # Find the total count of each mother per place of birth
# countofbirthplace = placeofbirth["count"].sum()

# # # Create a new dataframe to hold the number of women who gave birth per place of birth
# countofbirthplace_df = pd.DataFrame({"number_of_women_per_birthplace": countofbirthplace})

# # Display the data frame
# countofbirthplace_df

In [None]:
# # Group data by year
# yearfilter = placeofbirthfilter.groupby(["year"])

# # Find the total count of each mother that gave birth in different locations each year
# totalyearcount = yearfilter["count"].sum()

# # Create a new dataframe to hold the number of women who gave birth per place of birth
# countofyear_df = pd.DataFrame({"number_of_women_per_year": totalyearcount})
# countofyear_df