# Australian Marriage Law Survey Data Normalization & Export to SQL

* We first normalize the data before importing it into MSSQL Server.
* A simple data model is created from the normalized data before it is imported into MSSQL Server.
* This project is a work in progress: more data will be added as it is found, to make the analysis more robust.

In [1]:
import pandas as pd
import numpy as np
import sqlalchemy

In [2]:
# Load the three raw survey extracts from the data directory, in the same
# order as before: participation, gendered participation, then responses.
df_part, df_part_gend, df_resp = map(
    pd.read_csv,
    (
        '../data/aus_participation.csv',
        '../data/aus_participation_gendered.csv',
        '../data/aus_response.csv',
    ),
)

# Normalization

In [3]:
# Table of states: a single 'state' column holding every distinct value of
# the participation data's 'State' column. value_counts() lists the names
# by descending row count, which fixes the row order of the table.
states_list = df_part['State'].value_counts().index.tolist()
States = pd.DataFrame(data={'state': states_list})

In [4]:
# Division table, derived from the response data.
# The response data already has the shape we want; we only discard the
# columns treated as redundant and rename what remains to the names used in
# the data model.
_redundant_columns = ['vote_yes_qty', 'vote_no_qty', 'vote_no_pct']
_division_column_names = {
    'area': 'div_name',
    'vote_yes_pct': 'approval_pct',
    'vote_total_qty': 'vote_qty',
    'State': 'state',
}
Division = (
    df_resp
    .copy()
    .drop(columns=_redundant_columns)
    .rename(columns=_division_column_names)
)
# Store the vote count as an integer column.
Division = Division.astype({'vote_qty': int})

In [5]:
# Build the Participations table: one row per division / age category /
# gender, carrying the eligible-participant count and the participation rate.

# Step 1: take the rows of the participation data flagged as
# 'Age or gender ndf' (age or gender not defined) and blank out both
# dimensions so they can be labelled uniformly at the end.
ndf = 'Age or gender ndf'
df_age_gender_ndf = df_part[df_part['age_category'] == ndf].copy()
df_age_gender_ndf[['age_category', 'gender']] = pd.NA
df_age_gender_ndf = df_age_gender_ndf.reset_index(drop=True)

# Step 2: stack those rows on top of the gendered participation data, then
# discard the 'State' column and the 'Total participants' statistic.
df_part_complete = pd.concat([df_age_gender_ndf, df_part_gend],
                             axis=0,
                             ignore_index=True)
df_part_complete = df_part_complete.drop(columns='State')
df_part_complete = (df_part_complete[df_part_complete['statistic'] != 'Total participants']
                    .reset_index(drop=True))

# Step 3: split the long-format 'statistic' column into one frame per
# statistic, each with its 'quantity' renamed to the statistic it holds.
df_eligible_participants = (df_part_complete[df_part_complete['statistic'] == 'Eligible participants']
                            .reset_index(drop=True)
                            .drop(columns='statistic')
                            .rename(columns={'quantity': 'eligible_participants'}))

df_participation_rate = (df_part_complete[df_part_complete['statistic'] == 'Participation rate (%)']
                         .reset_index(drop=True)
                         .drop(columns='statistic')
                         .rename(columns={'quantity': 'Participation rate (%)'}))

# Step 4: put the two statistics side by side.
# NOTE(review): rows are paired purely by position, so this assumes both
# statistics list the same division / age / gender combinations in the same
# order - confirm against the source CSVs.
df_normal_participation = pd.concat(
    [df_eligible_participants, df_participation_rate['Participation rate (%)']],
    axis=1)

# Step 5: drop the per-gender total rows, label the blanked-out age/gender
# values as 'ndf', and switch to the column names used in the data model.
Participations = (df_normal_participation[~df_normal_participation['age_category']
                                          .isin(['Total Males', 'Total Females'])]
                  .reset_index(drop=True)
                  # Only the two dimension columns receive the 'ndf' label; a
                  # blanket fillna('ndf') would also write that string into
                  # any missing numeric value.
                  .fillna({'age_category': 'ndf', 'gender': 'ndf'})
                  .rename(columns={'area': 'division',
                                   'Participation rate (%)': 'participation_rate'})
                  )
Participations['eligible_participants'] = Participations['eligible_participants'].astype(int)

In [6]:
# Display the States table built above.
States

Unnamed: 0,state
0,New South Wales
1,Victoria
2,Queensland
3,Western Australia
4,South Australia
5,Tasmania
6,Northern Territory
7,Australian Capital Territory


In [7]:
# Display the Division table built above.
Division

Unnamed: 0,div_name,approval_pct,vote_qty,state
0,Banks,44.9,84079,New South Wales
1,Barton,43.6,85137,New South Wales
2,Bennelong,49.8,86158,New South Wales
3,Berowra,54.6,88840,New South Wales
4,Blaxland,26.1,78332,New South Wales
...,...,...,...,...
145,Lyons,58.7,61152,Tasmania
146,Lingiari,54.5,34924,Northern Territory
147,Solomon,65.3,45452,Northern Territory
148,Canberra,74.1,120951,Australian Capital Territory


In [8]:
# Display the Participations table built above.
Participations

Unnamed: 0,division,age_category,eligible_participants,gender,participation_rate
0,Banks,ndf,572,ndf,83.9
1,Barton,ndf,465,ndf,85.2
2,Bennelong,ndf,571,ndf,84.1
3,Berowra,ndf,703,ndf,85.5
4,Blaxland,ndf,285,ndf,81.1
...,...,...,...,...,...
4645,Fenner,65-69 years,3544,male,89.3
4646,Fenner,70-74 years,2749,male,90.9
4647,Fenner,75-79 years,1698,male,90.8
4648,Fenner,80-84 years,1126,male,87.9


In [10]:
# Connect to the australia_same_sex_marriage_statistics database on the
# local SQL Server through pyodbc. The URL carries no username or password.
connection_url = (
    "mssql+pyodbc://@localhost/australia_same_sex_marriage_statistics"
    "?driver=ODBC Driver 17 for SQL Server"
)
engine = sqlalchemy.create_engine(connection_url)

# Write the Participations table to the 'participations' SQL table, leaving
# out the DataFrame index.
Participations.to_sql(name='participations', con=engine, index=False)

41

In [11]:
# Write the two remaining tables through the same engine, again leaving out
# the DataFrame index.
States.to_sql(name='states', con=engine, index=False)
Division.to_sql(name='division', con=engine, index=False)

150