In [19]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import ipynb.fs.full.project_functions as prf

In [2]:
engine = create_engine('postgresql:///opportunity_youth') # since database is owned by me, no other credentials are needed

### these are the query strings used to retrive the data from postgres

this inlcueds the strings and the items that we queried inorder to retrive the data

In [3]:
str_all = """
SELECT PWGTP, agep, SCHG, SCH, SCHL, ESR
FROM pums_2017
WHERE (puma BETWEEN '11601' AND '11616')
AND (agep BETWEEN 16 AND 24)
"""
### this string will search the opportunity_youth data base for all individuals between the ages of 16 and 24

In [4]:
str_south = """
SELECT PWGTP, agep, SCHG, SCH, SCHL, ESR
FROM pums_2017
WHERE (puma BETWEEN '11610' AND '11615')
AND (agep BETWEEN 16 AND 24)
"""
### this string will search the opportunity_youth data base for all individuals between the ages of 16 and 24

In [5]:
str_yo = """
SELECT PWGTP, agep, SCHG, SCH, SCHL
FROM pums_2017
WHERE (puma BETWEEN '11601' AND '11616')
AND (agep BETWEEN 16 AND 24)
AND (ESR = '3' OR ESR = '6')
AND (SCH = '1')
"""
### this string will search the opportunity_youth data base for all the youth opportunity in king county

In [7]:
str_south_yo = """
SELECT PWGTP, agep, SCHG, SCH, SCHL
FROM pums_2017
WHERE (puma BETWEEN '11610' AND '11615')
AND (agep BETWEEN 16 AND 24)
AND (ESR = '3' OR ESR = '6')
AND (SCH = '1')
"""

In [6]:
str_work_no_dimploma = """
SELECT PWGTP, agep, SCHG, SCH, SCHL, ESR
FROM pums_2017
WHERE (puma BETWEEN '11610' AND '11615')
AND (agep BETWEEN 16 AND 24)
AND (ESR != '3' AND ESR != '6')
AND (SCHL <= '15')
"""

In [8]:
df_s = pd.read_sql(sql = str_south, con = engine) #data frame for all people between the ages of 16-24 in South King County
df_s_yo = pd.read_sql(sql = str_south_yo, con = engine) #data frame for all youth opportunity in south King County
df_w_noDiploma = pd.read_sql(sql = str_work_no_dimploma, con = engine) #data frame for all people with no diploma between the ages of 16 and 24 that are working

### functions used for project

In [9]:
def get_average(data, col_name, age1 = 16, age2 = 24):
    try: 
        filtered_data = data[(data.agep >= age1) & (data.agep <= age2)]
        filtered_data['weighted_sum'] = filtered_data[col_name]*filtered_data['pwgtp']
        sum_of_weighted_data = filtered_data['weighted_sum'].sum()
        sum_of_weights = filtered_data['pwgtp'].sum()
        return sum_of_weighted_data/sum_of_weights
    except:
        print("operation could not be completed")


In [10]:
def weight_sum(df):
    weighted_sum = df['pwgtp'].sum()
    return weighted_sum
                      

In [11]:
def trisect_ages(df):
    df_16_18 = df[(df.agep>=16) & (df.agep <= 18)]
    value_16_18 = weight_sum(df_16_18)
    
    df_19_21 = df[(df.agep>=19) & (df.agep <= 21)]
    value_19_21 = weight_sum(df_19_21)
    
    df_22_24 = df[(df.agep>=22) & (df.agep <= 24)]
    value_22_24 = weight_sum(df_22_24)
    
    value_total = weight_sum(df)
    
    value_array = np.array([value_16_18, value_19_21, value_22_24, value_total])
    return value_array

In [12]:
def form_2d_array(df_list):
    new_list = np.zeros(len(df_list)+1)
    df_array = np.array(df_list)
    df_table = np.array([trisect_ages(x_df) for x_df in df_array])
    
    for x in range(len(df_list)+1):
        element = df_table[0][x]-(df_table[1][x]+df_table[2][x])
        new_list[x] = element
    final_table = np.append(df_table, [new_list], axis = 0)
    
    return final_table
    

In [13]:
def create_df(array_entry, columns_entry, index_entry):
    df = pd.DataFrame(array_entry, columns = columns_entry, index = index_entry)
    return df
    

### This will create an array using the data

the first_array function takes in a list of dataframes as an arguement and divides the population up into three rows, the first column are people who are 16-18 in age, the second column represents people who are 19-21, the third column represents people who are 22-24 and the fourth column are the toal number of people. The first row represents the total as well, while the second row represents the people who qualify for opportunity youth, the third row are people who don't have a degree but are working, and the last row are the people who don't qualify for opportunity youth.

In [16]:
first_array = form_2d_array([df_s, df_s_yo, df_w_noDiploma])
first_array

array([[30141., 25486., 30256., 85883.],
       [ 1815.,  3902.,  4897., 10614.],
       [ 4377.,  1630.,  1705.,  7712.],
       [23949., 19954., 23654., 67557.]])

This will create the column names and the row names that will later be used for creating the data

In [9]:
column_names = ['16-18 total', '19-21 total', '22-24 total', '16-24 total']
index_names = ["Total Population", "Opportunity Youth", "Working no Dimploma", "Not Opportunity Youth"]

### Creating the Pandas table

In [256]:
df_first_table = pd.DataFrame(first_array, columns = ['16-18 total', '19-21 total', '22-24 total', '16-24 total'], index = ["Total Population", "Opportunity Youth", "Working no Dimploma", "Not Opportunity Youth"])
df_first_table

Unnamed: 0,16-18 total,19-21 total,22-24 total,16-24 total
Total Population,30141.0,25486.0,30256.0,85883.0
Opportunity Youth,1815.0,3902.0,4897.0,10614.0
Working no Dimploma,4377.0,1630.0,1705.0,7712.0
Not Opportunity Youth,23949.0,19954.0,23654.0,67557.0


### Cleaning Data

though the data doesn't need much cleaning, to get the look of the original table, several new columns were required to represent the percentages of each group

In [267]:
reorganized_list = ['16-18 percentage','16-18 total', '19-21 percentage', '19-21 total', '22-24 percentage', '22-24 total', '16-24 percentage', '16-24 total', ]

In [268]:
df_first_table = df_first_table.reindex(columns = reorganized_list)
df_first_table


Unnamed: 0,16-18 percentage,16-18 total,19-21 percentage,19-21 total,22-24 percentage,22-24 total,16-24 percentage,16-24 total
Total Population,100.0,30141.0,100.0,25486.0,100.0,30256.0,100.0,85883.0
Opportunity Youth,6.0,1815.0,15.0,3902.0,16.0,4897.0,12.0,10614.0
Working no Dimploma,15.0,4377.0,6.0,1630.0,6.0,1705.0,9.0,7712.0
Not Opportunity Youth,79.0,23949.0,78.0,19954.0,78.0,23654.0,79.0,67557.0


### Garbage work for testing out additional stuff not relevant for table


In [270]:
df_array = np.array([df_s, df_yo_south, df_w_noDiploma])
df_table = np.array([trisect_ages(df) for df in df_array])

print(df_table)

new_list = np.array([0, 0, 0, 0])
for x in range(4):
    element = df_table[0][x]-(df_table[1][x]+df_table[2][x])
    new_list[x] = element
print(new_list)
np.append(df_table, [new_list], axis = 0)

[[30141. 25486. 30256. 85883.]
 [ 1815.  3902.  4897. 10614.]
 [ 4377.  1630.  1705.  7712.]]
[23949 19954 23654 67557]


array([[30141., 25486., 30256., 85883.],
       [ 1815.,  3902.,  4897., 10614.],
       [ 4377.,  1630.,  1705.,  7712.],
       [23949., 19954., 23654., 67557.]])

In [44]:
filtered_data = df_all[(df_all.agep >= 16) & (df_all.agep <= 18)]

In [45]:
filtered_data.head()

Unnamed: 0,puma,pwgtp,agep,schg,sch,schl,esr
4,11613,15.0,19.0,,1,13,6
6,11606,20.0,16.0,12.0,2,12,6
7,11606,15.0,19.0,,1,19,1
8,11611,28.0,18.0,15.0,2,19,3
10,11610,45.0,18.0,14.0,2,16,1


In [51]:
filtered_data['weighted_sum'] = filtered_data['agep']*filtered_data['pwgtp']
weighted_sum = filtered_data['weighted_sum'].sum()
sum_weights = filtered_data['pwgtp'].sum()
average_weight = weighted_sum/sum_weights
print(average_weight)



17.472792375335686


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [31]:
df_16_18 = df_all[(df_all['agep']>= 16.0) & (df_all['agep']<= 18.0)]
total_number_16_18 = df_16_18['pwgtp'].sum()
total_number_16_18

72945.0

In [10]:
df_yo= pd.read_sql(sql = search_string_yo, con = engine)

In [18]:
df_all.head()
df_all.shape

(10053, 7)