## Census case study
- Preparing SQLAlchemy and the Database
- Loading Data int the Database
- Solving Data Science Problems with Queries

### Part 1: Preparing SQLAlchemy and the Database
- Create an Engine and MetaData object

In [1]:
from sqlalchemy import create_engine, MetaData
engine = create_engine('sqlite:///census.sqlite')
metadata = MetaData()

- Create and save the census table

In [2]:
from sqlalchemy import (Table, Column, String, Integer, Float, Boolean)

# employees = Table('employees', metadata,
#                  Column('id', Integer()),
#                  Column('name', String(225)),
#                  Column('salary', Decimal()),
#                  Column('active', Boolean()))

# metadata.create_all(engine)

In [3]:
# Import create_engine, MetaData
from sqlalchemy import create_engine, MetaData

# Define an engine to connect to chapter5.sqlite: engine
engine = create_engine('sqlite:///chapter5.sqlite')

# Initialize MetaData: metadata
metadata = MetaData()

In [4]:
# Import Table, Column, String, and Integer
from sqlalchemy import (Table, Column, String, Integer)

# Build a census table: census
census = Table('census', metadata,
               Column('state', String(30)),
               Column('sex', String(1)),
               Column('age', Integer()),
               Column('pop2000', Integer()),
               Column('pop2008', Integer()))

# Create the table in the database
metadata.create_all(engine)

### Populating the Database

In [7]:
import csv
values_list =[]

with open('census.csv', newline='\n') as csvfile:
    csv_reader = csv.reader(csvfile, delimiter=',')
    for row in csv_reader:
        data = {'state': row[0], 'sex': row[1], 
                'age': row[2], 'pop2000': row[3],
               'pop2008': row[4]}
        values_list.append(data)

values_list[:5]

[{'state': 'Illinois',
  'sex': 'M',
  'age': '0',
  'pop2000': '89600',
  'pop2008': '95012'},
 {'state': 'Illinois',
  'sex': 'M',
  'age': '1',
  'pop2000': '88445',
  'pop2008': '91829'},
 {'state': 'Illinois',
  'sex': 'M',
  'age': '2',
  'pop2000': '88729',
  'pop2008': '89547'},
 {'state': 'Illinois',
  'sex': 'M',
  'age': '3',
  'pop2000': '88868',
  'pop2008': '90037'},
 {'state': 'Illinois',
  'sex': 'M',
  'age': '4',
  'pop2000': '91947',
  'pop2008': '91111'}]

In [9]:
from sqlalchemy import insert
connection = engine.connect()

stmt = insert(census)

result_proxy = connection.execute(stmt, values_list)

print(result_proxy.rowcount)

8772


### Example Queries

#### Part 3: Answering Data Science Questions with Queries
- Determine Average Age for Males and Females

In [10]:
from sqlalchemy import select, func

stmt = select([census.columns.sex, 
              (func.sum(census.columns.pop2008 *
                       census.columns.age) /
              func.sum(census.columns.pop2008)).label('average_age')])

stmt = stmt.group_by(census.columns.sex)

results = connection.execute(stmt).fetchall()
print(results)

[('F', 38), ('M', 35)]


In [12]:
from sqlalchemy import case, cast

stmt = select([
        (func.sum(
        case([
            (census.columns.state == 'New York',
            census.columns.pop2008)], else_ = 0)) /
         cast(func.sum(census.columns.pop2008), 
             Float) * 100).label('ny_percent')
])

results = connection.execute(stmt).fetchall()
print(results)

[(6.426761976501632,)]


- Determine the top 5 states by population change from 2000 to 2008

In [13]:
stmt = select([census.columns.age, 
              (census.columns.pop2008-
              census.columns.pop2000).label('pop_change')])

stmt = stmt.order_by('pop_change')
stmt = stmt.limit(5)

results = connection.execute(stmt).fetchall()
print(results)

[(9, -42870), (9, -40221), (10, -39018), (8, -37828), (8, -37154)]


In [19]:
# Import select
from sqlalchemy import select

# Calculate weighted average age: stmt
stmt = select([census.columns.sex,
               (func.sum(census.columns.pop2008 * census.columns.age) /
                func.sum(census.columns.pop2008)).label('average_age')
               ])

# Group by sex
stmt = stmt.group_by(census.columns.sex)

# Execute the query and store the results: results
results = connection.execute(stmt).fetchall()

# Print the average age by sex
for result in results:
    print(result.sex, result.average_age)

F 38
M 35


In [20]:
# import case, cast and Float from sqlalchemy
from sqlalchemy import case, cast, Float

# Build a query to calculate the percentage of females in 2000: stmt
stmt = select([census.columns.state,
    (func.sum(
        case([
            (census.columns.sex == 'F', census.columns.pop2000)
        ], else_=0)) /
     cast(func.sum(census.columns.pop2000), Float) * 100).label('percent_female')
])

# Group By state
stmt = stmt.group_by(census.columns.state)

# Execute the query and store the results: results
results = connection.execute(stmt).fetchall()

# Print the percentage
for result in results:
    print(result.state, result.percent_female)

Alabama 51.832407770179465
Alaska 49.301497893484594
Arizona 50.22361303057914
Arkansas 51.26992846221834
California 50.35233214901979
Colorado 49.84767060299562
Connecticut 51.66816507130644
Delaware 51.61109733558627
District of Columbia 53.129626141738385
Florida 51.36488001165242
Georgia 51.11408350339436
Hawaii 51.118011836915514
Idaho 49.98972623903102
Illinois 51.11224234802867
Indiana 50.95480313297678
Iowa 50.950398342534264
Kansas 50.821864107754735
Kentucky 51.32687036927168
Louisiana 51.75351596554121
Maine 51.50570813418951
Maryland 51.93575549972231
Massachusetts 51.843023571316785
Michigan 50.97246518318712
Minnesota 50.49332944301148
Mississippi 51.92229481794672
Missouri 51.46888602639692
Montana 50.32202690728538
Nebraska 50.8584549336086
Nevada 49.36736361384359
New Hampshire 50.858019844961746
New Jersey 51.51713956125773
New Mexico 51.0471720798335
New York 51.83453865150073
North Carolina 51.482262322084594
North Dakota 50.50069363231332
Ohio 51.46550350015544
Okl

In [24]:
from sqlalchemy import desc

In [25]:
# Build query to return state name and population difference from 2008 to 2000
stmt = select([census.columns.state,
     (census.columns.pop2008 - census.columns.pop2000).label('pop_change')
])

# Group by State
stmt = stmt.group_by(census.columns.state)

# Order by Population Change
stmt = stmt.order_by(desc('pop_change'))

# Limit to top 10
stmt = stmt.limit(10)

# Use connection to execute the statement and fetch all results
results = connection.execute(stmt).fetchall()

# Print the state and population change for each record
for result in results:
    print('{}:{}'.format(result.state, result.pop_change))

California:105705
Florida:100984
Texas:51901
New York:47098
Pennsylvania:42387
Arizona:29509
Ohio:29392
Illinois:26221
Michigan:25126
North Carolina:24108
