In [1]:
 # Python extension for interfacing with SQL (ipython-sql, loaded later via `%load_ext sql`) and better table formatting with Pandas

# Uncomment the lines below to install the packages if they are not already available in this environment
# !pip install ipython-sql
# !pip install pandas

Our data for this project starts out in 2 CSV files. Let's first investigate what these files look like.

In [10]:
# Let's load the libraries that we need up front, in their own separate cell
import pandas as pd
import sqlite3

In [11]:
# Getting a view of the census csv table
census_df = pd.read_csv('census_data.csv')
print(census_df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
census_df.info()

   zip_code state_code  pop_total  median_household_income  pct_edu_hs  \
0       601         PR      17800                  11507.0     24.1861   
1       602         PR      39716                  15511.0     18.3838   
2       603         PR      51565                  16681.0     23.6281   
3       606         PR       6320                  11648.0     32.7551   
4       610         PR      27976                  17751.0     28.7571   

   pct_edu_somecollege_under1yr  pct_edu_somecollege_1plusyrs  \
0                        1.2198                       12.9974   
1                        1.0237                        8.3569   
2                        1.6381                       10.2082   
3                        0.7076                        4.0402   
4                        1.3412                        9.2270   

   pct_edu_attain_assoc  pct_edu_attain_bach  pct_edu_attain_master  \
0                5.0812              15.5212                 2.3387   
1               13.392

Seems like this table contains information on different areas and some information about the people within them, mainly education and race percentages. \
The zip_code column should be the primary key for this table (each zip code should appear only once); we will check this later, once we have built our SQL database.

In [12]:
# Getting a view of the high school csv table
public_hs_df = pd.read_csv('public_hs_data.csv')
print(public_hs_df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
public_hs_df.info()

     school_id                school_name        street_address          city  \
0  10000500871       Albertville High Sch      402 E McCord Ave   Albertville   
1  10000600878           Douglas High Sch       225 Eagle Drive       Douglas   
2  10000600883  Kate D Smith DAR High Sch          6077 Main St         Grant   
3  10000601585       Brindlee Mt High Sch   994 Scant City Road  Guntersville   
4  10000700251            Hoover High Sch  1000 Buccaneer Drive        Hoover   

  state_code  zip_code  locale_code  pct_proficient_math  \
0         AL     35950         32.0            16.999999   
1         AL     35964         42.0            12.000000   
2         AL     35747         42.0            22.000000   
3         AL     35976         41.0            21.999999   
4         AL     35244         13.0            39.999998   

   pct_proficient_reading  pct_white  pct_black  pct_asian  pct_hispanic  \
0               54.000001    64.9018     2.0495     0.5978       31.4261   


Seems like this table contains information on the different public high schools, including location information, race percentages, and proficiency percentages for math and reading.
One interesting thing to note is that locale_code describes how urbanized the school's surrounding area is, rather than anything about the school itself. This is mentioned in the project instructions, which give the following table:

| locale_text | locale_code (locale_size)         |
|-------------|----------------------------------|
| City        | 11 (Large), 12 (Midsize), 13 (Small) |
| Suburb      | 21 (Large), 22 (Midsize), 23 (Small) |
| Town        | 31 (Fringe), 32 (Distant), 33 (Remote) |
| Rural       | 41 (Fringe), 42 (Distant), 43 (Remote) |

The school_id column should be the primary key of this table, as each school should have its own unique ID. The zip_code column would then be a foreign key referencing the census table, assuming we were correct earlier.

In [13]:
# Combining tables and making the SQL database

# Make the SQLite database (sqlite3 creates the file if it does not exist yet)
conn = sqlite3.connect('education.db')

try:
    # Write DataFrames to SQLite tables in the database (don't need the dataframe indexes).
    # if_exists='replace' drops and recreates each table, so this cell can be re-run safely.
    public_hs_df.to_sql('highschool', conn, if_exists='replace', index=False)
    census_df.to_sql('census', conn, if_exists='replace', index=False)
finally:
    # Close the connection even if one of the writes fails, so the database
    # file is not left held open by a leaked connection
    conn.close()

In [1]:
# Load the SQL extension and connect it to the SQLite database file so that SQL can be
# run directly in notebook cells through the %sql / %%sql magics.
# autopandas=True formats the SQL query outputs as Pandas DataFrames for better display.
# NOTE(review): the queries below expect education.db to already contain the tables written by the database-creation cell.

%load_ext sql
%sql sqlite:///education.db
%config SqlMagic.autopandas=True

Let's now check the candidate key columns from earlier to make sure their values really are unique, so that they can correctly be treated as the primary/foreign keys.

In [10]:
%%sql
SELECT
    -- COUNT(column) only counts non-NULL values, so equal totals below mean
    -- that every non-NULL zip_code occurs exactly once in the census table
    COUNT(c.zip_code)          AS total_values_census,
    COUNT(DISTINCT c.zip_code) AS unique_values_census
FROM census AS c;

 * sqlite:///education.db
Done.


Unnamed: 0,total_values_census,unique_values_census
0,33120,33120


In [11]:
%%sql
SELECT
    -- COUNT(column) only counts non-NULL values, so equal totals below mean
    -- that every non-NULL school_id occurs exactly once in the highschool table
    COUNT(h.school_id)          AS total_values_highschool,
    COUNT(DISTINCT h.school_id) AS unique_values_highschool
FROM highschool AS h;

 * sqlite:///education.db
Done.


Unnamed: 0,total_values_highschool,unique_values_highschool
0,16623,16623


Looks like these columns can work as our keys. We won't be adding the key constraints to the tables, since SQLite's ALTER TABLE cannot add a primary or foreign key to an existing table. We could create new tables with the constraints and copy the data over, but this is unnecessary for our purposes.

In [23]:
%%sql
-- List each column of the census table with its declared type and its notnull / default / pk settings
PRAGMA table_info(census);

 * sqlite:///education.db
Done.


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,zip_code,INTEGER,0,,0
1,1,state_code,TEXT,0,,0
2,2,pop_total,INTEGER,0,,0
3,3,median_household_income,REAL,0,,0
4,4,pct_edu_hs,REAL,0,,0
5,5,pct_edu_somecollege_under1yr,REAL,0,,0
6,6,pct_edu_somecollege_1plusyrs,REAL,0,,0
7,7,pct_edu_attain_assoc,REAL,0,,0
8,8,pct_edu_attain_bach,REAL,0,,0
9,9,pct_edu_attain_master,REAL,0,,0


In [24]:
%%sql
-- List each column of the highschool table with its declared type and its notnull / default / pk settings
PRAGMA table_info(highschool);

 * sqlite:///education.db
Done.


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,school_id,INTEGER,0,,0
1,1,school_name,TEXT,0,,0
2,2,street_address,TEXT,0,,0
3,3,city,TEXT,0,,0
4,4,state_code,TEXT,0,,0
5,5,zip_code,INTEGER,0,,0
6,6,locale_code,REAL,0,,0
7,7,pct_proficient_math,REAL,0,,0
8,8,pct_proficient_reading,REAL,0,,0
9,9,pct_white,REAL,0,,0


There is also no reason to change any of the other column constraints, as we will likely not be adding any more data to these tables.