In [1]:
import sqlite3
import pandas as pd

In [2]:
conn = sqlite3.connect('jobs.db')

In [3]:
# A SQL query that displays all of the columns and the first five rows of the recent_grads table.
pd.read_sql_query("SELECT * FROM recent_grads LIMIT 5;", conn)

Unnamed: 0,index,Rank,Major_code,Major,Major_category,Total,Sample_size,Men,Women,ShareWomen,...,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
0,0,1,2419,PETROLEUM ENGINEERING,Engineering,2339,36,2057,282,0.120564,...,270,1207,37,0.018381,110000,95000,125000,1534,364,193
1,1,2,2416,MINING AND MINERAL ENGINEERING,Engineering,756,7,679,77,0.101852,...,170,388,85,0.117241,75000,55000,90000,350,257,50
2,2,3,2415,METALLURGICAL ENGINEERING,Engineering,856,3,725,131,0.153037,...,133,340,16,0.024096,73000,50000,105000,456,176,0
3,3,4,2417,NAVAL ARCHITECTURE AND MARINE ENGINEERING,Engineering,1258,16,1123,135,0.107313,...,150,692,40,0.050125,70000,43000,80000,529,102,0
4,4,5,2405,CHEMICAL ENGINEERING,Engineering,32260,289,21239,11021,0.341631,...,5180,16697,1672,0.061098,65000,50000,75000,18314,4440,972


In [4]:
# This will give us the total number of employed graduates for each major category. 
pd.read_sql_query("SELECT SUM(Employed) FROM recent_grads GROUP BY Major_category;", conn)

Unnamed: 0,SUM(Employed)
0,66943
1,288114
2,302797
3,1088742
4,330660
5,237894
6,479839
7,420372
8,372147
9,544118


In [5]:
# This makes the output much easier to understand.
pd.read_sql_query("SELECT Major_category,SUM(Employed) FROM recent_grads GROUP BY Major_category;", conn)

Unnamed: 0,Major_category,SUM(Employed)
0,Agriculture & Natural Resources,66943
1,Arts,288114
2,Biology & Life Science,302797
3,Business,1088742
4,Communications & Journalism,330660
5,Computers & Mathematics,237894
6,Education,479839
7,Engineering,420372
8,Health,372147
9,Humanities & Liberal Arts,544118


In [6]:
pd.read_sql_query("SELECT Major_category,AVG(ShareWomen) FROM recent_grads GROUP BY Major_category;", conn)

Unnamed: 0,Major_category,AVG(ShareWomen)
0,Agriculture & Natural Resources,0.617938
1,Arts,0.561851
2,Biology & Life Science,0.584518
3,Business,0.405063
4,Communications & Journalism,0.643835
5,Computers & Mathematics,0.512752
6,Education,0.674986
7,Engineering,0.257158
8,Health,0.616857
9,Humanities & Liberal Arts,0.676193


In [7]:
pd.read_sql_query("SELECT AVG(ShareWomen) AS average_female_share FROM recent_grads;", conn)

Unnamed: 0,average_female_share
0,0.52255


In [8]:
pd.read_sql_query("SELECT SUM(Men) AS total_men,SUM(Women) AS total_women FROM recent_grads;", conn)

Unnamed: 0,total_men,total_women
0,2878263,3897752


In [9]:
# For each major category, find the percentage of graduates who are employed.
pd.read_sql_query("SELECT Major_category,SUM(Employed),SUM(Total),AVG(Employed),AVG(Total),\
AVG(Employed)/AVG(Total) AS share_employed \
FROM recent_grads \
GROUP BY Major_category;", conn)

Unnamed: 0,Major_category,SUM(Employed),SUM(Total),AVG(Employed),AVG(Total),share_employed
0,Agriculture & Natural Resources,66943,79981,6694.3,7998.1,0.836986
1,Arts,288114,357130,36014.25,44641.25,0.806748
2,Biology & Life Science,302797,453862,21628.357143,32418.714286,0.667157
3,Business,1088742,1302376,83749.384615,100182.769231,0.835966
4,Communications & Journalism,330660,392601,82665.0,98150.25,0.842229
5,Computers & Mathematics,237894,299008,21626.727273,27182.545455,0.795611
6,Education,479839,559129,29989.9375,34945.5625,0.85819
7,Engineering,420372,537583,14495.586207,18537.344828,0.781967
8,Health,372147,463230,31012.25,38602.5,0.803374
9,Humanities & Liberal Arts,544118,713468,36274.533333,47564.533333,0.762638


When we want to filter on a column generated by a query, we can use the **HAVING** statement.

In [10]:
pd.read_sql_query("SELECT Major_category,AVG(Employed)/AVG(Total) AS share_employed \
FROM recent_grads \
GROUP BY Major_category \
HAVING share_employed>.8;", conn)

Unnamed: 0,Major_category,share_employed
0,Agriculture & Natural Resources,0.836986
1,Arts,0.806748
2,Business,0.835966
3,Communications & Journalism,0.842229
4,Education,0.85819
5,Health,0.803374
6,Industrial Arts & Consumer Services,0.82267
7,Law & Public Policy,0.808399


In [11]:
# Find all of the major categories where the share of graduates with low-wage jobs is greater than .1.
pd.read_sql_query("SELECT Major_category,AVG(Low_wage_jobs)/AVG(Total) AS share_low_wage \
FROM recent_grads \
GROUP BY Major_category \
HAVING share_low_wage>.1;", conn)

Unnamed: 0,Major_category,share_low_wage
0,Arts,0.168331
1,Communications & Journalism,0.126324
2,Humanities & Liberal Arts,0.132087
3,Industrial Arts & Consumer Services,0.115713
4,Law & Public Policy,0.115685
5,Psychology & Social Work,0.116934
6,Social Science,0.102233


In [12]:
# The query will round the ShareWomen column to two decimal places.
pd.read_sql_query("SELECT Major_category,ROUND(ShareWomen,2) AS rounded_share_women \
FROM recent_grads \
GROUP BY Major_category;", conn)

Unnamed: 0,Major_category,rounded_share_women
0,Agriculture & Natural Resources,0.72
1,Arts,0.63
2,Biology & Life Science,0.64
3,Business,0.73
4,Communications & Journalism,0.88
5,Computers & Mathematics,0.86
6,Education,0.88
7,Engineering,0.38
8,Health,0.71
9,Humanities & Liberal Arts,0.67


In [13]:
pd.read_sql_query("SELECT ROUND(ShareWomen,4),Major_category FROM recent_grads LIMIT 10;", conn)

Unnamed: 0,"ROUND(ShareWomen,4)",Major_category
0,0.1206,Engineering
1,0.1019,Engineering
2,0.153,Engineering
3,0.1073,Engineering
4,0.3416,Engineering
5,0.145,Engineering
6,0.5357,Business
7,0.4414,Physical Sciences
8,0.1398,Engineering
9,0.4378,Engineering


In [14]:
pd.read_sql_query("SELECT Major_category,ROUND(AVG(Employed)/AVG(Total),3) AS share_employed \
FROM recent_grads \
GROUP BY Major_category \
HAVING share_employed>.8;", conn)

Unnamed: 0,Major_category,share_employed
0,Agriculture & Natural Resources,0.837
1,Arts,0.807
2,Business,0.836
3,Communications & Journalism,0.842
4,Education,0.858
5,Health,0.803
6,Industrial Arts & Consumer Services,0.823
7,Law & Public Policy,0.808


In [15]:
pd.read_sql_query("SELECT Major_category,ROUND(AVG(College_jobs)/AVG(Total),3) AS share_degree_jobs \
FROM recent_grads \
GROUP BY Major_category \
HAVING share_degree_jobs<.3;", conn)

Unnamed: 0,Major_category,share_degree_jobs
0,Agriculture & Natural Resources,0.248
1,Arts,0.265
2,Business,0.114
3,Communications & Journalism,0.22
4,Humanities & Liberal Arts,0.27
5,Industrial Arts & Consumer Services,0.249
6,Law & Public Policy,0.163
7,Social Science,0.215
