In [None]:
# These exercises are from https://www.pgexercises.com/
# Practice on these exercises using both SQL and pandas

In [4]:
import pandas as pd
import numpy as np
import psycopg2
import datetime
import time


Open SQL to get tables

In [5]:
#Open Session

conn = psycopg2.connect("dbname=exercises user=postgres")
cur = conn.cursor()


In [6]:
# Check the relations in the db

cur.execute("select relname from pg_class where relkind='r' and relname !~ '^(pg_|sql_)';")
print cur.fetchall()

[('facilities',), ('bookings',), ('members',)]


In [7]:
# Test Query

query = '''SELECT * FROM cd.facilities;'''
cur.execute(query)
cur.fetchone()

(0,
 'Tennis Court 1',
 Decimal('5'),
 Decimal('25'),
 Decimal('10000'),
 Decimal('200'))

In [11]:
# Read Tables into Pandas

qry = 'Select * from cd.members;'
members_df = pd.read_sql(qry, conn)

qry = 'Select * from cd.facilities;'
facilities_df = pd.read_sql(qry, conn)

qry = 'Select * from cd.bookings;'
bookings_db = pd.read_sql(qry, conn)


In [12]:
facilities_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 6 columns):
facid                 9 non-null int64
name                  9 non-null object
membercost            9 non-null float64
guestcost             9 non-null float64
initialoutlay         9 non-null float64
monthlymaintenance    9 non-null float64
dtypes: float64(4), int64(1), object(1)
memory usage: 504.0+ bytes


In [10]:
conn.rollback()

In [None]:
conn.close()

Time for Pandas Fun!!!!

In [None]:
# BASICS - Exercise #1: How can you retrieve all the information from the cd.facilities table?

facilities_df

# SQL Version:   
# SELECT * FROM cd.facilities;

In [None]:
# BASICS - Exercise #2: You want to print out a list of all of the facilities and their cost to members. 
#                       How would you retrieve a list of only facility names and costs?


facilities_df[['name', 'membercost']] #.to_string(index=False)

# SQL Version:   
# SELECT facilities.name, facilities.membercost FROM cd.facilities;

In [None]:
# BASICS - Exercise #3:  How can you produce a list of facilities that charge a fee to members?

facilities_df[facilities_df['membercost'] > 0]

# SQL Version:   
# SELECT * FROM cd.facilities
# WHERE membercost > 0;

In [None]:
# BASICS - Exercise #4: How can you produce a list of facilities that charge a fee to members, 
#                       and that fee is less than 1/50th of the monthly maintenance cost? 
#                       Return the facid, facility name, member cost, and monthly maintenance 
#                       of the facilities in question.


facilities_df[(facilities_df['membercost'] > 0) & (facilities_df['membercost'] < facilities_df['monthlymaintenance']/50)]\
[['facid', 'name', 'membercost', 'monthlymaintenance']]

# SQL Version:   
# SELECT facid, name, membercost, monthlymaintenance 
#         FROM cd.facilities
#         WHERE membercost > 0 AND 
#               membercost < monthlymaintenance/50;

In [None]:
# BASICS - Exercise #5:  How can you produce a list of all facilities with the word 'Tennis' in their name?


# Three Pandas methods, two use apply, the other uses str.contains:
# facilities_df[facilities_df.apply(lambda x: "Tennis" in x['name'], axis=1)]

def find_tennis(data):
    return "Tennis" in data['name']

facilities_df[facilities_df.apply(find_tennis, axis=1)]

# facilities_df[facilities_df['name'].str.contains('Tennis')]


# SQL Version:   
# SELECT * 
#      FROM cd.facilities
#      WHERE name LIKE '%Tennis%';


In [None]:
# BASICS - Exercise #6: How can you retrieve the details of facilities with ID 1 and 5? 
#                       Try to do it without using the OR operator.


# Pandas slicing. So fun! the slicing format is [[firstrow:lastrow:order],[firstcolumn:lastcolumn:order]] 

# facilities_df.iloc[[1,5],:]  # Note: this works if the facid is the index, otherwise, use:

facilities_df[facilities_df.apply(lambda x: (x['facid']==1) or (x['facid']==5), axis=1)]


# SQL Version:   
#    SELECT * 
#    FROM cd.facilities
#        WHERE 
#              facid IN (1,5);


In [None]:
# BASICS - Exercise #7: How can you produce a list of facilities, 
#                       with each labelled as 'cheap' or 'expensive' depending on if their 
#                       monthly maintenance cost is more than $100? 
#                       Return the name and monthly maintenance of the facilities in question.


# Two different approaches; 4 different solutions
# 3 use a(n anonymous) function and .apply
# the other uses np.where
# def cheap_expensive(x): 
#     if (x < 100):
#         return "cheap"
#     else: return 'expensive'

# facilities_df['cost'] = facilities_df['monthlymaintenance'].apply(cheap_expensive)


# facilities_df['cost'] = facilities_df['monthlymaintenance'].apply(lambda x: 'cheap' if x<100 else 'expensive')
facilities_df['cost'] = facilities_df.apply(lambda x: 'cheap' if x['monthlymaintenance']<100 else 'expensive', axis=1)

# facilities_df["cost"] = np.where(facilities_df["monthlymaintenance"] < 100, "Cheap", "Expensive")

facilities_df[['name', 'cost']]


# SQL Version: 
# SELECT facilities.name, 
# 	CASE 
#     	WHEN facilities.monthlymaintenance > 100 THEN 'expensive'
#         ELSE 'cheap'  
#         END as cost
#     FROM cd.facilities



In [None]:
# BASICS - Exercise #8:  How can you produce a list of members who joined after the start of September 2012? 
#                        Return the memid, surname, firstname, and joindate of the members in question.


members_df[members_df['joindate'] > datetime.date(2012,9,1)][['memid', 'surname', 'firstname', 'joindate']]

# SQL Version: 
# SELECT members.memid, members.surname, members.firstname, members.joindate
#         FROM cd.members       
#         WHERE 
#             members.joindate >= '2012-09-01'

In [None]:
# BASICS - Exercise #9:  How can you produce an ordered list of the first 10 surnames in the members table?
#                        The list must not contain duplicates.


list(members_df['surname'].sort_values().unique()[:10])


# SQL Version: 
# # SELECT DISTINCT members.surname
#      FROM cd.members
#      ORDER BY  members.surname
#      LIMIT 10;

In [None]:
# BASICS - Exercise #10:  You, for some reason, want a combined list of all surnames and all facility names. 
#                         Yes, this is a contrived example :-).   Produce that list!

list(members_df['surname'].unique())+list(facilities_df['name'].unique())


# SQL Version: 

# SELECT DISTINCT members.surname
# 	FROM cd.members
	
# 	UNION
	
# SELECT DISTINCT facilities.name
# 	FROM cd.facilities


In [None]:
# BASICS - Exercise #11:  You'd like to get the signup date of your last member. How can you retrieve this information?


members_df['joindate'].max().date()

# SQL Version: 

# SELECT MAX(members.joindate) as latest
# 	FROM cd.members;

In [None]:
# BASICS - Exercise #12:   You'd like to get the first and last name of the last member(s) who signed up - 
#                          not just the date. How can you do that?

members_df[members_df['joindate']==members_df['joindate'].max()][['firstname', 'surname', 'joindate']]


# SQL Version: 
# select firstname, surname, joindate
# 	from cd.members
# 	where joindate = 
# 		(select max(joindate) 
# 			from cd.members);          
        
#         or
        
# SELECT members.firstname, members.surname, members.joindate
# 	FROM cd.members
	
# 	ORDER BY members.joindate DESC
# 	LIMIT 1;

In [None]:
# Joins and Subqueries  - Exercise #1:   How can you produce a list of the start times for
#                                        bookings by members named 'David Farrell'?

bookings_db[bookings_db['memid'] == int(members_df[(members_df['firstname']=='David') & (members_df['surname']=='Farrell')]['memid'])]['starttime']


# SQL 
# Two ways to approach this: using a subquery or by using an inner join

# SELECT bookings.starttime
# 	FROM cd.bookings
	
# 	WHERE bookings.memid = 
# 				(SELECT members.memid FROM cd.members 
# 					WHERE 
# 						members.firstname = 'David' AND 
# 						members.surname = 'Farrell');
        
        
# select bks.starttime 
# 	from 
# 		cd.bookings bks
# 		inner join cd.members mems
# 			on mems.memid = bks.memid
# 	where 
# 		mems.firstname='David' 
# 		and mems.surname='Farrell';  


# A second way to do the inner join
# 
# select bks.starttime
#         from
#                 cd.bookings bks,
#                 cd.members mems
#         where
#                 mems.firstname='David'
#                 and mems.surname='Farrell'
#                 and mems.memid = bks.memid;

In [None]:
# Joins and Subqueries  - Exercise #2:  How can you produce a list of the start times for bookings for tennis 
#                                       courts, for the date '2012-09-21'? Return a list of start time and 
#                                       facility name pairings, ordered by the time.

# # In Pandas, I can think of three approaches:
# 1) WON'T WORK -- Create the needed columns separately, then concatenate them THE COLUMNS WOULD HAVE TO BE JOINED
# 2) Use a function to map the facility name onto the booking df, then filter the bookings DF
# 3) Create an inner join, then filter against that


# # Method 2
bookings_db['name1'] = bookings_db['facid'].map( lambda x: facilities_df[facilities_df['facid'] == x]['name'].to_string(index=False))
bookings_db[ ( bookings_db['starttime'] >= datetime.date(2012, 9, 21))  &\
             ( bookings_db['starttime'] < datetime.date(2012, 9, 22) ) &\
           bookings_db['name1'].str.contains("Tennis C")][['starttime', 'name1']].sort_values('starttime')

# Method 3 

# merged = pd.merge(bookings_db, facilities_df, on='facid')
# merged[(merged['starttime'] >= datetime.date(2012,9,21)) & (merged['starttime'] < datetime.date(2012,9,22)) \
#       ][['starttime', 'name']][merged['name'].str.contains("Tennis C")].sort_values('starttime')


# SQL 
# 
# SELECT bks.starttime as start, fcs.name as name
# 	FROM 
# 		cd.bookings bks INNER JOIN cd.facilities fcs
# 		ON bks.facid = fcs.facid
# 	WHERE bks.starttime >= '2012-09-21' AND bks.starttime < '2012-09-22'
# 	AND fcs.name LIKE '%Tennis Court%'
	
#   ORDER BY bks.starttime;

In [None]:
# Joins and Subqueries  - Exercise #2:   How can you output a list of all members who have recommended
#                                        another member? Ensure that there are no duplicates in the list, 
#                                        and that results are ordered by (surname, firstname).

members_df[['surname', 'firstname']][members_df['memid'].isin(list(members_df['recommendedby'])) ].sort_values(['surname', 'firstname'])


# SQL

# SELECT DISTINCT mb2.surname, mb2.firstname
# 	FROM 
#     	cd.members mb1 INNER JOIN cd.members mb2
#         ON mb2.memid = mb1.recommendedby
#     ORDER BY mb2.surname, mb2.firstname;

In [22]:
# Joins and Subqueries  - Exercise #4: How can you output a list of all members, including the individual 
#                         who recommended them (if any)? 
#                         Ensure that results are ordered by (surname, firstname).

pd.merge(members_df, members_df, how='left', right_on='memid', left_on='recommendedby')[['firstname_x', 'surname_x', 'firstname_y', 'surname_y']]


# SQL

# SELECT mb1.firstname as memfname, mb1.surname as memsname, mb2.firstname as recfname, mb2.surname as recsname
# 	FROM 
#      	cd.members mb1 LEFT OUTER JOIN cd.members mb2
#          ON mb2.memid = mb1.recommendedby
#      ORDER BY mb1.surname, mb1.firstname;

Unnamed: 0,firstname_x,surname_x,firstname_y,surname_y
0,GUEST,GUEST,,
1,Darren,Smith,,
2,Tracy,Smith,,
3,Tim,Rownam,,
4,Janice,Joplette,Darren,Smith
5,Gerald,Butters,Darren,Smith
6,Burton,Tracy,,
7,Nancy,Dare,Janice,Joplette
8,Tim,Boothe,Tim,Rownam
9,Ponder,Stibbons,Burton,Tracy


In [25]:
members_df['combined_name'] = members_df['firstname']+' ' +members_df['surname']
members_df['combined_name']

0                 GUEST GUEST
1                Darren Smith
2                 Tracy Smith
3                  Tim Rownam
4             Janice Joplette
5              Gerald Butters
6                Burton Tracy
7                  Nancy Dare
8                  Tim Boothe
9             Ponder Stibbons
10               Charles Owen
11                David Jones
12                 Anne Baker
13             Jemima Farrell
14                 Jack Smith
15             Florence Bader
16              Timothy Baker
17               David Pinker
18            Matthew Genting
19             Anna Mackenzie
20                Joan Coplin
21           Ramnaresh Sarwin
22              Douglas Jones
23           Henrietta Rumney
24              David Farrell
25    Henry Worthington-Smyth
26          Millicent Purview
27        Hyacinth Tupperware
28                  John Hunt
29              Erica Crumpet
30               Darren Smith
Name: combined_name, dtype: object

In [93]:
merge2['starttime'][0].date()

datetime.date(2012, 7, 3)

In [38]:
# Joins and Subqueries  - Exercise #5:  How can you produce a list of all members who have used a tennis court? 
#                                       Include in your output the name of the court, and the name of the
#                                       member formatted as a single column. 
#                                       Ensure no duplicate data, and order by the member name.

merge1 = pd.merge(members_df,bookings_db, on= 'memid')
merge2 = pd.merge(merge1, facilities_df, on='facid')
np.sort(merge2[merge2['name'].str.contains('Tennis C')]['combined_name'].unique())


# SQL

# SELECT DISTINCT CONCAT(m.firstname, ' ',m.surname) as member, f.name as facility
# 	FROM 
#     	cd.members m INNER JOIN cd.bookings b
#         ON m.memid = b.memid
#         INNER JOIN cd.facilities f
#         ON f.facid = b.facid
        
#         WHERE f.name LIKE 'Tennis C%'
        
#         ORDER BY member;

array(['Anne Baker', 'Burton Tracy', 'Charles Owen', 'Darren Smith',
       'David Farrell', 'David Jones', 'David Pinker', 'Douglas Jones',
       'Erica Crumpet', 'Florence Bader', 'GUEST GUEST', 'Gerald Butters',
       'Henrietta Rumney', 'Jack Smith', 'Janice Joplette',
       'Jemima Farrell', 'Joan Coplin', 'John Hunt', 'Matthew Genting',
       'Millicent Purview', 'Nancy Dare', 'Ponder Stibbons',
       'Ramnaresh Sarwin', 'Tim Boothe', 'Tim Rownam', 'Timothy Baker',
       'Tracy Smith'], dtype=object)

In [125]:
# Joins and Subqueries  - Exercise #6: How can you produce a list of bookings on the day of 2012-09-14 
#                                      which will cost the member (or guest) 
#                                      more than $30? Remember that guests have different costs to members 
#                                      (the listed costs are per half-hour 'slot'), and the guest user
#                                      is always ID 0. Include in your output the name of the facility, 
#                                      the name of the member formatted as a single column, and the cost. 
#                                      Order by descending cost, and do not use any subqueries.


def cost(df):
    costg = df['slots']*df['guestcost']
    costm = df['slots']*df['membercost']
    if 'GUEST' in df['firstname'] and costg>30 and  (df['starttime'].date()>= datetime.date(2012,9,14) ) and ( df['starttime'].date() < datetime.date(2012,9,15) ) :
        return df['name'], df['combined_name'], costg
    elif 'GUEST' not in df['firstname'] and costm>30 and (df['starttime'].date()>= datetime.date(2012,9,14) ) and ( df['starttime'].date() < datetime.date(2012,9,15) )  :
        return df['name'], df['combined_name'], costm
    else: pass

cheap_expensive = merge2.apply(cost, axis=1).dropna()
cheap_expensive.apply(pd.Series, index=["facility", "name", "cost"]).sort_values(by='cost',ascending=False)

# lambda x:'oop' if  x['firstname'].str.contains("GUEST"), axis=1)
# merge2['firstname'].str.contains("GUEST")

# SQL 

# SELECT CONCAT(m.firstname, ' ', m.surname) as member, f.name as facility, 
# 	CASE
#     	WHEN m.memid = 0 THEN b.slots*f.guestcost
#         ELSE b.slots*f.membercost
#     END as cost
    
#     FROM cd.members m INNER JOIN cd.bookings b
#     ON m.memid = b.memid
#     INNER JOIN cd.facilities f
#     ON f.facid = b.facid
    
#     WHERE b.starttime >= '2012-09-14' AND 
#           b.starttime < '2012-09-15' AND (
#           (m.memid != 0 AND b.slots*f.membercost > 30) OR 
#           (m.memid = 0 AND b.slots*f.guestcost > 30)
#     )
#     ORDER BY cost DESC;

Unnamed: 0,facility,name,cost
2750,Massage Room 2,GUEST GUEST,320.0
594,Massage Room 1,GUEST GUEST,160.0
595,Massage Room 1,GUEST GUEST,160.0
596,Massage Room 1,GUEST GUEST,160.0
1157,Tennis Court 2,GUEST GUEST,150.0
963,Massage Room 1,Jemima Farrell,140.0
1535,Tennis Court 1,GUEST GUEST,75.0
1534,Tennis Court 1,GUEST GUEST,75.0
1156,Tennis Court 2,GUEST GUEST,75.0
1038,Massage Room 1,Matthew Genting,70.0


In [129]:
# Joins and Subqueries  - Exercise #7:    How can you output a list of all members, including the individual
#                                         who recommended them (if any), without using any joins? Ensure that 
#                                         there are no duplicates in the list, and that each firstname + surname 
#                                         pairing is formatted as a column and ordered.


result1 = pd.merge(members_df,members_df,left_on='recommendedby', right_on='memid')[['combined_name_x', 'combined_name_y']].sort_values('combined_name_x')
result1.columns = ['member', 'recommender']
result1


# SQL 

# SELECT DISTINCT CONCAT(members.firstname, ' ' , members.surname) as name,
# 		(
# 		  SELECT CONCAT(recs.firstname, ' ', recs.surname) as recommender
# 				  FROM cd.members recs
# 		  		  WHERE recs.memid = members.recommendedby)
# 		FROM cd.members
# 		ORDER BY name;

Unnamed: 0,member,recommender
4,Anna Mackenzie,Darren Smith
9,Anne Baker,Ponder Stibbons
2,Charles Owen,Darren Smith
6,David Jones,Janice Joplette
12,David Pinker,Jemima Farrell
16,Douglas Jones,David Jones
20,Erica Crumpet,Tracy Smith
10,Florence Bader,Ponder Stibbons
1,Gerald Butters,Darren Smith
17,Henrietta Rumney,Matthew Genting


In [130]:
# Joins and Subqueries  - Exercise #8: The Produce a list of costly bookings exercise contained some 
#                                      messy logic: we had to calculate the booking cost in both the 
#                                      WHERE clause and the CASE statement. Try to simplify this 
#                                      calculation using subqueries. For reference, the question was:
#  
#                                      How can you produce a list of bookings on the day of 2012-09-14 
#                                      which will cost the member (or guest) more than $30? Remember 
#                                      that guests have different costs to members (the listed costs are 
#                                      per half-hour 'slot'), and the guest user is always ID 0. Include 
#                                      in your output the name of the facility, the name of the member 
#                                      formatted as a single column, and the cost. Order by descending cost.

# for pandas, I just pasted the solution from above

def cost(df):
    costg = df['slots']*df['guestcost']
    costm = df['slots']*df['membercost']
    if 'GUEST' in df['firstname'] and costg>30 and  (df['starttime'].date()>= datetime.date(2012,9,14) ) and ( df['starttime'].date() < datetime.date(2012,9,15) ) :
        return df['name'], df['combined_name'], costg
    elif 'GUEST' not in df['firstname'] and costm>30 and (df['starttime'].date()>= datetime.date(2012,9,14) ) and ( df['starttime'].date() < datetime.date(2012,9,15) )  :
        return df['name'], df['combined_name'], costm
    else: pass

cheap_expensive = merge2.apply(cost, axis=1).dropna()
cheap_expensive.apply(pd.Series, index=["facility", "name", "cost"]).sort_values(by='cost',ascending=False)



# SQL

# SELECT member, facility, cost
# 	FROM (
#         SELECT CONCAT(m.firstname, ' ' , m.surname) as member,
#                f.name as facility,
#         	   CASE
#         			WHEN m.memid = 0 THEN f.guestcost*b.slots
#         			ELSE f.membercost*b.slots
#         	   END as cost
        
#         FROM 
#         	   cd.bookings b INNER JOIN cd.members m
#         	   ON b.memid = m.memid
#         	   INNER JOIN cd.facilities f
#         	   ON f.facid = b.facid
        
#         WHERE 
#                b.starttime >= '2012-09-14' AND
#         	   b.starttime < '2012-09-15'
#         ) as bookings
        
#        WHERE 
#         	cost > 30
        
#        ORDER BY cost DESC;

Unnamed: 0,facility,name,cost
2750,Massage Room 2,GUEST GUEST,320.0
594,Massage Room 1,GUEST GUEST,160.0
595,Massage Room 1,GUEST GUEST,160.0
596,Massage Room 1,GUEST GUEST,160.0
1157,Tennis Court 2,GUEST GUEST,150.0
963,Massage Room 1,Jemima Farrell,140.0
1535,Tennis Court 1,GUEST GUEST,75.0
1534,Tennis Court 1,GUEST GUEST,75.0
1156,Tennis Court 2,GUEST GUEST,75.0
1038,Massage Room 1,Matthew Genting,70.0


In [None]:
facilities_df['name'][facilities_df['facid'] == bookings_db['facid'][4]].to_string(index=False)

In [None]:
bookings_db['facid'][10]

In [None]:
members_df[(members_df['firstname']=='David') & (members_df['surname']=='Farrell')]['memid']

In [None]:
datetime.date(2017,9,1)

In [None]:
f = lambda x: 1 if x>0 else 0 if x ==0 else -1

In [None]:
facilities_df["cost"] = np.where(facilities_df["monthlymaintenance"] < 100, "Cheap", "Expensive")

In [None]:
members_df


In [None]:
bookings_db

In [None]:
np.where([[True, False], [True, True]],[[1, 2], [3, 4]],[[9, 8], [7, 6]])

In [None]:
facilities_df = facilities_df.drop('cost', axis=1)