In [1]:
import numpy as np
import pandas as pd
import pyodbc
import sqlalchemy
import sqlite3

%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [2]:
from subprocess import check_output

# IPL database: open the SQLite file and keep a cursor for ad-hoc statements.
# NOTE: the cursor names keep their original spelling (`exectue`).
conn = sqlite3.connect('/kaggle/input/ipldatabase/database.sqlite')
exectue = conn.cursor()

# SF salaries database: same pattern, second connection and cursor.
conn_2 = sqlite3.connect('/kaggle/input/sf-salaries/database.sqlite')
exectue_2 = conn_2.cursor()

### Viewing full database

In [3]:
# Load every row and column of the Season table into a DataFrame and show it.
SQL_season = pd.read_sql(
    con=conn,
    sql="""
    SELECT * 
    FROM Season
    """,
)
SQL_season

Unnamed: 0,Season_Id,Man_of_the_Series,Orange_Cap,Purple_Cap,Season_Year
0,1,32,100,102,2008
1,2,53,18,61,2009
2,3,133,133,131,2010
3,4,162,162,194,2011
4,5,315,162,190,2012
5,6,32,19,71,2013
6,7,305,46,364,2014
7,8,334,187,71,2015
8,9,8,8,299,2016


In [4]:
# Load the whole Match table, report its (rows, columns) shape,
# then preview the first ten rows.
SQL_match = pd.read_sql(
    con=conn,
    sql="""
    SELECT * 
    FROM Match
    """,
)
print(SQL_match.shape)
SQL_match.head(10)

(577, 13)


Unnamed: 0,Match_Id,Team_1,Team_2,Match_Date,Season_Id,Venue_Id,Toss_Winner,Toss_Decide,Win_Type,Win_Margin,Outcome_type,Match_Winner,Man_of_the_Match
0,335987,2,1,2008-04-18 00:00:00,1,1,2,1,1,140.0,1,1.0,2.0
1,335988,4,3,2008-04-19 00:00:00,1,2,3,2,1,33.0,1,3.0,19.0
2,335989,6,5,2008-04-19 00:00:00,1,3,5,2,2,9.0,1,6.0,90.0
3,335990,7,2,2008-04-20 00:00:00,1,4,7,2,2,5.0,1,2.0,11.0
4,335991,1,8,2008-04-20 00:00:00,1,5,8,2,2,5.0,1,1.0,4.0
5,335992,5,4,2008-04-21 00:00:00,1,6,4,2,2,6.0,1,5.0,32.0
6,335993,8,6,2008-04-22 00:00:00,1,7,8,2,2,9.0,1,6.0,41.0
7,335994,3,7,2008-04-23 00:00:00,1,8,7,1,1,6.0,1,3.0,18.0
8,335995,8,5,2008-04-24 00:00:00,1,7,5,1,2,3.0,1,5.0,31.0
9,335996,4,7,2008-04-25 00:00:00,1,2,7,1,1,66.0,1,4.0,26.0


In [5]:
# Load the whole Salaries table from the second database, report its
# (rows, columns) shape, then preview the first ten rows.
SQL_salaries = pd.read_sql(
    con=conn_2,
    sql="""
    SELECT * 
    FROM Salaries
    """,
)
print(SQL_salaries.shape)
SQL_salaries.head(10)

(148654, 13)


Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411,,400184.0,,567595,567595,2011,,San Francisco,
1,2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966,245132.0,137811.0,,538909,538909,2011,,San Francisco,
2,3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739,106088.0,16452.6,,335280,335280,2011,,San Francisco,
3,4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916,56120.7,198307.0,,332344,332344,2011,,San Francisco,
4,5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134402,9737.0,182235.0,,326373,326373,2011,,San Francisco,
5,6,DAVID SULLIVAN,ASSISTANT DEPUTY CHIEF II,118602,8601.0,189083.0,,316286,316286,2011,,San Francisco,
6,7,ALSON LEE,"BATTALION CHIEF, (FIRE DEPARTMENT)",92492,89062.9,134426.0,,315981,315981,2011,,San Francisco,
7,8,DAVID KUSHNER,DEPUTY DIRECTOR OF INVESTMENTS,256577,,51322.5,,307899,307899,2011,,San Francisco,
8,9,MICHAEL MORRIS,"BATTALION CHIEF, (FIRE DEPARTMENT)",176933,86362.7,40132.2,,303428,303428,2011,,San Francisco,
9,10,JOANNE HAYES-WHITE,"CHIEF OF DEPARTMENT, (FIRE DEPARTMENT)",285262,,17115.7,,302378,302378,2011,,San Francisco,


In [6]:
# Count how many rows of the currently loaded Salaries frame have Status 'PT'.
df = SQL_salaries['Status']
len(df.loc[df == 'PT'])

15785

### PRACTICES

In SQL, the **WHERE** keyword allows you to filter based on both text and numeric values in a table. There are a few different comparison operators you can use:

         = "equal"
         != or <> "not equal"
         < "less than"
         > "greater than"
         <= "less than or equal to"
         >= "greater than or equal to"

## 1. Simple filtering : on numeric & text

### 1.1. Simple filtering of numeric values

For example, the following **`query`** selects all columns of the `Season` table for the rows whose `Man_of_the_Series` value is greater than 100:

        SELECT * FROM Season WHERE Man_of_the_series > 100

In [7]:
# Numeric filter: keep only the seasons whose Man_of_the_Series value exceeds 100.
SQL_season = pd.read_sql(
    con=conn,
    sql="""
    SELECT * FROM Season
    WHERE Man_of_the_Series > 100
    """,
)
SQL_season

Unnamed: 0,Season_Id,Man_of_the_Series,Orange_Cap,Purple_Cap,Season_Year
0,3,133,133,131,2010
1,4,162,162,194,2011
2,5,315,162,190,2012
3,7,305,46,364,2014
4,8,334,187,71,2015


### 1.2. Simple filtering of text

For example, this `query` gets the two `teams` (`Team_1`, `Team_2`) of every match whose `Match_Date` is `12 April 2015`.

**Remember that we must use "ISO date_format": `yyyy-mm-dd` + `hh:mm:ss`**

In [8]:
# Text filter: the date is compared as a string, so the literal must match
# the stored 'yyyy-mm-dd hh:mm:ss' text exactly.
SQL_season = pd.read_sql(
    con=conn,
    sql="""
    SELECT Team_1, Team_2, Match_date
    FROM Match
    WHERE Match_date = '2015-04-12 00:00:00';
    """,
)
SQL_season

Unnamed: 0,Team_1,Team_2,Match_Date
0,6,5,2015-04-12 00:00:00
1,7,4,2015-04-12 00:00:00


### 1.3. Extracting a specific `year`, `month` or `day` only

For example, we want to explore `Team_1` and `Team_2` for the matches played on the 18th day of any month (`strftime('%d', Match_Date) = '18'`) in the `Match` table.

In [9]:
# strftime('%d', ...) extracts the two-digit day of month as text,
# hence the comparison against the string '18'.
SQL_season = pd.read_sql(
    con=conn,
    sql="""
    SELECT Team_1, Team_2, Match_Date
    FROM Match
    WHERE strftime('%d', Match_Date) = '18'
    """,
)
SQL_season

Unnamed: 0,Team_1,Team_2,Match_Date
0,2,1,2008-04-18 00:00:00
1,8,7,2008-05-18 00:00:00
2,1,3,2008-05-18 00:00:00
3,3,7,2009-04-18 00:00:00
4,2,5,2009-04-18 00:00:00
5,3,1,2009-05-18 00:00:00
6,2,5,2010-03-18 00:00:00
7,4,3,2010-04-18 00:00:00
8,6,8,2010-04-18 00:00:00
9,9,3,2011-04-18 00:00:00


## 2. WHERE AND & WHERE OR

### 2.1. WHERE AND

Often, you'll want to select data based on multiple conditions. You can build up your WHERE queries by combining multiple conditions with the AND keyword.

For example,



In [10]:
# All three numeric conditions must hold at once (AND).
SQL_season = pd.read_sql(
    con=conn,
    sql="""
    SELECT * FROM Season
    WHERE Man_of_the_Series > 150
    AND Purple_cap > 100
    AND Orange_cap > 50
    """,
)
SQL_season

Unnamed: 0,Season_Id,Man_of_the_Series,Orange_Cap,Purple_Cap,Season_Year
0,4,162,162,194,2011
1,5,315,162,190,2012


gives us the rows of the `Season` table that satisfy all three conditions:

                                Man_of_the_Series > 150    
                                Purple_cap > 100    
                                Orange_cap > 50
                                
Another example: using **AND** to filter on the year, the month and the `Man_of_the_Match` value in the `Match` table.

In [11]:
# Matches from May 2009 whose Man_of_the_match value exceeds 100; the date is
# re-rendered through strftime and exposed under the alias Match_Date_new_format.
SQL_season = pd.read_sql(
    con=conn,
    sql="""
    SELECT Team_1, 
           Team_2, 
           strftime('%d / %m/ %Y', Match_Date) AS Match_Date_new_format, 
           Man_of_the_match
    FROM Match
    WHERE strftime('%Y', Match_Date) = '2009'
    AND strftime('%m', Match_Date) = '05'
    AND Man_of_the_match > 100
    """,
)
SQL_season

Unnamed: 0,Team_1,Team_2,Match_Date_new_format,Man_of_the_Match
0,1,7,01 / 05/ 2009,154
1,3,6,02 / 05/ 2009,186
2,2,5,07 / 05/ 2009,196
3,2,7,10 / 05/ 2009,154
4,6,1,10 / 05/ 2009,136
5,8,5,11 / 05/ 2009,147
6,6,5,17 / 05/ 2009,110
7,3,1,18 / 05/ 2009,104
8,3,4,20 / 05/ 2009,121
9,2,8,24 / 05/ 2009,124


In this `query`, we have used the **alias keyword `AS`** in the expression `"strftime('%d / %m/ %Y', Match_Date) AS Match_Date_new_format"` to give the reformatted date column (`dd / mm/ yyyy`) the name `Match_Date_new_format`.

### 2.2.WHERE OR

In [12]:
# OR keeps a row when at least one condition holds: Team_1 equals the
# toss winner, or the Man_of_the_Match value is 300 or more.
SQL_match = pd.read_sql(
    con=conn,
    sql="""
    SELECT Team_1, Team_2, Toss_Winner, Match_date, Toss_Decide, Man_of_the_Match
    FROM Match
    WHERE Team_1 = Toss_Winner
    OR Man_of_the_Match >= 300
    """,
)
SQL_match

Unnamed: 0,Team_1,Team_2,Toss_Winner,Match_Date,Toss_Decide,Man_of_the_Match
0,2,1,2,2008-04-18 00:00:00,1,2.0
1,7,2,7,2008-04-20 00:00:00,2,11.0
2,8,6,8,2008-04-22 00:00:00,2,41.0
3,1,7,1,2008-04-29 00:00:00,2,44.0
4,5,1,5,2008-05-01 00:00:00,2,101.0
...,...,...,...,...,...,...
318,12,6,12,2016-05-17 00:00:00,1,106.0
319,13,1,13,2016-05-19 00:00:00,1,147.0
320,6,11,6,2016-05-20 00:00:00,1,339.0
321,13,7,13,2016-05-21 00:00:00,1,21.0


### PLAYING WITH WHERE AND(... OR ...)

In [13]:
# AND binds tighter than OR, so the WHERE clause is evaluated as
#   ((day is 18 or 29) AND (Man_of_the_Match >= 350 or <= 50))
#   OR (Toss_winner = 5 AND Toss_Decide = 1)
# i.e. the toss condition alone is enough for a row to be returned.
SQL_match = pd.read_sql(
    con=conn,
    sql="""
    SELECT Team_1, Team_2,
           strftime('%d / %m/ %Y', Match_Date) AS Match_Date_new_format,
           Toss_Winner, Toss_Decide, Man_of_the_Match
    FROM Match
    WHERE (strftime('%d', Match_Date) = '18' OR strftime('%d', Match_Date) = '29')
    AND(Man_of_the_Match >= 350 OR Man_of_the_Match <= 50)
    OR(Toss_winner = 5 AND Toss_Decide = 1)
    """,
)
SQL_match

Unnamed: 0,Team_1,Team_2,Match_Date_new_format,Toss_Winner,Toss_Decide,Man_of_the_Match
0,2,1,18 / 04/ 2008,2,1,2.0
1,8,5,24 / 04/ 2008,5,1,31.0
2,2,5,26 / 04/ 2008,5,1,32.0
3,1,7,29 / 04/ 2008,1,2,44.0
4,5,8,09 / 05/ 2008,5,1,31.0
5,5,6,11 / 05/ 2008,5,1,32.0
6,4,5,28 / 05/ 2008,5,1,100.0
7,1,5,20 / 05/ 2008,5,1,31.0
8,5,7,26 / 05/ 2008,5,1,102.0
9,3,5,01 / 06/ 2008,5,1,31.0


## 3. BETWEEN & WHERE IN
### 3.1. BETWEEN.

Similar to the **WHERE** clause, the **BETWEEN** clause can be combined with multiple **AND** and **OR** operators, so you can build up your queries and make them even more powerful! Note that **BETWEEN** is inclusive: `x BETWEEN a AND b` is the same as `x >= a AND x <= b`.

For example, suppose we have a table called `Salaries`. We can get the records of all employees for the years 2010 through 2014 whose `BasePay` lies between 290000 and 350000 (the data comes from the San Francisco salaries database).

In [14]:
# Two inclusive range filters combined with AND: one on BasePay, one on Year.
# Only the first ten matching rows are displayed.
SQL_salaries = pd.read_sql(
    con=conn_2,
    sql="""
    SELECT * 
    FROM Salaries
    WHERE BasePay BETWEEN 290000 AND 350000
    AND Year BETWEEN 2010 AND 2014
    """,
)
SQL_salaries.head(10)

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
0,13,EDWARD HARRINGTON,EXECUTIVE CONTRACT EMPLOYEE,294580.02,,,,294580.02,294580.02,2011,,San Francisco,
1,36161,Gregory Suhr,Chief of Police,302578.0,,18974.1,69810.2,321552.11,391362.3,2012,,San Francisco,
2,36163,Joanne Hayes-White,"Chief, Fire Department",296943.01,,17816.6,72047.9,314759.6,386807.48,2012,,San Francisco,
3,36166,Edward Reiskin,"Gen Mgr, Public Trnsp Dept",294000.17,,,82001.9,294000.17,376002.11,2012,,San Francisco,
4,72926,Gregory P Suhr,Chief of Police,319275.01,,20007.1,86533.2,339282.07,425815.28,2013,,San Francisco,
5,72927,Joanne M Hayes-White,"Chief, Fire Department",313686.01,,23236.0,85431.4,336922.01,422353.4,2013,,San Francisco,
6,72930,Robert L Shaw,"Dep Dir for Investments, Ret",315572.01,,,82849.7,315572.01,398421.67,2013,,San Francisco,
7,72932,Harlan L Kelly-Jr,Executive Contract Employee,313312.52,,,82319.5,313312.52,395632.03,2013,,San Francisco,
8,72933,John L Martin,Dept Head V,311758.96,,1098.64,82476.9,312857.6,395334.45,2013,,San Francisco,
9,72934,Edward D Reiskin,"Gen Mgr, Public Trnsp Dept",305307.89,,,80860.6,305307.89,386168.49,2013,,San Francisco,


Now, using **BETWEEN** with **AND** on the `Salaries` data to get the `Id, JobTitle, Year, EmployeeName, BasePay & TotalPay` of the **occupations** `'PHARMACY TECHNICIAN'` or `'Transit Operator'` whose **BasePay** is in [59000, 60000] while **TotalPay** is at least 80000.

In [15]:
# Range filter on BasePay, lower bound on TotalPay, and a parenthesised OR
# so that either job title is accepted together with the pay conditions.
SQL_salaries = pd.read_sql(
    con=conn_2,
    sql="""
        SELECT Id, JobTitle, Year, EmployeeName, BasePay, TotalPay
        FROM Salaries
        WHERE BasePay BETWEEN 59000 AND 60000
        AND TotalPay >= 80000
        AND (JobTitle = 'PHARMACY TECHNICIAN' OR JobTitle = 'Transit Operator')
    """,
)
SQL_salaries

Unnamed: 0,Id,JobTitle,Year,EmployeeName,BasePay,TotalPay
0,48570,Transit Operator,2012,Gregory Johns,59558.7,88046.14
1,48759,Transit Operator,2012,Santos Bellorin,59665.25,88584.79
2,49816,Transit Operator,2012,Weilin Yu,59861.42,83816.39
3,50154,Transit Operator,2012,Gretchen Alexander,59885.45,82567.73
4,50155,Transit Operator,2012,Anderson Cheng,59255.6,82965.09
5,50508,Transit Operator,2012,Michael Wood,59996.15,80352.99
6,50574,Transit Operator,2012,Arthur Johnson,59187.03,81044.65
7,50686,Transit Operator,2012,Flor De Mari Segura,59798.96,80823.62
8,50996,Transit Operator,2012,Ronald Turner,59065.7,80030.7
9,87168,Transit Operator,2013,Terry L Sampson,59971.06,88411.71


### 3.2. WHERE IN.
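As you've seen, **WHERE** is very useful for `filtering` results. However, if you want to filter based on many conditions, **WHERE** can get unwieldy with a long chain of **OR**s. The **IN** operator lets you list several accepted values in a single condition. For example: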


In [16]:
# IN replaces a chain of "JobTitle = ... OR JobTitle = ..." comparisons;
# the two pay thresholds are then applied on top with AND.
SQL_salaries = pd.read_sql(
    con=conn_2,
    sql="""
        SELECT Id, JobTitle, Year, EmployeeName, BasePay, TotalPay
        FROM Salaries
        WHERE JobTitle IN ('Chief of Police', 'Chief, Fire Department', 'ASSISTANT DEPUTY CHIEF II', 'BATTALION CHIEF, (FIRE DEPARTMENT)')
        AND BasePay > 117000
        AND TotalPay > 216000
    """,
)
SQL_salaries

Unnamed: 0,Id,JobTitle,Year,EmployeeName,BasePay,TotalPay
0,6,ASSISTANT DEPUTY CHIEF II,2011,DAVID SULLIVAN,118602.0,316285.74
1,9,"BATTALION CHIEF, (FIRE DEPARTMENT)",2011,MICHAEL MORRIS,176932.64,303427.55
2,15,"BATTALION CHIEF, (FIRE DEPARTMENT)",2011,DAVID FRANKLIN,174872.64,286347.05
3,19,"BATTALION CHIEF, (FIRE DEPARTMENT)",2011,MARTY ROSS,168692.63,276434.22
4,24,"BATTALION CHIEF, (FIRE DEPARTMENT)",2011,VICTOR WYRSCH,168692.63,270672.63
5,28,"BATTALION CHIEF, (FIRE DEPARTMENT)",2011,RAYMOND GUZMAN,168692.59,265463.46
6,35,"BATTALION CHIEF, (FIRE DEPARTMENT)",2011,JOSE VELO,177781.25,258364.22
7,40,"BATTALION CHIEF, (FIRE DEPARTMENT)",2011,BRENDAN WARD,147158.85,251685.16
8,41,"BATTALION CHIEF, (FIRE DEPARTMENT)",2011,MICHAEL THOMPSON,168692.66,250595.39
9,42,"BATTALION CHIEF, (FIRE DEPARTMENT)",2011,THOMAS ABBOTT,168692.63,250408.49


## 4.Introduction to NULL and IS NULL

In SQL, NULL represents a missing or unknown value. You can check for NULL values using the expression IS NULL. For example, to count the number of missing `overtime_pay` in the `salaries` table:

In [17]:
# Count the rows whose OvertimePay is missing.
# COUNT(column) only counts non-NULL values, so pairing it with
# "WHERE column IS NULL" always yields 0. COUNT(*) counts the filtered rows
# themselves, which is what "number of missing values" means.
SQL_salaries = pd.read_sql(
    """
        SELECT COUNT(*) AS MissingOvertimePay
        FROM Salaries
        WHERE OvertimePay IS NULL
    """, con = conn_2)
SQL_salaries

Unnamed: 0,Count(OvertimePay)
0,77321


As you can see, **IS NULL** is useful when combined with **WHERE** to figure out what data you're missing.

Sometimes, you'll want to filter out missing values so you only get results which are not **NULL**. To do this, you can use the **IS NOT NULL** operator.

For example, the following query counts the employees whose `OvertimePay` is not missing in the `Salaries` table.

In [18]:
# Count the rows whose OvertimePay is present (not NULL).
SQL_salaries = pd.read_sql(
    con=conn_2,
    sql="""
        SELECT Count(OvertimePay)
        FROM Salaries
        WHERE OvertimePay IS NOT NULL
    """,
)
SQL_salaries

Unnamed: 0,Count(OvertimePay)
0,71332


This indicates that there are 71332 observations in the `Salaries` data that do not have a missing value in the `OvertimePay` column.

In [19]:
# Count the rows whose Status is present (not NULL).
SQL_salaries = pd.read_sql(
    con=conn_2,
    sql="""
        SELECT Count(Status)
        FROM Salaries
        WHERE Status IS NOT NULL
    """,
)
SQL_salaries

Unnamed: 0,Count(Status)
0,15785


Hence, only 15785 observations in the `Salaries` database have an available `Status` (compare with the count obtained in cell In [6]).

## 5. LIKE & NOT LIKE

As you've seen, the **WHERE clause** can be used to filter **`text data`**. 

However, so far you've only been able to filter by specifying the exact text you're interested in. In the real world, often you'll want to search for a pattern rather than a `specific text string`.

In SQL, the **LIKE** operator can be used in a **WHERE clause to search for a pattern in a column**. To accomplish this, you use something called a wildcard as a placeholder for some other values. There are two wildcards you can use with LIKE:

The `% wildcard` will match `zero`, `one`, or `many characters in text`. 

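For example, the following query matches job titles like `'Nurse', 'Nurse Manager', 'Nurse Practitioner'` and so on (SQLite's `LIKE` is case-insensitive for ASCII letters by default, so `'NURSE MANAGER'` matches too), restricted to employees whose name starts with `T`: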

In [20]:
# '%' matches any run of characters (including none): job titles starting
# with "Nurse" and employee names starting with "T".
SQL_salaries = pd.read_sql(
    con=conn_2,
    sql="""
        SELECT EmployeeName, JobTitle
        FROM Salaries
        WHERE JobTitle LIKE 'Nurse%'
        AND EmployeeName LIKE 'T%'
    """,
)
SQL_salaries

Unnamed: 0,EmployeeName,JobTitle
0,TROY WILLIAMS,NURSE MANAGER
1,TWYILA LAY,NURSE PRACTITIONER
2,TAE-WOL STANLEY,NURSE PRACTITIONER
3,TERRI ELLENBERG,NURSE MANAGER
4,TERESITA BALUYUT,NURSE MANAGER
5,TINA KING,NURSE PRACTITIONER
6,TAMARA OOMS,NURSE PRACTITIONER
7,THOMAS HOLTON,NURSE MANAGER
8,TAMAR BESSON,NURSE PRACTITIONER
9,THOMAS FARLEY,NURSE PRACTITIONER


The `_ wildcard` will match a single character. 

For example, the pattern **'C_m%'** in the following query matches `JobTitle` values like `'Commander', 'Camp', 'Communication'`, while **NOT LIKE 'N%'** keeps only the `EmployeeName` values that do not start with the letter `N`.

In [21]:
# '_' stands for exactly one character: titles shaped like "C?m...".
# NOT LIKE then drops employees whose name starts with "N".
SQL_salaries = pd.read_sql(
    con=conn_2,
    sql="""
        SELECT EmployeeName, JobTitle
        FROM Salaries
        WHERE JobTitle LIKE 'C_m%'
        AND EmployeeName NOT LIKE 'N%'
    """,
)
SQL_salaries

Unnamed: 0,EmployeeName,JobTitle
0,RICHARD CORRIEA,"COMMANDER III, (POLICE DEPARTMENT)"
1,SANDRA TONG,"COMMANDER III, (POLICE DEPARTMENT)"
2,L MILITELLO,"COMMANDER III, (POLICE DEPARTMENT)"
3,MICHAEL BIEL,"COMMANDER III, (POLICE DEPARTMENT)"
4,MIKAIL ALI,"COMMANDER III, (POLICE DEPARTMENT)"
...,...,...
1717,Addison G Wright,Camp Assistant
1718,Marshall Robles,Community Police Services Aide
1719,Larry Liederman,Commissioner No Benefits
1720,Andre M Johnson,Communications Dispatcher 1
