## Pandas

In [1]:
import numpy as np
import pandas as pd
pd.__version__
pd.set_option("display.notebook_repr_html", False)

In [2]:
# Load the data
weather = pd.read_csv("./nycflights13_weather.csv.gz", comment = "#")
airlines = pd.read_csv("./nycflights13_airlines.csv.gz", comment = "#")
planes = pd.read_csv("./nycflights13_planes.csv.gz", comment = "#")
flights = pd.read_csv("./nycflights13_flights.csv.gz", comment = "#")
airports = pd.read_csv("./nycflights13_airports.csv.gz", comment = "#")

In [3]:
import sqlite3
conn = sqlite3.connect("/tmp/nycflights13.db")
airports.to_sql("airports", conn)
flights.to_sql("flights",conn)
weather.to_sql("weather",conn)
planes.to_sql("planes",conn)
airlines.to_sql("airlines",conn)

 `SELECT manufacturer, COUNT(*) FROM planes WHERE seats > 200 GROUP BY manufacturer`


In [4]:
pd.read_sql_query("""
SELECT manufacturer, COUNT(*) FROM planes WHERE seats > 200 GROUP BY manufacturer
""",conn)

       manufacturer  COUNT(*)
0            AIRBUS        66
1  AIRBUS INDUSTRIE         4
2            BOEING       225

In [60]:
planes.loc[planes.seats>200].groupby("manufacturer").size().reset_index()

       manufacturer    0
0            AIRBUS   66
1  AIRBUS INDUSTRIE    4
2            BOEING  225

 `SELECT manufacturer, COUNT(*) FROM planes GROUP BY manufacturer HAVING COUNT(*) > 10`

In [8]:
pd.read_sql_query("""
SELECT manufacturer, COUNT(*) FROM planes GROUP BY manufacturer HAVING COUNT(*) > 10
 """,conn)

                    manufacturer  COUNT(*)
0                         AIRBUS       336
1               AIRBUS INDUSTRIE       400
2                         BOEING      1630
3                 BOMBARDIER INC       368
4                        EMBRAER       299
5              MCDONNELL DOUGLAS       120
6  MCDONNELL DOUGLAS AIRCRAFT CO       103
7  MCDONNELL DOUGLAS CORPORATION        14

In [61]:
x = planes.groupby("manufacturer").size().rename("count")
x.loc[x>10].reset_index()

                    manufacturer  count
0                         AIRBUS    336
1               AIRBUS INDUSTRIE    400
2                         BOEING   1630
3                 BOMBARDIER INC    368
4                        EMBRAER    299
5              MCDONNELL DOUGLAS    120
6  MCDONNELL DOUGLAS AIRCRAFT CO    103
7  MCDONNELL DOUGLAS CORPORATION     14

`SELECT manufacturer, COUNT(*) FROM planes WHERE seats > 200 GROUP BY manufacturer HAVING COUNT(*) > 10`

In [19]:
pd.read_sql_query("""
SELECT manufacturer, COUNT(*) FROM planes WHERE seats > 200 GROUP BY manufacturer HAVING COUNT(*) > 10
""",conn)

  manufacturer  COUNT(*)
0       AIRBUS        66
1       BOEING       225

In [67]:
x = planes.loc[planes.seats>200].groupby("manufacturer").size().rename("count")
x.loc[x>10].reset_index()

  manufacturer  count
0       AIRBUS     66
1       BOEING    225

`SELECT manufacturer, COUNT(*) AS howmany FROM planes GROUP BY manufacturer ORDER BY howmany`

In [26]:
pd.read_sql_query("""
SELECT manufacturer, COUNT(*) AS howmany FROM planes GROUP BY manufacturer ORDER BY howmany
""",conn)

                     manufacturer  howmany
0                      AGUSTA SPA        1
1              AVIAT AIRCRAFT INC        1
2          AVIONS MARCEL DASSAULT        1
3                   BARKER JACK L        1
4                    CANADAIR LTD        1
5              CIRRUS DESIGN CORP        1
6                     DEHAVILLAND        1
7                         DOUGLAS        1
8                  FRIEDEMANN JON        1
9              HURLEY JAMES LARRY        1
10                    JOHN G HESS        1
11                   KILDALL GARY        1
12                LAMBERT RICHARD        1
13                    LEARJET INC        1
14                LEBLANC GLENN T        1
15                     MARZ BARRY        1
16                    PAIR MIKE E        1
17         ROBINSON HELICOPTER CO        1
18                       SIKORSKY        1
19          AMERICAN AIRCRAFT INC        2
20                          BEECH        2
21                           BELL        2
22         

In [68]:
planes.groupby("manufacturer").size().sort_values().rename("howmany").reset_index()

                     manufacturer  howmany
0                      AGUSTA SPA        1
1                         DOUGLAS        1
2                     DEHAVILLAND        1
3              CIRRUS DESIGN CORP        1
4              HURLEY JAMES LARRY        1
5                    CANADAIR LTD        1
6                     JOHN G HESS        1
7                    KILDALL GARY        1
8                        SIKORSKY        1
9                 LAMBERT RICHARD        1
10                LEBLANC GLENN T        1
11                  BARKER JACK L        1
12         AVIONS MARCEL DASSAULT        1
13             AVIAT AIRCRAFT INC        1
14                     MARZ BARRY        1
15                    PAIR MIKE E        1
16         ROBINSON HELICOPTER CO        1
17                    LEARJET INC        1
18                 FRIEDEMANN JON        1
19                   STEWART MACO        2
20                           BELL        2
21                          BEECH        2
22         

`SELECT manufacturer, COUNT(*) AS howmany FROM planes GROUP BY manufacturer ORDER BY howmany DESC LIMIT 10`

In [35]:
pd.read_sql_query("""
SELECT manufacturer, COUNT(*) AS howmany FROM planes GROUP BY manufacturer ORDER BY howmany DESC LIMIT 10
""",conn)

                    manufacturer  howmany
0                         BOEING     1630
1               AIRBUS INDUSTRIE      400
2                 BOMBARDIER INC      368
3                         AIRBUS      336
4                        EMBRAER      299
5              MCDONNELL DOUGLAS      120
6  MCDONNELL DOUGLAS AIRCRAFT CO      103
7  MCDONNELL DOUGLAS CORPORATION       14
8                       CANADAIR        9
9                         CESSNA        9

In [69]:
planes.groupby("manufacturer").size().sort_values(ascending=False).rename("howmany").reset_index().head(10)

                    manufacturer  howmany
0                         BOEING     1630
1               AIRBUS INDUSTRIE      400
2                 BOMBARDIER INC      368
3                         AIRBUS      336
4                        EMBRAER      299
5              MCDONNELL DOUGLAS      120
6  MCDONNELL DOUGLAS AIRCRAFT CO      103
7  MCDONNELL DOUGLAS CORPORATION       14
8                       CANADAIR        9
9                         CESSNA        9

`SELECT * FROM planes WHERE year >= 2012 ORDER BY year ASC, seats DESC`

In [40]:
pd.read_sql_query("""
SELECT * FROM planes WHERE year >= 2012 ORDER BY year ASC, seats DESC LIMIT 10
""",conn)

   index tailnum    year                     type manufacturer     model  \
0   1637  N555AY  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
1   1646  N556UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
2   1652  N557UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
3   1657  N558UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
4   1662  N559UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
5   1666  N560UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
6   1670  N561UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
7   1674  N562UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
8   1678  N563UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
9   1683  N564UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   

   engines  seats speed     engine  
0        2    379  None  Turbo-fan  
1        2    379  None  Turbo-fan  
2        2    379  None  Turbo-fan  
3        2    3

In [49]:
planes[planes.year >= 2012].sort_values(by=["year","seats"],ascending=[True,False],kind="mergesort").head(10)

     tailnum    year                     type manufacturer     model  engines  \
1637  N555AY  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1646  N556UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1652  N557UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1657  N558UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1662  N559UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1666  N560UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1670  N561UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1674  N562UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1678  N563UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1683  N564UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   

      seats  speed     engine  
1637    379    NaN  Turbo-fan  
1646    379    NaN  Turbo-fan  
1652    379 

`SELECT * FROM planes WHERE year >= 2012 ORDER BY seats DESC, year ASC`

In [50]:
pd.read_sql_query("""
SELECT * FROM planes WHERE year >= 2012 ORDER BY seats DESC, year ASC LIMIT 10
""",conn)

   index tailnum    year                     type manufacturer     model  \
0   1637  N555AY  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
1   1646  N556UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
2   1652  N557UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
3   1657  N558UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
4   1662  N559UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
5   1666  N560UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
6   1670  N561UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
7   1674  N562UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
8   1678  N563UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   
9   1683  N564UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231   

   engines  seats speed     engine  
0        2    379  None  Turbo-fan  
1        2    379  None  Turbo-fan  
2        2    379  None  Turbo-fan  
3        2    3

In [51]:
planes[planes.year >= 2012].sort_values(by=["seats","year"],ascending=[False,True],kind="mergesort").head(10)

     tailnum    year                     type manufacturer     model  engines  \
1637  N555AY  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1646  N556UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1652  N557UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1657  N558UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1662  N559UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1666  N560UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1670  N561UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1674  N562UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1678  N563UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   
1683  N564UW  2012.0  Fixed wing multi engine       AIRBUS  A321-231        2   

      seats  speed     engine  
1637    379    NaN  Turbo-fan  
1646    379    NaN  Turbo-fan  
1652    379 

`SELECT * FROM flights LEFT JOIN planes on flights.tailnum=planes.tailnum`

In [121]:
pd.merge(flights,planes,how="left",on="tailnum").head()

   year_x  month  day  dep_time  sched_dep_time  dep_delay  arr_time  \
0    2013      1    1     517.0             515        2.0     830.0   
1    2013      1    1     533.0             529        4.0     850.0   
2    2013      1    1     542.0             540        2.0     923.0   
3    2013      1    1     544.0             545       -1.0    1004.0   
4    2013      1    1     554.0             600       -6.0     812.0   

   sched_arr_time  arr_delay carrier    ...      minute            time_hour  \
0             819       11.0      UA    ...          15  2013-01-01 05:00:00   
1             830       20.0      UA    ...          29  2013-01-01 05:00:00   
2             850       33.0      AA    ...          40  2013-01-01 05:00:00   
3            1022      -18.0      B6    ...          45  2013-01-01 05:00:00   
4             837      -25.0      DL    ...           0  2013-01-01 06:00:00   

   year_y                     type  manufacturer     model  engines  seats  \
0  1999.

`SELECT planes.*, airlines.* FROM 
   (SELECT DISTINCT carrier, tailnum FROM flights) AS cartail 
JOIN planes ON cartail.tailnum=planes.tailnum 
JOIN airlines ON cartail.carrier=airlines.carrier`

   

In [88]:
cartail = flights.loc[:,["carrier","tailnum"]].drop_duplicates()
pd.merge(pd.merge(planes,cartail,on="tailnum"),airlines,on="carrier").head()

  tailnum    year                     type manufacturer      model  engines  \
0  N10156  2004.0  Fixed wing multi engine      EMBRAER  EMB-145XR        2   
1  N10575  2002.0  Fixed wing multi engine      EMBRAER  EMB-145LR        2   
2  N11106  2002.0  Fixed wing multi engine      EMBRAER  EMB-145XR        2   
3  N11107  2002.0  Fixed wing multi engine      EMBRAER  EMB-145XR        2   
4  N11109  2002.0  Fixed wing multi engine      EMBRAER  EMB-145XR        2   

   seats  speed     engine carrier                      name  
0     55    NaN  Turbo-fan      EV  ExpressJet Airlines Inc.  
1     55    NaN  Turbo-fan      EV  ExpressJet Airlines Inc.  
2     55    NaN  Turbo-fan      EV  ExpressJet Airlines Inc.  
3     55    NaN  Turbo-fan      EV  ExpressJet Airlines Inc.  
4     55    NaN  Turbo-fan      EV  ExpressJet Airlines Inc.  

`SELECT flights2.*, weather2.atemp, weather2.ahumid, weather2.apressure FROM 
       (SELECT * FROM flights WHERE origin='EWR') AS flights2 
    LEFT JOIN 
       (SELECT year, month, day, AVG(temp) AS atemp, 
         AVG(humid) AS ahumid, AVG(pressure) AS apressure 
        FROM weather WHERE origin='EWR' GROUP BY year, month, day) AS weather2 
    ON flights2.year=weather2.year 
      AND flights2.month=weather2.month 
      AND flights2.day=weather2.day`

In [135]:
flights2 = flights.loc[flights.origin=="EWR"]
weather2 = weather.loc[weather.origin=="EWR",
                      ["year","month","day","temp","humid","pressure"]].groupby(["year","month","day"]).agg([np.mean])
weather2.columns=["atemp","ahumid","apressure"]
weather2 = weather2.reset_index()
pd.merge(flights2,weather2,how="left",on=["year","month","day"]).head(2)

   year  month  day  dep_time  sched_dep_time  dep_delay  arr_time  \
0  2013      1    1     517.0             515        2.0     830.0   
1  2013      1    1     554.0             558       -4.0     740.0   

   sched_arr_time  arr_delay carrier     ...       origin dest air_time  \
0             819       11.0      UA     ...          EWR  IAH    227.0   
1             728       12.0      UA     ...          EWR  ORD    150.0   

  distance  hour  minute            time_hour  atemp     ahumid    apressure  
0     1400     5      15  2013-01-01 05:00:00  38.48  58.386087  1012.442857  
1      719     5      58  2013-01-01 05:00:00  38.48  58.386087  1012.442857  

[2 rows x 22 columns]

### Stack (melt) / Unstack (cast)

In [137]:
import seaborn as sns
flights = sns.load_dataset("flights")
flights.head()

   year     month  passengers
0  1949   January         112
1  1949  February         118
2  1949     March         132
3  1949     April         129
4  1949       May         121

In [146]:
cast = flights.set_index(["year","month"]).unstack()
cast.head()

      passengers                                                               \
month    January February March April  May June July August September October   
year                                                                            
1949         112      118   132   129  121  135  148    148       136     119   
1950         115      126   141   135  125  149  170    170       158     133   
1951         145      150   178   163  172  178  199    199       184     162   
1952         171      180   193   181  183  218  230    242       209     191   
1953         196      196   236   235  229  243  264    272       237     211   

                         
month November December  
year                     
1949       104      118  
1950       114      140  
1951       146      166  
1952       172      194  
1953       180      201  

In [148]:
melt = cast.stack().reset_index()
melt.head()

   year     month  passengers
0  1949   January         112
1  1949  February         118
2  1949     March         132
3  1949     April         129
4  1949       May         121

## Webscraping

In [8]:
page = "https://en.wikipedia.org/wiki/Berlin"
h = pd.read_html(page)
type(h)

list

In [9]:
h[2]

                                                   0             1   \
0   Climate data for Berlin- Tempelhof (1971–2000)...           NaN   
1                                               Month           Jan   
2                                 Record high °C (°F)   15.5 (59.9)   
3                                Average high °C (°F)    3.3 (37.9)   
4                                  Daily mean °C (°F)    0.6 (33.1)   
5                                 Average low °C (°F)   −1.9 (28.6)   
6                                  Record low °C (°F)  −23.1 (−9.6)   
7                        Average rainfall mm (inches)  42.3 (1.665)   
8                       Average rainy days (≥ 1.0 mm)          10.0   
9                         Mean monthly sunshine hours          46.5   
10  Source: World Meteorological Organization (UN)...           NaN   

               2             3             4             5             6   \
0             NaN           NaN           NaN           NaN           

In [11]:
import requests
import lxml.html, cssselect, html5lib

In [15]:
r = requests.get("https://en.wikipedia.org/wiki/Berlin")
src = r.text
src[:100]

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title'

In [17]:
doc = lxml.html.fromstring(src)
doc

<Element html at 0x7fdd5bc0e228>

In [18]:
doc.getchildren()

[<Element head at 0x7fdd5bc71638>, <Element body at 0x7fdd5bc0e048>]

In [22]:
e = doc.cssselect("div.toc")[0]
e

<Element div at 0x7fdd5bc0e908>

In [36]:
print(e.text_content())



Contents


1 History

1.1 Etymology
1.2 12th to 16th centuries
1.3 17th to 19th centuries
1.4 20th to 21st centuries


2 Geography

2.1 Topography
2.2 Climate
2.3 Cityscape
2.4 Architecture


3 Demographics

3.1 International communities
3.2 Languages
3.3 Religion


4 Government

4.1 City state
4.2 Boroughs
4.3 Twin towns – sister cities
4.4 Capital city


5 Economy

5.1 Companies
5.2 Tourism and conventions
5.3 Creative industries
5.4 Media


6 Infrastructure

6.1 Transport
6.2 Energy
6.3 Health
6.4 Telecommunication


7 Education

7.1 Higher education
7.2 Research


8 Culture

8.1 Galleries and museums
8.2 Nightlife and festivals
8.3 Performing arts
8.4 Cuisine
8.5 Recreation
8.6 Sports


9 See also
10 Notes
11 References
12 External links




TypeError: 'NoneType' object is not subscriptable

In [34]:
[f.text_content() for f in e.cssselect("li li")][:5] # li children of another li

['1.1 Etymology',
 '1.2 12th to 16th centuries',
 '1.3 17th to 19th centuries',
 '1.4 20th to 21st centuries',
 '2.1 Topography']

In [35]:
# <a href="link">label</a>
[(a.text_content(),a.attrib.get("href"))
for a in doc.cssselect("a")][:5]

[('', None),
 ('navigation', '#mw-head'),
 ('search', '#p-search'),
 ('Berlin (disambiguation)', '/wiki/Berlin_(disambiguation)'),
 ('State of Germany', '/wiki/States_of_Germany')]

## String operations

In [41]:
x = pd.Series(["ni","a knight",None,"123"])
x

0          ni
1    a knight
2        None
3         123
dtype: object

In [44]:
x.str.isalpha() # check if a string is made of alphabetic chars only
# isdigit ...

0     True
1    False
2     None
3    False
dtype: object

In [45]:
x.str.upper()

0          NI
1    A KNIGHT
2        None
3         123
dtype: object

In [46]:
x.str.len()

0    2.0
1    8.0
2    NaN
3    3.0
dtype: float64

In [47]:
x.str.repeat(2)
# equivalent to x+x

0                nini
1    a knighta knight
2                None
3              123123
dtype: object

In [48]:
x.str.cat(sep=", ") # concatenation

'ni, a knight, 123'

In [49]:
x.str.slice(0,2) # substrings

0      ni
1      a 
2    None
3      12
dtype: object

In [50]:
x.str.slice_replace(-1, repl="XYZ")

0          nXYZ
1    a knighXYZ
2          None
3         12XYZ
dtype: object

In [None]:
# split, findall, count, contains, extract, extractall, replace

## Regular expressions
split, findall, count, contains, extract, extractall, replace

In [52]:
import re
berlin = pd.read_html("https://en.wikipedia.org/wiki/Berlin")[2]

In [55]:
pd.Series(["Ni!", "Ni, ni!", "Ni-whom"]).str.findall("Ni",re.IGNORECASE)

0        [Ni]
1    [Ni, ni]
2        [Ni]
dtype: object

In [58]:
pd.Series(["na na nu ni ne na ni"]).str.findall("n[ai]")

0    [na, na, ni, na, ni]
dtype: object

In [66]:
# \d = ASCII digits
# \D = negation of \d = [^\d]
# \w = word characters (mostly letters and digits)
# \s = spaces
# \W, \S = negation of the above
# [a-z]
pd.Series(["na na nu ni ne na ni"]).str.findall("\w\w")

0    [na, na, nu, ni, ne, na, ni]
dtype: object

In [67]:
# . matches everyhing except \n
pd.Series(["na na nu ni ne na ni"]).str.findall(r"..")

0    [na,  n, a , nu,  n, i , ne,  n, a , ni]
dtype: object

In [69]:
# | is the alternative sign
pd.Series(["na na nu ni ne na ni", "na na na"]).str.findall(r"na na|nu")

0    [na na, nu]
1        [na na]
dtype: object

In [73]:
# quantifiers
# * repeat 0 or more times
# + repeat 1 or more times
# ? either 0 or 1
# {n,m} repeat from m to n times
pd.Series("ni! ninini! nnnniiiiii! n n nu! ni!").str.findall(r"ni+")

0    [ni, ni, ni, ni, niiiiii, ni]
dtype: object

In [153]:
pd.Series("ni! ninini! nnnniiiiii! n n nu! ni!").str.findall(r"(ni+)")

0    [ni, ni, ni, ni, niiiiii, ni]
dtype: object

In [None]:
# \b matches at a word boundary, i.e. where \W preceeds \w or \w preceeds \W

### Exercise
Extract celsius temperature from berlin df

In [193]:
x = berlin.iloc[ [2,6] ]
x.columns = berlin.iloc[1]
x = x.set_index("Month")
x = x.T
x.index.name = None
x

Month Record high °C (°F) Record low °C (°F)
Jan           15.5 (59.9)       −23.1 (−9.6)
Feb           18.7 (65.7)      −26.0 (−14.8)
Mar           24.8 (76.6)        −16.5 (2.3)
Apr           31.3 (88.3)        −8.1 (17.4)
May           35.5 (95.9)        −4.0 (24.8)
Jun           35.9 (96.6)         1.5 (34.7)
Jul          38.1 (100.6)         5.4 (41.7)
Aug          38.0 (100.4)         3.5 (38.3)
Sep           34.2 (93.6)        −1.5 (29.3)
Oct           28.1 (82.6)        −9.6 (14.7)
Nov           20.5 (68.9)        −16.0 (3.2)
Dec           16.0 (60.8)       −20.5 (−4.9)
Year         38.1 (100.6)      −26.0 (−14.8)

In [194]:
for i in range(2):
    x.iloc[:,i] = x.iloc[:,i].str.replace("−","-")
    x.iloc[:,i] = x.iloc[:,i].str.findall(r"-?\d+\.\d+").str.get(0).astype("float")
x

Month  Record high °C (°F)  Record low °C (°F)
Jan                   15.5               -23.1
Feb                   18.7               -26.0
Mar                   24.8               -16.5
Apr                   31.3                -8.1
May                   35.5                -4.0
Jun                   35.9                 1.5
Jul                   38.1                 5.4
Aug                   38.0                 3.5
Sep                   34.2                -1.5
Oct                   28.1                -9.6
Nov                   20.5               -16.0
Dec                   16.0               -20.5
Year                  38.1               -26.0

## Cython

www.cython.org

Automated way to translate python code to C or C++ and usualy compile it to produce .dll and .so, which helps with optimization

In [3]:
%load_ext Cython

In [4]:
def pi1(n):
    s = 1.0
    a = 1.0
    b = 1.0
    for i in range(1, n+1):
        a = -a
        b += 2.0
        s += a / b
    return 4.0*s
pi1(10**7)

3.1415927535897814

In [5]:
import numpy as np
def pi2(n):
    s = 1.0/(np.arange(0,n+1)*2.0 + 1.0)
    s[1::2] = -s[1::2]
    return 4.0*np.sum(s)
pi2(10**7)

3.1415927535897881

In [6]:
%timeit pi1(10**7)
%timeit pi2(10**7)

1 loop, best of 3: 916 ms per loop
10 loops, best of 3: 124 ms per loop


As expected numpy is much faster than standard python

In [7]:
%%cython -a
def pi3(n):
    s = 1.0
    a = 1.0
    b = 1.0
    for i in range(1, n+1):
        a = -a
        b += 2.0
        s += a / b
    return 4.0*s

In [8]:
pi3(10**7)

3.1415927535897814

In [9]:
%timeit pi3(10**7)

1 loop, best of 3: 166 ms per loop


We already manage to improve performance of pi1 by only using cython, but still slower than numpy

In [10]:
%%cython -a
cpdef pi4(int n):
    cdef double s = 1.0
    cdef double a = 1.0
    cdef double b = 1.0
    cdef int i
    for i in range(1, n+1):
        a = -a
        b += 2.0
        s += a / b
    return 4.0*s

In [11]:
pi4(10**7)

3.1415927535897814

In [12]:
%timeit pi4(10**7)

100 loops, best of 3: 15 ms per loop


Now we beat numpy

In [221]:
%timeit pi1(10**7)
%timeit pi2(10**7)
%timeit pi3(10**7)
%timeit pi4(10**7)

1 loop, best of 3: 1.01 s per loop
10 loops, best of 3: 91.4 ms per loop
10 loops, best of 3: 168 ms per loop
100 loops, best of 3: 14.1 ms per loop


### Cython exercise

In [13]:
np.random.seed(123)
x = np.random.rand(10000)

def quadratic_mean1(x):
    return np.sqrt(np.mean(x**2))

In [17]:
%%cython -a
import numpy as np
cimport numpy as np

cpdef double quadratic_mean2(np.ndarray[np.double_t] x):
    cdef unsigned int i, n = len(x)
    cdef double sum_ = 0
    for i in range(n):
        sum_ += x[i] * x[i]
    return  (sum_/n) ** 0.5

In [18]:
%timeit quadratic_mean1(x)
%timeit quadratic_mean2(x)

The slowest run took 37.65 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 19.2 µs per loop
100000 loops, best of 3: 14.6 µs per loop


Defining ndarray with cython:

In [None]:
# np.ndarray[np.double_t, ndim=2] A
# A.shape[0]
# A.shape[1]
# A[i,j]