In [8]:
import os
import pandas as pd
import numpy as np
import numexpr

# 1) High Performance Pandas : eval() and query()

## eval()

In [2]:
rng = np.random.RandomState(42)
x = rng.rand(1000000)
y = rng.rand(1000000)
%timeit x + y

4.15 ms ± 350 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [3]:
%timeit np.fromiter((xi + yi for xi, yi in zip(x, y)), dtype=x.dtype, count=len(x))

522 ms ± 45.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
df1 = pd.DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c'])
df2 = pd.DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c'])

In [5]:
df1 < df2

Unnamed: 0,a,b,c
0,True,False,False
1,True,True,True
2,True,True,True
3,False,False,True
4,True,False,False
5,False,False,False
6,False,True,True
7,False,False,False
8,False,False,True
9,True,False,False


In [6]:
%timeit pd.eval('df1 < df2')

1.9 ms ± 229 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
%timeit df1 < df2

228 µs ± 33.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [8]:
df1.eval('b > 0')

0    False
1     True
2    False
3     True
4     True
5    False
6    False
7     True
8     True
9     True
dtype: bool

In [9]:
#same as
df1.b > 0

0    False
1     True
2    False
3     True
4     True
5    False
6    False
7     True
8     True
9     True
Name: b, dtype: bool

In [10]:
nrows, ncols = 3000, 10
df1, df2, df3, df4 = [pd.DataFrame(np.random.randn(nrows, ncols)) for _ in range(4)]

In [11]:
%timeit df1 + df2 + df3 + df4

1.73 ms ± 273 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [12]:
%timeit pd.eval('df1 + df2 + df3 + df4')

5.81 ms ± 1.47 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
3.29/0.395

8.329113924050633

In [14]:
%timeit (df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)

2.89 ms ± 146 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
%timeit pd.eval('(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')

5.41 ms ± 264 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
s = pd.Series(np.random.randn(50))

In [17]:
%timeit df1 + df2 + df3 + df4 + s

4.78 ms ± 882 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
%timeit pd.eval('df1 + df2 + df3 + df4 + s')

12.6 ms ± 580 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [19]:
nrows, ncols = 30, 10
df1, df2, df3, df4 = [pd.DataFrame(np.random.randn(nrows, ncols)) for _ in range(4)]

In [20]:
%timeit (df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)

937 µs ± 114 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [21]:
%timeit pd.eval('(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')

4.25 ms ± 335 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [22]:
%timeit df1 + df2 + df3 + df4

425 µs ± 33.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [23]:
%timeit pd.eval('df1 + df2 + df3 + df4')

3.35 ms ± 207 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## query()

In [24]:
df1 = pd.DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c'])
df1

Unnamed: 0,a,b,c
0,-0.824293,-1.367652,-0.903301
1,0.723733,1.794655,-1.158653
2,-0.061726,-1.812006,-1.299514
3,-0.282544,0.942839,1.01332
4,0.445224,-0.326441,-1.530032
5,-0.943526,-0.078837,-0.988815
6,1.090133,-2.695771,1.179466
7,0.633362,-0.08169,1.834691
8,-0.217118,-0.098014,0.139371
9,-0.831414,-0.075935,1.785776


In [25]:
df1.eval('b > 0')

0    False
1     True
2    False
3     True
4    False
5    False
6    False
7    False
8    False
9    False
dtype: bool

In [26]:
df1[df1.eval('b > 0')]

Unnamed: 0,a,b,c
1,0.723733,1.794655,-1.158653
3,-0.282544,0.942839,1.01332


In [27]:
#df1 = pd.DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c'])
df1.query('b > 0')

Unnamed: 0,a,b,c
1,0.723733,1.794655,-1.158653
3,-0.282544,0.942839,1.01332


In [28]:
df1[df1.b > 0]

Unnamed: 0,a,b,c
1,0.723733,1.794655,-1.158653
3,-0.282544,0.942839,1.01332


In [29]:
df1.query('a < b')

Unnamed: 0,a,b,c
1,0.723733,1.794655,-1.158653
3,-0.282544,0.942839,1.01332
5,-0.943526,-0.078837,-0.988815
8,-0.217118,-0.098014,0.139371
9,-0.831414,-0.075935,1.785776


# 2) lookup() and get()

In [30]:
seasons = pd.DataFrame(np.random.random((6,4)), columns=['winter','spring','summer','autumn'])
seasons

Unnamed: 0,winter,spring,summer,autumn
0,0.674768,0.076788,0.938692,0.466123
1,0.933337,0.591809,0.148437,0.32416
2,0.986469,0.428797,0.021714,0.384719
3,0.373602,0.211572,0.937287,0.392431
4,0.68903,0.979996,0.385363,0.922984
5,0.349691,0.624754,0.464635,0.642961


In [31]:
seasons.melt(value_vars=['winter', 'spring', 'summer', 'autumn'])

Unnamed: 0,variable,value
0,winter,0.674768
1,winter,0.933337
2,winter,0.986469
3,winter,0.373602
4,winter,0.68903
5,winter,0.349691
6,spring,0.076788
7,spring,0.591809
8,spring,0.428797
9,spring,0.211572


In [32]:
lookup = ['summer','winter','spring','summer','autumn','winter']
seasons.lookup(seasons.index, lookup)

array([0.93869236, 0.93333747, 0.42879681, 0.93728747, 0.922984  ,
       0.34969106])

In [33]:
seasons.lookup([1, 4], ['spring', 'winter'])

array([0.59180906, 0.68903012])

In [34]:
seasons.lookup([1, 1], ['spring', 'summer'])

array([0.59180906, 0.14843728])

In [35]:
# dict.get(key, default = None)

In [36]:
seasons.get(seasons.spring > 0.5) == seasons[seasons.spring > 0.5]

Unnamed: 0,winter,spring,summer,autumn
1,True,True,True,True
4,True,True,True,True
5,True,True,True,True


# 3) RangeIndex, .index, set_index(), reindex()

In [37]:
df = pd.read_csv("./your-code/Admisssion_Predict.csv", sep=',')
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,316,104,3,3.0,3.5,8.0,1,0.72
2,3,322,110,3,3.5,2.5,8.67,1,0.8
3,4,314,103,2,2.0,3.0,8.21,0,0.65
4,5,330,115,5,4.5,3.0,9.34,1,0.9


In [38]:
df.index

RangeIndex(start=0, stop=385, step=1)

In [39]:
from pandas import RangeIndex
df.index = RangeIndex(start=0, stop=385 * 2, step=2)
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
2,2,316,104,3,3.0,3.5,8.0,1,0.72
4,3,322,110,3,3.5,2.5,8.67,1,0.8
6,4,314,103,2,2.0,3.0,8.21,0,0.65
8,5,330,115,5,4.5,3.0,9.34,1,0.9


In [40]:
df.index = df['Serial No.']
df.head()

Unnamed: 0_level_0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,337,118,4,4.5,4.5,9.65,1,0.92
2,2,316,104,3,3.0,3.5,8.0,1,0.72
3,3,322,110,3,3.5,2.5,8.67,1,0.8
4,4,314,103,2,2.0,3.0,8.21,0,0.65
5,5,330,115,5,4.5,3.0,9.34,1,0.9


In [41]:
df.drop('Serial No.', axis=1)

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,316,104,3,3.0,3.5,8.00,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.80
4,314,103,2,2.0,3.0,8.21,0,0.65
5,330,115,5,4.5,3.0,9.34,1,0.90
...,...,...,...,...,...,...,...,...
381,324,110,3,3.5,3.5,9.04,1,0.82
382,325,107,3,3.0,3.5,9.11,1,0.84
383,330,116,4,5.0,4.5,9.45,1,0.91
384,312,103,3,3.5,4.0,8.78,0,0.67


In [42]:
df.set_index('Research', inplace=True)
df

Unnamed: 0_level_0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Chance of Admit
Research,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,337,118,4,4.5,4.5,9.65,0.92
1,2,316,104,3,3.0,3.5,8.00,0.72
1,3,322,110,3,3.5,2.5,8.67,0.80
0,4,314,103,2,2.0,3.0,8.21,0.65
1,5,330,115,5,4.5,3.0,9.34,0.90
...,...,...,...,...,...,...,...,...
1,381,324,110,3,3.5,3.5,9.04,0.82
1,382,325,107,3,3.0,3.5,9.11,0.84
1,383,330,116,4,5.0,4.5,9.45,0.91
0,384,312,103,3,3.5,4.0,8.78,0.67


In [43]:
(df['GRE Score'] + df['TOEFL Score']).value_counts()

416    14
417    13
432    12
424    11
430    11
       ..
445     1
449     1
398     1
441     1
387     1
Length: 71, dtype: int64

In [44]:
df.drop_duplicates(subset=['GRE Score', 'TOEFL Score'], inplace=True)
df

Unnamed: 0_level_0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Chance of Admit
Research,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,337,118,4,4.5,4.5,9.65,0.92
1,2,316,104,3,3.0,3.5,8.00,0.72
1,3,322,110,3,3.5,2.5,8.67,0.80
0,4,314,103,2,2.0,3.0,8.21,0.65
1,5,330,115,5,4.5,3.0,9.34,0.90
...,...,...,...,...,...,...,...,...
1,375,320,108,3,3.5,4.0,8.44,0.76
0,376,314,102,2,2.0,2.5,8.24,0.64
1,382,325,107,3,3.0,3.5,9.11,0.84
0,384,312,103,3,3.5,4.0,8.78,0.67


In [45]:
(df['GRE Score'] + df['TOEFL Score']).value_counts()

417    11
416     9
409     7
431     7
436     7
       ..
445     1
447     1
449     1
456     1
387     1
Length: 71, dtype: int64

In [46]:
df.reset_index(inplace=True)
df

Unnamed: 0,Research,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Chance of Admit
0,1,1,337,118,4,4.5,4.5,9.65,0.92
1,1,2,316,104,3,3.0,3.5,8.00,0.72
2,1,3,322,110,3,3.5,2.5,8.67,0.80
3,0,4,314,103,2,2.0,3.0,8.21,0.65
4,1,5,330,115,5,4.5,3.0,9.34,0.90
...,...,...,...,...,...,...,...,...,...
267,1,375,320,108,3,3.5,4.0,8.44,0.76
268,0,376,314,102,2,2.0,2.5,8.24,0.64
269,1,382,325,107,3,3.0,3.5,9.11,0.84
270,0,384,312,103,3,3.5,4.0,8.78,0.67


In [47]:
new_index = [12321] + list(df.index)
df_reindexed = df.reindex(new_index)
df_reindexed.head()

Unnamed: 0,Research,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Chance of Admit
12321,,,,,,,,,
0,1.0,1.0,337.0,118.0,4.0,4.5,4.5,9.65,0.92
1,1.0,2.0,316.0,104.0,3.0,3.0,3.5,8.0,0.72
2,1.0,3.0,322.0,110.0,3.0,3.5,2.5,8.67,0.8
3,0.0,4.0,314.0,103.0,2.0,2.0,3.0,8.21,0.65


In [48]:
df.set_index(['GRE Score', 'TOEFL Score'], inplace=True)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Research,Serial No.,University Rating,SOP,LOR,CGPA,Chance of Admit
GRE Score,TOEFL Score,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
337,118,1,1,4,4.5,4.5,9.65,0.92
316,104,1,2,3,3.0,3.5,8.0,0.72
322,110,1,3,3,3.5,2.5,8.67,0.8
314,103,0,4,2,2.0,3.0,8.21,0.65
330,115,1,5,5,4.5,3.0,9.34,0.9


In [49]:
df = pd.DataFrame(np.random.randn(3, 4), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,-0.425638,-0.43391,-1.615923,0.455917
1,-1.741086,0.387483,-0.473379,2.442962
2,-0.244601,0.205711,2.035249,0.072563


In [50]:
df.set_index('A', inplace=True)
df

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.425638,-0.43391,-1.615923,0.455917
-1.741086,0.387483,-0.473379,2.442962
-0.244601,0.205711,2.035249,0.072563


In [51]:
new_index = list(df.index) + [0.5]

In [52]:
df = df.reindex(new_index)
df

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.425638,-0.43391,-1.615923,0.455917
-1.741086,0.387483,-0.473379,2.442962
-0.244601,0.205711,2.035249,0.072563
0.5,,,


# 4) Method Chaining

In [53]:
ks = pd.read_csv('../../../_LabDATA/ks-projects-201801.csv')
ks.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [54]:
ks.shape

(378661, 15)

In [55]:
ks.country.value_counts()

US      292627
GB       33672
CA       14756
AU        7839
DE        4171
N,0"      3797
FR        2939
IT        2878
NL        2868
ES        2276
SE        1757
MX        1752
NZ        1447
DK        1113
IE         811
CH         761
NO         708
HK         618
BE         617
AT         597
SG         555
LU          62
JP          40
Name: country, dtype: int64

In [56]:
ks.isna().sum()

ID                     0
name                   4
category               0
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3797
usd_pledged_real       0
usd_goal_real          0
dtype: int64

In [57]:
ks.dropna(inplace=True)
ks.shape

(374860, 15)

In [58]:
ks.isna().sum()

ID                  0
name                0
category            0
main_category       0
currency            0
deadline            0
goal                0
launched            0
pledged             0
state               0
backers             0
country             0
usd pledged         0
usd_pledged_real    0
usd_goal_real       0
dtype: int64

In [59]:
ks.drop(columns=['usd pledged'], inplace=True)
ks.replace('N,0"', 'Unknown', inplace=True)

In [60]:
ks = pd.read_csv('../../../_LabDATA/ks-projects-201801.csv')
ks = (ks.drop(columns=['usd pledged'])
   .replace('N,0"', 'Unknown')
   .head())

In [61]:
ks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                5 non-null      int64  
 1   name              5 non-null      object 
 2   category          5 non-null      object 
 3   main_category     5 non-null      object 
 4   currency          5 non-null      object 
 5   deadline          5 non-null      object 
 6   goal              5 non-null      float64
 7   launched          5 non-null      object 
 8   pledged           5 non-null      float64
 9   state             5 non-null      object 
 10  backers           5 non-null      int64  
 11  country           5 non-null      object 
 12  usd_pledged_real  5 non-null      float64
 13  usd_goal_real     5 non-null      float64
dtypes: float64(4), int64(2), object(8)
memory usage: 688.0+ bytes


In [62]:
ks.assign(dollar_per_backer = ks.usd_pledged_real / ks.backers,
          duration = pd.to_datetime(ks.deadline) - pd.to_datetime(ks.launched))
ks.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,19500.0


In [1]:
def column_handling(df):
    df.columns = (df.columns.str.replace(" ", "_").
                                         str.replace("[\W]", "").
                                         str.lower())
    return df

def adjust_gre(df):
    df.query('university_rating > 4').gre_score += 10
    return df.assign(adjusted_gre=pd.cut(df.gre_score, 4, labels=['bad', 'medium', 'good', 'excellent']))

In [11]:
admissions = pd.read_csv('./your-code/Admisssion_Predict.csv')
admissions

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,316,104,3,3.0,3.5,8.00,1,0.72
2,3,322,110,3,3.5,2.5,8.67,1,0.80
3,4,314,103,2,2.0,3.0,8.21,0,0.65
4,5,330,115,5,4.5,3.0,9.34,1,0.90
...,...,...,...,...,...,...,...,...,...
380,381,324,110,3,3.5,3.5,9.04,1,0.82
381,382,325,107,3,3.0,3.5,9.11,1,0.84
382,383,330,116,4,5.0,4.5,9.45,1,0.91
383,384,312,103,3,3.5,4.0,8.78,0,0.67


In [13]:
column_handling(admissions).pipe(adjust_gre)
print(admissions)

     serial_no  gre_score  toefl_score  university_rating  sop  lor_  cgpa  \
0            1        337          118                  4  4.5   4.5  9.65   
1            2        316          104                  3  3.0   3.5  8.00   
2            3        322          110                  3  3.5   2.5  8.67   
3            4        314          103                  2  2.0   3.0  8.21   
4            5        330          115                  5  4.5   3.0  9.34   
..         ...        ...          ...                ...  ...   ...   ...   
380        381        324          110                  3  3.5   3.5  9.04   
381        382        325          107                  3  3.0   3.5  9.11   
382        383        330          116                  4  5.0   4.5  9.45   
383        384        312          103                  3  3.5   4.0  8.78   
384        385        333          117                  4  5.0   4.0  9.66   

     research  chance_of_admit_  
0           1              0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [14]:
admissions.columns = (admissions.columns.str.replace(" ", "_").
                                         str.replace("[\W]", "").
                                         str.capitalize())
admissions

Unnamed: 0,Serial_no,Gre_score,Toefl_score,University_rating,Sop,Lor_,Cgpa,Research,Chance_of_admit_
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,316,104,3,3.0,3.5,8.00,1,0.72
2,3,322,110,3,3.5,2.5,8.67,1,0.80
3,4,314,103,2,2.0,3.0,8.21,0,0.65
4,5,330,115,5,4.5,3.0,9.34,1,0.90
...,...,...,...,...,...,...,...,...,...
380,381,324,110,3,3.5,3.5,9.04,1,0.82
381,382,325,107,3,3.0,3.5,9.11,1,0.84
382,383,330,116,4,5.0,4.5,9.45,1,0.91
383,384,312,103,3,3.5,4.0,8.78,0,0.67


In [None]:
# Pipe & assign

In [67]:
timeline = pd.read_csv('data/711598520_T_ONTIME.csv')
timeline

FileNotFoundError: [Errno 2] No such file or directory: 'data/711598520_T_ONTIME.csv'

In [None]:
timeline.info()

In [None]:
def read(fp):
    df = (pd.read_csv(fp)
            .rename(columns=str.lower)
            .pipe(extract_city_name)
            .pipe(time_to_datetime, ['dep_time', 'arr_time', 'crs_arr_time', 'crs_dep_time'])
            .assign(fl_date=lambda x: pd.to_datetime(x['fl_date']),
                    dest=lambda x: pd.Categorical(x['dest']),
                    origin=lambda x: pd.Categorical(x['origin']),
                    tail_num=lambda x: pd.Categorical(x['tail_num']),
                    unique_carrier=lambda x: pd.Categorical(x['unique_carrier']),
                    cancellation_code=lambda x: pd.Categorical(x['cancellation_code'])))
    return df

def extract_city_name(df):
    '''
    Chicago, IL -> Chicago for origin_city_name and dest_city_name
    '''
    cols = ['origin_city_name', 'dest_city_name']
    city = df[cols].apply(lambda x: x.str.extract("(.*), \w{2}", expand=False))
    df = df.copy()
    df[['origin_city_name', 'dest_city_name']] = city
    return df

def time_to_datetime(df, columns):
    '''
    Combine all time items into datetimes.

    2014-01-01,0914 -> 2014-01-01 09:14:00
    '''
    df = df.copy()
    def converter(col):
        timepart = (col.astype(str)
                       .str.replace('\.0$', '')  # NaNs force float dtype
                       .str.pad(4, fillchar='0'))
        return pd.to_datetime(df['fl_date'] + ' ' +
                               timepart.str.slice(0, 2) + ':' +
                               timepart.str.slice(2, 4),
                               errors='coerce')
    df[columns] = df[columns].apply(converter)
    return df

output = 'data/flights.h5'

if not os.path.exists(output):
    df = read("data/711598520_T_ONTIME.csv")
    df.to_hdf(output, 'flights', format='table')
else:
    df = pd.read_hdf(output, 'flights', format='table')
df.info()
df