In [1]:
import pandas as pd
import numpy as np


In [2]:
# A Series is one-dimensional: every value carries its own label (the index).
# NOTE(review): no seed is set in the visible cells, so the values differ on each run.
data = np.random.randn(5)
s = pd.Series(data=data, index=list('abcde'))
s

a   -1.366413
b   -2.195704
c   -1.741806
d   -0.720754
e    1.345214
dtype: float64

In [3]:
# A DataFrame is two-dimensional: rows and columns are both labelled.
# The rows are six consecutive days; think of the columns A-D as, e.g., four places.
time = pd.date_range(start='2013-01-28', periods=6)
data = np.random.randn(6, 4)
df = pd.DataFrame(data=data, index=time, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2013-01-28,-0.284826,0.03571,0.31211,0.972868
2013-01-29,0.166682,-0.142313,-0.31563,-0.77697
2013-01-30,-2.011938,0.313104,0.229569,1.458476
2013-01-31,-0.613562,1.414956,0.793555,0.358823
2013-02-01,0.331422,0.755886,-1.356311,1.364642
2013-02-02,-0.189642,-1.072263,0.328873,0.288739


In [6]:
# A single label selects one column as a Series; its rows keep the time index.
# (Only a cell's last expression is displayed, so this result is not shown.)
df['A']
# A list of labels selects several columns as a DataFrame, which keeps the column labels too.
df[['A','B']]

Unnamed: 0,A,B
2013-01-28,-0.284826,0.03571
2013-01-29,0.166682,-0.142313
2013-01-30,-2.011938,0.313104
2013-01-31,-0.613562,1.414956
2013-02-01,0.331422,0.755886
2013-02-02,-0.189642,-1.072263


In [7]:
# Select a single row by its date label; the row comes back as a Series indexed by column name.
df.loc['2013-01-30']

A   -2.011938
B    0.313104
C    0.229569
D    1.458476
Name: 2013-01-30 00:00:00, dtype: float64

In [8]:
# A label slice returns a DataFrame, even when it matches only one row.
df.loc['2013-01-30':'2013-01-30']

Unnamed: 0,A,B,C,D
2013-01-30,-2.011938,0.313104,0.229569,1.458476


In [12]:
# A partial date string ('YYYY-MM') selects every row that falls in that month.
df.loc['2013-01']

Unnamed: 0,A,B,C,D
2013-01-28,-0.284826,0.03571,0.31211,0.972868
2013-01-29,0.166682,-0.142313,-0.31563,-0.77697
2013-01-30,-2.011938,0.313104,0.229569,1.458476
2013-01-31,-0.613562,1.414956,0.793555,0.358823


In [13]:
# Select by integer position instead of by label.
df.iloc[:3]  # the first three rows

Unnamed: 0,A,B,C,D
2013-01-28,-0.284826,0.03571,0.31211,0.972868
2013-01-29,0.166682,-0.142313,-0.31563,-0.77697
2013-01-30,-2.011938,0.313104,0.229569,1.458476


In [15]:
# Item assignment works through both the positional and the label-based indexer.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df.iloc[3, 1] = np.nan  # 4th row, 2nd column (by position)

# Partial date string: every February 2013 row of column 'D'.
df.loc['2013-02', 'D'] = np.nan
df

Unnamed: 0,A,B,C,D
2013-01-28,-0.284826,0.03571,0.31211,0.972868
2013-01-29,0.166682,-0.142313,-0.31563,-0.77697
2013-01-30,-2.011938,0.313104,0.229569,1.458476
2013-01-31,-0.613562,,0.793555,0.358823
2013-02-01,0.331422,0.755886,-1.356311,
2013-02-02,-0.189642,-1.072263,0.328873,


In [22]:
# Default axis=0: average down the rows (through time), giving one mean per column.
# (Not displayed: only the last expression of a cell is shown.)
df.mean()
# axis=1: average across the columns, giving one mean per row (per date).
df.mean(axis=1)

2013-01-28    0.258965
2013-01-29   -0.267058
2013-01-30   -0.002697
2013-01-31    0.179606
2013-02-01   -0.089668
2013-02-02   -0.311011
Freq: D, dtype: float64

In [17]:
# NaN values are skipped by default; with skipna=False any column containing a NaN gets a NaN mean.
df.mean(skipna=False)

A   -0.433644
B         NaN
C   -0.001306
D         NaN
dtype: float64

In [23]:
# Assigning to a column label that does not exist yet creates the column;
# rows that were not assigned are filled with NaN.
df.loc['2013-01-28', 'E'] = 5
df

Unnamed: 0,A,B,C,D,E
2013-01-28,-0.284826,0.03571,0.31211,0.972868,5.0
2013-01-29,0.166682,-0.142313,-0.31563,-0.77697,
2013-01-30,-2.011938,0.313104,0.229569,1.458476,
2013-01-31,-0.613562,,0.793555,0.358823,
2013-02-01,0.331422,0.755886,-1.356311,,
2013-02-02,-0.189642,-1.072263,0.328873,,


In [24]:
# A scalar assigned to a whole (new) column is broadcast to every row.
df.loc[:,'F']=2
df

Unnamed: 0,A,B,C,D,E,F
2013-01-28,-0.284826,0.03571,0.31211,0.972868,5.0,2
2013-01-29,0.166682,-0.142313,-0.31563,-0.77697,,2
2013-01-30,-2.011938,0.313104,0.229569,1.458476,,2
2013-01-31,-0.613562,,0.793555,0.358823,,2
2013-02-01,0.331422,0.755886,-1.356311,,,2
2013-02-02,-0.189642,-1.072263,0.328873,,,2


In [25]:
# Say you want the standard deviation of each column (location) over February only.
# Columns with no non-NaN February values (D and E) come out as NaN.
df.loc['2013-02' , :].std()

A    0.368448
B    1.292696
C    1.191605
D         NaN
E         NaN
F    0.000000
dtype: float64

In [26]:
# Relative path to the CO2 data set.
fN = './../data/co2.csv'

# Peek at the first 10 raw lines to see how the file is laid out before parsing it.
with open(fN) as fid:
    for i in range(10):
        # readline() keeps the line's own newline, so suppress the one print would add.
        print(fid.readline(), end='')

Mauna Loa Weekly Atmospheric CO2 Data
 - units: ppm
 - Keeling & Whorf 2004
 - Obtained from statsmodels
 - http://www.statsmodels.org/devel/datasets/generated/co2.html
,co2
1958-03-29,316.1
1958-04-05,317.3
1958-04-12,317.6
1958-04-19,317.5


In [30]:
# The file starts with five description lines; the column header (",co2") is the
# sixth line, i.e. row 5 when counting from 0, hence header=5.
CO2_prelim=pd.read_csv(fN,header=5)
CO2_prelim.head(3)

Unnamed: 0.1,Unnamed: 0,co2
0,1958-03-29,316.1
1,1958-04-05,317.3
2,1958-04-12,317.6


In [32]:
# Check how the dates were read: without date parsing, the first column holds plain strings.
print('The first date "{:s}"'.format(CO2_prelim.iloc[0, 0]))
print('The type of the date', type(CO2_prelim.iloc[0, 0]))

The first date "1958-03-29"
The type of the date <class 'str'>


In [33]:
# We want the row labels to be the time axis rather than running numbers, so we pass index_col=0, which uses the first column as the index.
# We also want that time axis to be a pandas DatetimeIndex rather than strings, so we pass parse_dates=True.

In [34]:
# Re-read the file with the first column as the index and its dates parsed into a DatetimeIndex.
co2=pd.read_csv(fN, header=5, index_col=0, parse_dates=True) 

In [35]:
# The index now shows the dates instead of a running integer.
co2.head(3)

Unnamed: 0,co2
1958-03-29,316.1
1958-04-05,317.3
1958-04-12,317.6


In [36]:
# With a DatetimeIndex in place, the weekly observations can be resampled to monthly means.
# 'M' labels each bin with the month-end date; a month with no valid observations gives NaN.
# NOTE(review): recent pandas releases deprecate 'M' in favour of 'ME' — confirm against the installed version.
co2_monthly = co2.resample('M').mean()

co2_monthly.head()

Unnamed: 0,co2
1958-03-31,316.1
1958-04-30,317.2
1958-05-31,317.433333
1958-06-30,
1958-07-31,315.625


In [45]:
# Default frequency is daily: every day of the year 2000.
pd.date_range('2000-01-01', '2000-12-31')
# Every second month (month-end dates) of the year 2000.
pd.date_range('2000-01-01', '2000-12-31', freq='2M')
# Every 30 minutes from 1 January up to midnight of 31 May; only this last result is displayed.
pd.date_range('2000-01-01','2000-05-31',freq='30min')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:30:00',
               '2000-01-01 01:00:00', '2000-01-01 01:30:00',
               '2000-01-01 02:00:00', '2000-01-01 02:30:00',
               '2000-01-01 03:00:00', '2000-01-01 03:30:00',
               '2000-01-01 04:00:00', '2000-01-01 04:30:00',
               ...
               '2000-05-30 19:30:00', '2000-05-30 20:00:00',
               '2000-05-30 20:30:00', '2000-05-30 21:00:00',
               '2000-05-30 21:30:00', '2000-05-30 22:00:00',
               '2000-05-30 22:30:00', '2000-05-30 23:00:00',
               '2000-05-30 23:30:00', '2000-05-31 00:00:00'],
              dtype='datetime64[ns]', length=7249, freq='30T')

In [48]:
# Annual means: each year is labelled with its year-end date (31 December).
# NOTE(review): 'a' is the lowercase spelling of the annual alias 'A'; newer pandas prefers 'YE' — confirm against the installed version.
co2_annual=co2.resample('a').mean()
co2_annual.head()

Unnamed: 0,co2
1958-12-31,315.42
1959-12-31,315.90625
1960-12-31,316.860377
1961-12-31,317.592308
1962-12-31,318.545833


In [49]:
# Show the small demo frame again (it now has the extra columns E and F).
df

Unnamed: 0,A,B,C,D,E,F
2013-01-28,-0.284826,0.03571,0.31211,0.972868,5.0,2
2013-01-29,0.166682,-0.142313,-0.31563,-0.77697,,2
2013-01-30,-2.011938,0.313104,0.229569,1.458476,,2
2013-01-31,-0.613562,,0.793555,0.358823,,2
2013-02-01,0.331422,0.755886,-1.356311,,,2
2013-02-02,-0.189642,-1.072263,0.328873,,,2


In [50]:
# Monthly means of the demo frame: one row per month, labelled with the month-end date.
df.resample('M').mean().head()

Unnamed: 0,A,B,C,D,E,F
2013-01-31,-0.685911,0.068833,0.254901,0.503299,5.0,2
2013-02-28,0.07089,-0.158189,-0.513719,,,2
