In [1]:
'''
Python for Data Analysis
Inbuilt DataFrame statistics examples

Graeme Hawker, University of Strathclyde
2018-07-23
'''

import numpy as np
import pandas as pd

In [2]:
# Load the turbine measurements from an existing .csv file.
# index_col=0   -> the first column of the file is the index, not data
# parse_dates   -> that index column holds datetimes, so pandas will try to
#                  build a DatetimeIndex from it
turbine_data = pd.read_csv(
    'power_curve_data.csv',
    index_col=0,
    parse_dates=True,
)

# Preview the first 5 rows
turbine_data.head(5)

Unnamed: 0_level_0,Windspeed,Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01 00:00:00,8.21859,0.527433
2010-01-01 00:10:00,10.0755,0.698405
2010-01-01 00:20:00,10.1849,0.642705
2010-01-01 00:30:00,8.56266,0.534949
2010-01-01 00:40:00,8.93533,0.394143


In [3]:
# Aggregate statistics using DataFrame's built-in reduction methods.
# Largest value in each column (axis=0 reduces down the rows, one result per column)
turbine_data.max(axis=0)

Windspeed    17.756001
Power         0.983773
dtype: float64

In [4]:
# Arithmetic mean of each column (axis=0 reduces down the rows)
turbine_data.mean(axis=0)

Windspeed    5.894073
Power        0.213854
dtype: float64

In [7]:
# Row-wise mean: averages across the columns for every timestamp.
# The number itself is meaningless here; the cell only demonstrates the axis argument.
# Only the first 5 rows are displayed.
turbine_data.mean(axis='columns').head(5)

Timestamp
2010-01-01 00:00:00    4.373011
2010-01-01 00:10:00    5.386952
2010-01-01 00:20:00    5.413803
2010-01-01 00:30:00    4.548805
2010-01-01 00:40:00    4.664737
dtype: float64

In [14]:
# Aggregation over a moving window.
# .rolling() returns a new Rolling object rather than a result; it is kept in
# its own variable so the following cells can aggregate over it.
#   window=60      -> each window covers up to 60 consecutive rows
#   min_periods=1  -> a value is produced as soon as a window holds one observation
rolling_data = turbine_data.rolling(
    window=60,
    min_periods=1,
)
rolling_data

Rolling [window=60,min_periods=1,center=False,axis=0]

In [15]:
# Rolling mean of every column; only the first 5 rows are displayed.
# The aggregation is named by its string alias instead of passing np.mean:
# pandas >= 2.1 raises a FutureWarning when a NumPy callable is handed to
# aggregate(), because a later version will call it directly rather than
# mapping it to the built-in windowed mean. 'mean' keeps the current behaviour.
rolling_data.aggregate('mean').head()

Unnamed: 0_level_0,Windspeed,Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01 00:00:00,8.21859,0.527433
2010-01-01 00:10:00,9.147045,0.612919
2010-01-01 00:20:00,9.492997,0.622848
2010-01-01 00:30:00,9.260412,0.600873
2010-01-01 00:40:00,9.195396,0.559527


In [17]:
# Several statistics at once for a single column: rolling sum, mean and
# standard deviation of Windspeed, one output column per statistic.
# String aliases replace np.sum / np.mean / np.std: pandas >= 2.1 raises a
# FutureWarning for NumPy callables passed to agg(), and the aliases pin the
# built-in windowed implementations (same 'sum', 'mean', 'std' column labels).
# 'std' is the sample standard deviation (ddof=1), so the first row - whose
# window holds a single observation - is NaN.
rolling_data['Windspeed'].agg(['sum', 'mean', 'std']).head()

Unnamed: 0_level_0,sum,mean,std
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-01 00:00:00,8.21859,8.21859,
2010-01-01 00:10:00,18.294089,9.147045,1.313033
2010-01-01 00:20:00,28.47899,9.492997,1.105023
2010-01-01 00:30:00,37.04165,9.260412,1.015102
2010-01-01 00:40:00,45.97698,9.195396,0.891044
