In [1]:
'''
Python for Data Analysis
Pandas DataFrame examples

Graeme Hawker, University of Strathclyde
2018-07-23
'''

import numpy as np
import pandas as pd

In [2]:
#create two Series objects in a dictionary
#create DataFrame from the dictionary
#note infilling of missing value with NaN
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [3]:
#get column labels
df.columns

Index(['one', 'two'], dtype='object')

In [4]:
#create a new dataframe from the existing data but with new indices and columns
#note that values are only included if they are already specified by given references
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [7]:
#create a DataFrame from an existing .csv file
#we indicate that the first column of the file is an index rather than data, and contains datetime objects
#this means that the function will attempt to construct a DatetimeIndex from this column
turbine_data = pd.read_csv('data/power_curve_data.csv', index_col=0, parse_dates=True)

#inspect first 5 rows
turbine_data.head()

Unnamed: 0_level_0,Windspeed,Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01 00:00:00,8.21859,0.527433
2010-01-01 00:10:00,10.0755,0.698405
2010-01-01 00:20:00,10.1849,0.642705
2010-01-01 00:30:00,8.56266,0.534949
2010-01-01 00:40:00,8.93533,0.394143


In [8]:
#get the size of the DataFrame dimensions
turbine_data.shape

(1008, 2)

In [9]:
#take a slice of the data
turbine_data.loc['2010-01-01 12:00':'2010-01-01 14:00']

Unnamed: 0_level_0,Windspeed,Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01 12:00:00,4.87576,0.079004
2010-01-01 12:10:00,5.00155,0.08252
2010-01-01 12:20:00,4.78537,0.094355
2010-01-01 12:30:00,4.78179,0.11581
2010-01-01 12:40:00,4.91424,0.118179
2010-01-01 12:50:00,5.17433,0.119518
2010-01-01 13:00:00,5.54015,0.142589
2010-01-01 13:10:00,5.27745,0.123842
2010-01-01 13:20:00,6.05207,0.159648
2010-01-01 13:30:00,5.348,0.167265


In [10]:
#every 100th value
turbine_data[::100]

Unnamed: 0_level_0,Windspeed,Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-01 00:00:00,8.21859,0.527433
2010-01-01 16:40:00,4.83521,0.161159
2010-02-01 09:20:00,9.97303,0.380674
2010-03-01 02:00:00,3.3786,-0.001417
2010-03-01 18:40:00,0.662917,-0.003416
2010-04-01 11:20:00,4.61683,0.011041
2010-05-01 04:00:00,5.92756,0.17127
2010-05-01 20:40:00,4.93536,0.011704
2010-06-01 13:20:00,5.97641,0.169842
2010-07-01 06:00:00,8.65524,0.490344


In [11]:
#filter for windspeed values greater than 16
turbine_data[turbine_data['Windspeed']>16]

Unnamed: 0_level_0,Windspeed,Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-07-01 21:10:00,17.614401,0.983727
2010-07-01 21:20:00,17.756001,0.983737
2010-07-01 21:30:00,16.7509,0.983765
2010-07-01 22:30:00,16.189699,0.983626
2010-07-01 22:50:00,16.403799,0.982808
2010-07-01 23:00:00,16.0282,0.979286
