In [None]:
'''
Python for Data Analysis
Quality Checking and Infilling

Graeme Hawker, University of Strathclyde
2018-07-23
'''

import numpy as np
import pandas as pd

In [None]:
# Load the raw turbine readings from the .csv file into a DataFrame.
# The first column is used as the index and pandas is asked to parse it as dates.
turbine_data_bad = pd.read_csv(
    'power_curve_data_BAD.csv',
    index_col=0,
    parse_dates=True,
)

# Preview the first few rows to confirm the file loaded as expected.
turbine_data_bad.head()

In [None]:
# Show missing entries: build a Boolean mask that is True for every row
# with at least one missing value in any column, then select those rows.
row_has_gap = turbine_data_bad.isnull().any(axis=1)
turbine_data_bad[row_has_gap]

In [None]:
# Infill missing values using forward propagation: each gap is filled with the
# last valid observation above it in the same column.
# `ffill()` is used instead of `fillna(method='pad')` because the `method`
# keyword of `fillna` is deprecated in recent pandas releases; the result is
# the same.
turbine_data_infilled = turbine_data_bad.ffill()

# Note that we can filter the values in one DataFrame by those in another.
# Use this here to show the infilled values for exactly the rows that had
# something missing in the original data.
turbine_data_infilled[turbine_data_bad.isnull().any(axis=1)]

In [None]:
# Show rows where the wind speed is above a credible number (e.g. 100 m/s).
# The filter is written as a query expression evaluated against the column names.
implausible_speed_filter = 'Windspeed > 100'
turbine_data_infilled.query(implausible_speed_filter)

In [None]:
# Add a Boolean column that flags the dubious rows, i.e. those whose
# wind speed is strictly greater than 100.
turbine_data_infilled['bad_wind_value'] = turbine_data_infilled['Windspeed'].gt(100)

# Preview the frame with the new flag column attached.
turbine_data_infilled.head()

In [None]:
# Create a new Windspeed column with the flagged (bad) values replaced by NaN.
# `Series.mask` blanks the entries where the flag is True while keeping every
# row in place. Unlike assigning a filtered Series back into the frame, this
# does not depend on index re-alignment, which pandas rejects when the index
# contains duplicate labels.
turbine_data_infilled['new_Windspeed'] = turbine_data_infilled['Windspeed'].mask(
    turbine_data_infilled['bad_wind_value']
)

# Show the flagged rows; their new_Windspeed entries are now NaN.
# The flag column is already Boolean, so it is used directly as the row mask.
turbine_data_infilled[turbine_data_infilled['bad_wind_value']]

In [None]:
# Preview only: interpolate across the NaN gaps in new_Windspeed and display
# the resulting Series. Nothing is written back to the DataFrame here.
windspeed_with_gaps = turbine_data_infilled['new_Windspeed']
windspeed_with_gaps.interpolate()

In [None]:
# Replace the bad (NaN) values with interpolated ones and store the result
# back in the new_Windspeed column.
windspeed_filled = turbine_data_infilled['new_Windspeed'].interpolate()
turbine_data_infilled['new_Windspeed'] = windspeed_filled

# Show the previously flagged rows so the interpolated values can be inspected
# next to the original Windspeed readings.
was_flagged = turbine_data_infilled.bad_wind_value == True
turbine_data_infilled[was_flagged]