In [1]:
import pandas as pd
import os
import re
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import shutil
from scipy import stats
from scipy.stats import norm, skew, f, ttest_ind, ttest_rel, mannwhitneyu, levene, bartlett
import requests
import random
import researchpy as rp
import numpy as np
from sklearn.svm import OneClassSVM
import warnings

In [2]:
# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and dtype warnings
# raised by libraries such as pandas; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
# Adjust these paths to where the data lives on the local machine.
# inputDataPath  : folder holding one sub-folder per year of raw CSV files.
# outputDataPath : folder that receives the cleaned CSV files.
# Keep the trailing '/': paths are built elsewhere in this notebook by plain
# string concatenation.
inputDataPath = "/Users/qilu/Desktop/Advanticsys/Data/Input_Data/"
outputDataPath = "/Users/qilu/Desktop/Advanticsys/Data/Output_Data/"


In [4]:
# Map every inverter id to the wms unit whose temperature and irradiance
# files are used for it: inv01-inv05 -> wms01, inv06-inv10 -> wms02.
inverters = {
    'inv{:02d}'.format(number): ('wms01' if number <= 5 else 'wms02')
    for number in range(1, 11)
}

In [5]:
# Difference between consecutive raw 'Timestamp' values that a row must have to
# be kept. The raw readings are treated as one-per-minute, so 60000 presumably
# means milliseconds -- TODO confirm against the data source.
_MINUTE_STEP = 60000


def _readSignalCsv(yearlyDataPath, filenames, device, signal):
    """Load the ';'-delimited CSV whose file name contains '<device>_<signal>'.

    When several names match, the last one in `filenames` is used.

    Raises:
        FileNotFoundError: if no name in `filenames` matches.
    """
    pattern = re.escape(device) + '_' + signal
    matches = [name for name in filenames if re.search(pattern, name)]
    if not matches:
        raise FileNotFoundError(
            "no file matching '{}_{}' found in {}".format(device, signal, yearlyDataPath)
        )
    return pd.read_csv(os.path.join(yearlyDataPath, matches[-1]),
                       delimiter=';', index_col=None, header=0)


def _readMinuteValues(yearlyDataPath, filenames, device, signal, column, scale=1):
    """Load one signal and keep only rows one minute step after the previous row.

    The 'Value' column is divided by `scale` (when it is not 1) and renamed to
    `column`; the 'Date' and 'Signal' columns are dropped.
    """
    df = _readSignalCsv(yearlyDataPath, filenames, device, signal)
    if scale != 1:
        # Replace the column instead of writing floats into the integer
        # column in place, which is fragile across pandas versions.
        df['Value'] = df['Value'] / scale
    # The step is measured on this signal's own timestamps.
    df = df[df['Timestamp'].diff() == _MINUTE_STEP]
    return df.drop(columns=['Date', 'Signal']).rename(columns={'Value': column})


def _dropIqrOutliers(df, column, dropLow=True):
    """Keep rows of `df` whose `column` lies strictly inside the 1.5*IQR fences.

    With `dropLow=False` only the upper fence is applied.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    keep = df[column] < q3 + 1.5*(q3-q1)
    if dropLow:
        keep &= df[column] > q1 - 1.5*(q3-q1)
    return df[keep]


def dataCleansing(year, inv, dataPath=None, inverterMap=None):
    """Build the cleaned per-minute dataset of one inverter for one year.

    Reads five ';'-delimited CSV files from '<dataPath>/<year>/': the inverter's
    today_energy, total_energy and temp files, and the temp and irradiance
    files of the wms unit mapped to the inverter. Each signal is reduced to the
    rows recorded one minute step after the previous row, the signals are
    inner-joined on 'Timestamp', and non-positive values and IQR outliers are
    removed.

    Args:
        year: year sub-folder to read (converted with str()).
        inv: inverter id, e.g. 'inv01'.
        dataPath: folder containing the yearly sub-folders; defaults to the
            notebook-level `inputDataPath`.
        inverterMap: mapping of inverter id to wms id; defaults to the
            notebook-level `inverters`.

    Returns:
        DataFrame holding the today_energy columns (with 'Signal' renamed to
        'Inverter' and 'Value' replaced by the per-row increase 'Energy') plus
        'Total_Energy', 'Inv_Temp', 'Wms_Temp' and 'Wms_Irr'.

    Raises:
        FileNotFoundError: if one of the five input files is missing.
        KeyError: if `inv` is not a key of the inverter mapping.
    """
    if dataPath is None:
        dataPath = inputDataPath
    if inverterMap is None:
        inverterMap = inverters

    yearlyDataPath = os.path.join(dataPath, str(year))
    filenames = os.listdir(yearlyDataPath)
    wms = inverterMap[inv]

    # --- today_energy: energy produced since the previous reading -----------
    today = _readSignalCsv(yearlyDataPath, filenames, inv, 'today_energy')
    # The increase is computed on the raw rows, before any row is filtered out.
    today = today.assign(Energy=today['Value'].diff())
    today = today[today['Timestamp'].diff() == _MINUTE_STEP]
    # keep the timestamps with a positive energy reading
    today = today[today['Energy'] > 0]
    today = today.drop(columns=['Value']).rename(columns={'Signal': 'Inverter'})
    # the signal name starts with the inverter id, e.g. 'inv01_...' -> 'inv01'
    today = today.assign(Inverter=today['Inverter'].str.split('_').str[0])
    # only the upper fence is applied to the energy increase
    today = _dropIqrOutliers(today, 'Energy', dropLow=False)

    # --- remaining signals ---------------------------------------------------
    # The total_energy file is filtered on its OWN timestamps; using the
    # already-filtered today_energy timestamps here would be index-aligned onto
    # the wrong frame and discard valid total_energy rows.
    total = _readMinuteValues(yearlyDataPath, filenames, inv, 'total_energy', 'Total_Energy')
    # raw temperatures are divided by 10 to get degrees
    invTemp = _readMinuteValues(yearlyDataPath, filenames, inv, 'temp', 'Inv_Temp', scale=10)
    wmsTemp = _readMinuteValues(yearlyDataPath, filenames, wms, 'temp', 'Wms_Temp', scale=10)
    wmsIrr = _readMinuteValues(yearlyDataPath, filenames, wms, 'irradiance', 'Wms_Irr')

    # merge everything on the today_energy timestamps so the weather data is
    # attached to the correct minute
    df = pd.merge(today, total, on='Timestamp')
    for frame, column in ((invTemp, 'Inv_Temp'), (wmsTemp, 'Wms_Temp'), (wmsIrr, 'Wms_Irr')):
        df = pd.merge(df, frame[['Timestamp', column]], on='Timestamp')

    # remove data errors/outliers; each step works on the rows that survived
    # the previous one, so the order of these filters matters
    df = df[df['Total_Energy'] > 0]
    df = _dropIqrOutliers(df, 'Total_Energy', dropLow=False)
    for column in ('Inv_Temp', 'Wms_Temp', 'Wms_Irr'):
        df = df[df[column] > 0]
        df = _dropIqrOutliers(df, column)

    return df
    

In [6]:
# Combine the cleaned data of every inverter into one single dataframe.
def combineCleanDataForAllInvters(year):
    """Clean the data of every inverter for `year` and stack the results.

    Runs `dataCleansing(year, inv)` for each inverter id in `inverters`, in the
    order the ids are defined there, and concatenates the frames once at the
    end (`DataFrame.append` no longer exists in pandas 2.x and copied the
    accumulated frame on every iteration).

    Args:
        year: year passed through to `dataCleansing`.

    Returns:
        DataFrame with the rows of all inverters and a fresh 0..n-1 index;
        an empty DataFrame when `inverters` has no entries.
    """
    frames = [dataCleansing(year, inv) for inv in inverters]
    if not frames:
        # pd.concat raises on an empty list
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [7]:
# Save the clean data of one year.
def saveCleanData(year):
    """Clean all inverters for `year` and write the result as one CSV file.

    The file is written to `outputDataPath` and is named
    '<year>_clean_data_by_minute_for_prediction.csv'; the dataframe index is
    not written. Returns None.
    """
    targetFile = '{}{}_clean_data_by_minute_for_prediction.csv'.format(outputDataPath, str(year))
    yearlyCleanData = combineCleanDataForAllInvters(year)
    yearlyCleanData.to_csv(targetFile, index=False)

In [8]:
# Clean the 2020 raw data of all inverters and write it to
# '2020_clean_data_by_minute_for_prediction.csv' in outputDataPath.
saveCleanData(2020)

In [9]:
# Clean the 2021 raw data of all inverters and write it to
# '2021_clean_data_by_minute_for_prediction.csv' in outputDataPath.
saveCleanData(2021)

In [10]:
# Combine the per-year clean data files found in the output folder into one
# dataframe and save it next to them.
combinedFileName = 'combined_clean_data_by_minute.csv'

# find processed data for all years; sorted so the row order of the combined
# file does not depend on the arbitrary order returned by os.listdir
filenames = sorted(os.listdir(outputDataPath))

yearlyFrames = []
for file in filenames:
    # The combined file written below also contains 'by_minute' in its name.
    # Skip it, otherwise re-running this cell folds the previous combined
    # result back into itself and duplicates every row.
    if file == combinedFileName:
        continue
    year_data_match = re.search(r'by_minute', file)
    if year_data_match:
        df_year = pd.read_csv(outputDataPath + file, index_col=None, header=0)
        yearlyFrames.append(df_year)

# Concatenate once (DataFrame.append no longer exists in pandas 2.x).
# pd.concat rejects an empty list, so fall back to an empty frame.
df_combined_clean_data = (
    pd.concat(yearlyFrames, ignore_index=True) if yearlyFrames else pd.DataFrame()
)

# save all year combined data into output data folder for reporting
df_combined_clean_data.to_csv(outputDataPath + combinedFileName, index=False)


    

In [11]:
# plt.rcParams['figure.figsize'] = (10, 4)
# sns.boxplot(df_combined_clean_data['Energy'], palette="Set1");
# plt.title('Boxplot for Energy by Minute', fontsize = 15)
# plt.xlabel('Energy (kWh)')
# plt.show();

In [12]:
# plt.rcParams['figure.figsize'] = (10, 4)
# sns.boxplot(df_combined_clean_data['Total_Energy'], palette="Set1");
# plt.title('Boxplot for Total Energy Reading', fontsize = 15)
# plt.xlabel('Total Energy (mWh)')
# plt.show();

In [13]:
# plt.rcParams['figure.figsize'] = (10, 4)
# sns.boxplot(df_combined_clean_data['Inv_Temp'], palette="Set1");
# plt.title('Boxplot for Inverter Temperature', fontsize = 15)
# plt.xlabel('Degree')
# plt.show();

In [14]:
# plt.rcParams['figure.figsize'] = (10, 4)
# sns.boxplot(df_combined_clean_data['Wms_Temp'], palette="Set1");
# plt.title('Boxplot for Wms Temperature', fontsize = 15)
# plt.xlabel('Degree')
# plt.show();

In [15]:
# plt.rcParams['figure.figsize'] = (10, 4)
# sns.boxplot(df_combined_clean_data['Wms_Irr'], palette="Set1");
# plt.title('Boxplot for Wms Irradiance', fontsize = 15)
# plt.xlabel('w/ms')
# plt.show();