In [1]:
import pandas as pd
import os
import re
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import shutil
from scipy import stats
import requests
import random


In [2]:
# Inverter ID -> ID of the station whose temperature and irradiance files are
# joined to that inverter's data in dataClean: inv01-inv05 use wms01,
# inv06-inv10 use wms02.
inverters = {
    'inv{:02d}'.format(number): ('wms01' if number <= 5 else 'wms02')
    for number in range(1, 11)
}

In [3]:
def _iqrFences(series, k=1.5):
    """Return the ``(lower, upper)`` fences ``Q1 - k*IQR`` and ``Q3 + k*IQR`` of ``series``."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = k * (q3 - q1)
    return q1 - spread, q3 + spread


def _dropOutliers(frame, column, two_sided=True):
    """Keep the rows of ``frame`` whose ``column`` lies strictly inside the IQR fences.

    With ``two_sided=False`` only the upper fence is applied.  A copy is
    returned so later column assignments never write into a slice.
    """
    lower, upper = _iqrFences(frame[column])
    keep = frame[column] < upper
    if two_sided:
        keep &= frame[column] > lower
    return frame[keep].copy()


def _loadSignal(dirpath, filenames, prefix, suffix, value_name, scale=1):
    """Read one ';'-delimited signal CSV and return its positive readings.

    The file is the entry of ``filenames`` whose name contains
    ``prefix + suffix``.  ``Value`` is divided by ``scale``, rows whose value
    is not strictly positive are dropped, and ``Value`` is renamed to
    ``value_name``.

    Raises:
        FileNotFoundError: if no file name in ``filenames`` matches.
    """
    pattern = re.escape(prefix) + suffix
    matches = [name for name in filenames if re.search(pattern, name)]
    if not matches:
        raise FileNotFoundError(
            "No file matching '" + prefix + suffix + "' found in " + dirpath
        )
    # When several files match, the last one in listing order wins, exactly as
    # the original sequence of overwriting loops behaved.
    frame = pd.read_csv(os.path.join(dirpath, matches[-1]),
                        delimiter=';', index_col=None, header=0)
    if scale != 1:
        # Plain column assignment (not .loc[:, ...]) so an integer column can
        # become float without an incompatible-dtype warning.
        frame['Value'] = frame['Value'] / scale
    frame = frame[frame['Value'] > 0].copy()
    return frame.rename(columns={'Value': value_name})


def dataClean(year, inv, data_dir="/Users/qilu/Desktop/Advanticsys/Data/"):
    """Load, join and clean one inverter's signals for one year.

    Five ';'-delimited CSV files are read from ``<data_dir>/<year>``: the
    inverter's ``_today_energy``, ``_total_energy`` and ``_temp`` files, and
    the ``_temp`` and ``_irradiance`` files of the station mapped to ``inv``
    in ``inverters``.  Each file must provide ``Timestamp`` and ``Value``
    columns; the today-energy file must also provide ``Signal`` and ``Date``.

    Cleaning steps, in order:
      1. Temperatures are divided by 10; rows with ``Value <= 0`` are dropped.
      2. Today-energy rows at or above ``Q3 + 1.5*IQR`` are dropped.
      3. All frames are inner-joined on ``Timestamp``.
      4. ``Total_Energy`` is filtered on the upper fence only; ``Inv_Temp``,
         ``Wms_Temp`` and ``Wms_Irr`` are filtered on both fences.  Fences are
         recomputed after each filter, so the order of the filters matters.
      5. ``Date`` is split into ``Date`` (``YYYY-MM-DD`` string), ``Hour`` and
         ``Minute``; ``Timestamp`` is dropped.

    Args:
        year: Year sub-directory to read (converted with ``str``).
        inv: Inverter ID; must be a key of ``inverters``.
        data_dir: Directory that holds the per-year sub-directories.

    Returns:
        pandas.DataFrame with the today-energy file's remaining columns
        (``Inverter``, ``Today_Energy``, ...) followed by ``Total_Energy``,
        ``Inv_Temp``, ``Wms_Temp``, ``Wms_Irr``, ``Date``, ``Hour``, ``Minute``.

    Raises:
        KeyError: if ``inv`` is not a key of ``inverters``.
        FileNotFoundError: if one of the five signal files is missing.
    """
    dirpath = os.path.join(data_dir, str(year))
    filenames = os.listdir(dirpath)
    station = inverters[inv]

    df = _loadSignal(dirpath, filenames, inv, '_today_energy', 'Today_Energy')
    df = df.rename(columns={'Signal': 'Inverter'})
    # 'inv01_today_energy' -> 'inv01'
    df['Inverter'] = df['Inverter'].str.split('_').str[0]
    df = _dropOutliers(df, 'Today_Energy', two_sided=False)

    # (file prefix, file suffix, output column, divisor) for the joined signals.
    joined_signals = [
        (inv, '_total_energy', 'Total_Energy', 1),
        (inv, '_temp', 'Inv_Temp', 10),
        (station, '_temp', 'Wms_Temp', 10),
        (station, '_irradiance', 'Wms_Irr', 1),
    ]
    for prefix, suffix, column, scale in joined_signals:
        signal = _loadSignal(dirpath, filenames, prefix, suffix, column, scale)
        df = pd.merge(df, signal[['Timestamp', column]], on='Timestamp')

    df = _dropOutliers(df, 'Total_Energy', two_sided=False)
    for column in ('Inv_Temp', 'Wms_Temp', 'Wms_Irr'):
        df = _dropOutliers(df, column)

    stamps = pd.to_datetime(df['Date'])
    # Drop first, then append, so Date/Hour/Minute end up as the last columns.
    df = df.drop(columns=['Date', 'Timestamp'])
    df['Date'] = stamps.map(lambda ts: str(ts.date()))
    df['Hour'] = stamps.map(lambda ts: ts.hour)
    df['Minute'] = stamps.map(lambda ts: ts.minute)
    return df
    
#     savepath = "/Users/qilu/Desktop/Advanticsys/Data/Output_Data/" 
        
#     return df.to_csv(savepath+str(year)+'_'+str(inv)+'.csv', index=False)

In [4]:
def singleCleanFile(year, output_dir="/Users/qilu/Desktop/Advanticsys/Data/Output_Data/"):
    """Clean every inverter's data for ``year`` and write one CSV per inverter.

    For each key of ``inverters`` (in insertion order) the result of
    ``dataClean(year, inv)`` is written to ``<output_dir>/<year>_<inv>.csv``
    without the index.

    Args:
        year: Year passed to ``dataClean`` and used as the file-name prefix.
        output_dir: Destination directory; created if it does not exist.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)
    # Iterate the shared mapping instead of a second hard-coded list so the
    # set of inverters is defined in exactly one place.
    for inv in inverters:
        cleaned = dataClean(year, inv)
        target = os.path.join(output_dir, str(year) + '_' + str(inv) + '.csv')
        cleaned.to_csv(target, index=False)
    

In [5]:
def combineCleanFile(year, clean_dir="/Users/qilu/Desktop/Advanticsys/Data/Output_Data/"):
    """Concatenate the per-inverter cleaned CSVs of ``year`` into one file.

    Reads every ``<year>_*.csv`` in ``clean_dir`` except the combined file
    itself, concatenates them in sorted file-name order and writes
    ``<clean_dir>/<year>_combine.csv`` without the index.

    Args:
        year: Year prefix of the files to combine.
        clean_dir: Directory holding the cleaned files; also the output directory.

    Returns:
        None (the value ``DataFrame.to_csv`` returns when given a path).

    Raises:
        FileNotFoundError: if no per-inverter file for ``year`` exists.
    """
    prefix = str(year) + '_'
    combined_name = prefix + 'combine.csv'
    # A bare "year appears in the name" test would also pick up
    # '<year>_combine.csv' and 'dashboardData_<year>.csv' from an earlier run
    # and concatenate them back into the result, so match on the prefix and
    # skip the output file explicitly.  Sorting makes the row order repeatable.
    sources = sorted(
        name for name in os.listdir(clean_dir)
        if name.startswith(prefix) and name.endswith('.csv') and name != combined_name
    )
    if not sources:
        raise FileNotFoundError(
            "No cleaned files named '" + prefix + "*.csv' found in " + str(clean_dir)
        )

    combined = pd.concat(
        [pd.read_csv(os.path.join(clean_dir, name), index_col=None, header=0)
         for name in sources]
    )
    return combined.to_csv(os.path.join(clean_dir, combined_name), index=False)

In [6]:
# Clean the 2020 data of every inverter and write one CSV per inverter to the output directory.
singleCleanFile(2020)

In [7]:
# Concatenate the cleaned 2020 CSVs into 2020_combine.csv (requires singleCleanFile(2020) to have run).
combineCleanFile(2020)

In [8]:
# Clean the 2021 data of every inverter and write one CSV per inverter to the output directory.
singleCleanFile(2021)

In [9]:
# Concatenate the cleaned 2021 CSVs into 2021_combine.csv (requires singleCleanFile(2021) to have run).
combineCleanFile(2021)

In [10]:
def quarter(x):
    """Map a minute-of-hour value to a quarter index from 1 to 4.

    Values below 16 map to 1, below 31 to 2, below 46 to 3, and everything
    else to 4.

    NOTE(review): with these limits minute 15 falls in quarter 1 and minute 45
    in quarter 3, so the buckets hold 16/15/15/14 minutes -- confirm this is
    the intended boundary convention.
    """
    # Exclusive upper limit of quarters 1, 2 and 3; anything beyond is quarter 4.
    for index, limit in enumerate((16, 31, 46), start=1):
        if x < limit:
            return index
    return 4

In [11]:
def dashboardData(year, clean_dir="/Users/qilu/Desktop/Advanticsys/Data/Output_Data/", seed=None):
    """Aggregate the combined clean data of ``year`` per quarter-hour and inverter.

    Reads ``<clean_dir>/<year>_combine.csv``, assigns each row a quarter via
    ``quarter(Minute)`` and groups by ``Date``, ``Hour``, ``Quarter`` and
    ``Inverter``.  Per group it computes:

      * ``Inv_Temp`` / ``Wms_Temp``: mean
      * ``Wms_Irr_Min`` / ``Wms_Irr_Max``: min / max of ``Wms_Irr``
      * ``Total_Energy``: max
      * ``Quarter_Energy``: max minus min of ``Today_Energy``
      * ``Predict_Quarter_Energy``: ``Quarter_Energy`` multiplied by a random
        factor drawn with ``uniform(0.5, 1.5)`` -- synthetic data, not a
        model prediction.

    The result is written to ``<clean_dir>/dashboardData_<year>.csv`` without
    the index.  Rows are ordered by the group keys, with ``Hour`` and
    ``Quarter`` compared as numbers.

    Args:
        year: Year of the combined file to read.
        clean_dir: Directory holding the combined file; also the output directory.
        seed: Optional seed for the random factors.  ``None`` (default) draws
            from the global ``random`` module; any other value gives
            reproducible output.

    Returns:
        None (the value ``DataFrame.to_csv`` returns when given a path).
    """
    group_keys = ['Date', 'Hour', 'Quarter', 'Inverter']

    source = pd.read_csv(os.path.join(clean_dir, str(year) + '_combine.csv'),
                         index_col=None, header=0)
    source['Quarter'] = source['Minute'].apply(quarter)

    # Group on the key columns directly instead of gluing them into one string
    # and splitting it again after aggregation.
    summary = (
        source.groupby(group_keys)
        .agg(
            Inv_Temp=('Inv_Temp', 'mean'),
            Wms_Irr_Min=('Wms_Irr', 'min'),
            Wms_Irr_Max=('Wms_Irr', 'max'),
            Wms_Temp=('Wms_Temp', 'mean'),
            Energy_Low=('Today_Energy', 'min'),
            Energy_High=('Today_Energy', 'max'),
            Total_Energy=('Total_Energy', 'max'),
        )
        .reset_index()
    )
    summary['Quarter_Energy'] = summary['Energy_High'] - summary['Energy_Low']

    # One independent factor per row.
    rng = random if seed is None else random.Random(seed)
    factors = pd.Series([rng.uniform(0.5, 1.5) for _ in range(len(summary))],
                        index=summary.index, dtype='float64')
    summary['Predict_Quarter_Energy'] = summary['Quarter_Energy'] * factors

    # Column order of the file the dashboard already consumes.
    output_columns = ['Inv_Temp', 'Wms_Irr_Min', 'Wms_Irr_Max', 'Wms_Temp',
                      'Total_Energy', 'Quarter_Energy',
                      'Date', 'Hour', 'Quarter', 'Inverter',
                      'Predict_Quarter_Energy']
    target = os.path.join(clean_dir, 'dashboardData_' + str(year) + '.csv')
    return summary[output_columns].to_csv(target, index=False)
#     return df_clean

In [12]:
# Build dashboardData_2020.csv from 2020_combine.csv (requires combineCleanFile(2020) to have run).
dashboardData(2020)

In [13]:
# Build dashboardData_2021.csv from 2021_combine.csv (requires combineCleanFile(2021) to have run).
dashboardData(2021)

In [14]:
# plt.rcParams['figure.figsize'] = (10, 4)
# sns.boxplot(files(2020,'inv02')['Today_Energy'], palette="Set1");
# plt.title('Boxplot for Today_Energy', fontsize = 15)
# plt.xlabel('Energy (MWh)')
# plt.show();

In [15]:
# plt.rcParams['figure.figsize'] = (10, 4)
# sns.boxplot(files(2020,'inv02')['Total_Energy'], palette="Set1");
# plt.title('Boxplot for Total_Energy', fontsize = 15)
# plt.xlabel('Energy (MWh)')
# plt.show();

In [16]:
# plt.rcParams['figure.figsize'] = (10, 4)
# sns.boxplot(files(2020,'inv02')['Inv_Temp'], palette="Set1");
# plt.title('Boxplot for Inv_Temp', fontsize = 15)
# plt.xlabel('C')
# plt.show();

In [17]:
# plt.rcParams['figure.figsize'] = (10, 4)
# sns.boxplot(files(2020,'inv02')['Wms_Temp'], palette="Set1");
# plt.title('Boxplot for Wms_Temp', fontsize = 15)
# plt.xlabel('C')
# plt.show();

In [18]:
# plt.rcParams['figure.figsize'] = (10, 4)
# sns.boxplot(files(2020,'inv02')['Wms_Irr'], palette="Set1");
# plt.title('Boxplot for Wms_Irr', fontsize = 15)
# plt.xlabel('W/m^2')
# plt.show();