In [1]:
import pandas as pd
import os
import re
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import shutil
from scipy import stats
import requests
import random
import numpy as np
from sklearn.svm import OneClassSVM


In [2]:
# Lookup from inverter id to the "wms" unit id paired with it.
# inv01..inv05 map to wms01 and inv06..inv10 map to wms02; dataClean uses
# inverters[inv] to pick the wms temperature / irradiance files for an inverter.
inverters = {
    **{'inv%02d' % number: 'wms01' for number in range(1, 6)},
    **{'inv%02d' % number: 'wms02' for number in range(6, 11)},
}

In [3]:
def _read_signal_csv(dirpath, filenames, prefix, suffix):
    """Load the ';'-delimited CSV in ``dirpath`` whose name matches ``prefix + suffix``.

    ``prefix`` is regex-escaped and ``suffix`` is used as a regex as given.
    When several names match, the last one in ``filenames`` is used.

    Raises:
        FileNotFoundError: no name in ``filenames`` matches.
    """
    pattern = re.compile(re.escape(prefix) + suffix)
    matches = [name for name in filenames if pattern.search(name)]
    if not matches:
        raise FileNotFoundError(
            "no file matching '%s%s' found in %s" % (prefix, suffix, dirpath))
    return pd.read_csv(os.path.join(dirpath, matches[-1]),
                       delimiter=';', index_col=None, header=0)


def _positive_readings(raw, column, divisor=None):
    """Return ``Timestamp`` plus the strictly positive ``Value`` readings, named ``column``.

    When ``divisor`` is given, ``Value`` is divided by it before the > 0 filter.
    """
    # Build a fresh column instead of writing through `.loc[:, 'Value']`, which in
    # recent pandas tries to keep the existing (possibly integer) dtype.
    values = raw['Value'] if divisor is None else raw['Value'] / divisor
    readings = pd.DataFrame({'Timestamp': raw['Timestamp'], column: values})
    return readings[readings[column] > 0]


def _iqr_fences(values):
    """Return ``(Q1 - 1.5*IQR, Q3 + 1.5*IQR)`` for a numeric Series."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    return q1 - 1.5 * spread, q3 + 1.5 * spread


def dataClean(year, inv, data_root="/Users/qilu/Desktop/Advanticsys/Data/"):
    """Build one cleaned, merged DataFrame for inverter ``inv`` in ``year``.

    Reads five ';'-delimited CSV files from ``<data_root>/<year>``, located by
    file-name pattern: ``<inv>_today_energy``, ``<inv>_total_energy``,
    ``<inv>_temp``, and ``<wms>_temp`` / ``<wms>_irradiance`` where
    ``<wms>`` is ``inverters[inv]``.

    Processing steps:
      1. ``Energy`` is the row-to-row difference of ``Value`` in the
         today-energy file. Only rows whose ``Timestamp`` gap to the previous
         row equals 60000 and whose ``Energy`` is > 0 are kept, then rows at or
         above the upper IQR fence of ``Energy`` are dropped.
      2. Every other signal keeps only ``Value`` > 0 (both temperature signals
         are divided by 10 first) and is joined on ``Timestamp`` with
         ``pd.merge``'s default join.
      3. ``Total_Energy`` is capped by its upper IQR fence; ``Inv_Temp``,
         ``Wms_Temp`` and ``Wms_Irr`` are kept strictly inside both fences.
         The filters run in that order, each on the already-filtered frame.
      4. ``Date`` is parsed with ``pd.to_datetime`` and ``Timestamp`` is dropped.
      5. ``Predict_Energy`` is a placeholder: ``Energy`` times a per-row random
         factor drawn from ``random.uniform(0.5, 1.5)``.

    Args:
        year: Sub-directory name under ``data_root`` (converted with ``str``).
        inv: Inverter id; must be a key of the module-level ``inverters`` dict.
        data_root: Directory that contains one sub-directory per year.

    Returns:
        pandas.DataFrame with the today-energy file's remaining columns
        (``Signal`` renamed to ``Inverter`` and cut at the first '_'), plus
        ``Energy``, ``Total_Energy``, ``Inv_Temp``, ``Wms_Temp``, ``Wms_Irr``
        and ``Predict_Energy``.

    Raises:
        FileNotFoundError: the year directory or one of the five files is missing.
        KeyError: ``inv`` is not in ``inverters`` or an expected column is absent.
    """
    dirpath = os.path.join(data_root, str(year))
    filenames = os.listdir(dirpath)
    station = inverters[inv]

    # Expected Timestamp gap between consecutive rows.
    # NOTE(review): presumably one minute in milliseconds — confirm against the data source.
    sample_step = 60000

    energy = _read_signal_csv(dirpath, filenames, inv, r'_today_energy')
    energy = energy.assign(Energy=energy['Value'].diff())
    evenly_spaced = energy['Timestamp'].diff() == sample_step
    energy = (energy[evenly_spaced & (energy['Energy'] > 0)]
              .drop(columns=['Value'])
              .rename(columns={'Signal': 'Inverter'}))
    energy = energy.assign(Inverter=energy['Inverter'].str.split('_').str[0])
    _, energy_upper = _iqr_fences(energy['Energy'])
    energy = energy[energy['Energy'] < energy_upper]

    secondary = [
        _positive_readings(
            _read_signal_csv(dirpath, filenames, inv, r'_total_energy'),
            'Total_Energy'),
        _positive_readings(
            _read_signal_csv(dirpath, filenames, inv, r'_temp'),
            'Inv_Temp', divisor=10),
        _positive_readings(
            _read_signal_csv(dirpath, filenames, station, r'_temp'),
            'Wms_Temp', divisor=10),
        _positive_readings(
            _read_signal_csv(dirpath, filenames, station, r'_irradiance'),
            'Wms_Irr'),
    ]

    df = energy
    for readings in secondary:
        df = pd.merge(df, readings, on='Timestamp')

    # Outlier removal is sequential on purpose: each fence is computed on the
    # rows that survived the previous filter.
    _, total_upper = _iqr_fences(df['Total_Energy'])
    df = df[df['Total_Energy'] < total_upper]
    for column in ('Inv_Temp', 'Wms_Temp', 'Wms_Irr'):
        lower, upper = _iqr_fences(df[column])
        df = df[(df[column] > lower) & (df[column] < upper)]

    df = df.assign(Date=pd.to_datetime(df['Date'])).drop(columns=['Timestamp'])

    # Placeholder until a real prediction model exists: perturb the measured
    # energy by an independent random factor per row.
    noise = pd.Series([random.uniform(0.5, 1.5) for _ in range(len(df))],
                      index=df.index, dtype=float)
    df['Predict_Energy'] = df['Energy'] * noise

    return df
    
#     savepath = "/Users/qilu/Desktop/Advanticsys/Data/Output_Data/" 
        
#     return df.to_csv(savepath+str(year)+'_'+str(inv)+'.csv', index=False)

In [4]:
def singleCleanFile(year, outputpath="/Users/qilu/Desktop/Advanticsys/Data/Output_Data/"):
    """Clean every inverter for ``year`` and write one CSV per inverter.

    Calls ``dataClean(year, inv)`` for each key of the module-level
    ``inverters`` mapping (in insertion order) and saves the result as
    ``<outputpath>/<year>_<inv>.csv`` without the index.

    Args:
        year: Year passed through to ``dataClean`` and used in the file name.
        outputpath: Directory receiving the CSV files; created if missing.

    Returns:
        None.
    """
    # Create the target directory up front so a missing folder does not
    # surface only after the first (expensive) cleaning pass.
    os.makedirs(outputpath, exist_ok=True)

    # Iterate the shared mapping rather than a second hand-maintained list,
    # so the set of processed inverters cannot drift from what dataClean knows.
    for inv in inverters:
        df = dataClean(year, inv)
        target = os.path.join(outputpath, str(year) + '_' + str(inv) + '.csv')
        df.to_csv(target, index=False)
    return
    

In [5]:
def combineCleanFile(year, cleanFilesPath="/Users/qilu/Desktop/Advanticsys/Data/Output_Data/"):
    """Concatenate the cleaned CSVs of ``year`` into ``<year>_combine.csv``.

    Every file in ``cleanFilesPath`` whose name contains ``str(year)`` is read
    with ``pd.read_csv`` and stacked with ``pd.concat``. The output file itself
    is skipped, so re-running does not fold the previous result back in.
    Inputs are read in sorted name order, so the row order is reproducible.

    Args:
        year: Year whose files are combined; also used in the output name.
        cleanFilesPath: Directory holding the per-inverter CSVs and the output.

    Returns:
        The return value of ``DataFrame.to_csv`` for a path target (None).

    Raises:
        ValueError: no input file for ``year`` exists in ``cleanFilesPath``.
    """
    combined_name = str(year) + '_combine.csv'
    year_pattern = re.compile(re.escape(str(year)))

    cleanDataFiles = sorted(
        os.path.join(cleanFilesPath, name)
        for name in os.listdir(cleanFilesPath)
        # Skip our own output: it also contains the year in its name.
        if year_pattern.search(name) and name != combined_name
    )
    if not cleanDataFiles:
        raise ValueError(
            "no cleaned files for %s found in %s" % (year, cleanFilesPath))

    df_comb = pd.concat(
        pd.read_csv(path, index_col=None, header=0) for path in cleanDataFiles)

    return df_comb.to_csv(os.path.join(cleanFilesPath, combined_name), index=False)

In [6]:
# Clean each inverter's 2020 data and write one CSV per inverter.
singleCleanFile(2020)

In [7]:
# Stack the 2020 per-inverter CSVs into 2020_combine.csv (run after the cleaning cell).
combineCleanFile(2020)

In [8]:
# Clean each inverter's 2021 data and write one CSV per inverter.
singleCleanFile(2021)

In [9]:
# Stack the 2021 per-inverter CSVs into 2021_combine.csv (run after the cleaning cell).
combineCleanFile(2021)

In [10]:
# Merge every per-year "*combine*" CSV in the output folder into all_combine.csv.
FilesPath = "/Users/qilu/Desktop/Advanticsys/Data/Output_Data/"
filenames = os.listdir(FilesPath)

# The result file also contains "combine" in its name; leave it out so that
# re-running this cell does not append the previous result to itself.
all_combine_name = 'all_combine.csv'

# Sorted so the row order of the output does not depend on os.listdir order.
DataFiles = sorted(
    FilesPath + name
    for name in filenames
    if re.search('combine', name) and name != all_combine_name
)

df_all = pd.concat(pd.read_csv(path, index_col=None, header=0) for path in DataFiles)

df_all.to_csv(FilesPath + all_combine_name, index=False)