In [7]:
import pandas as pd
import numpy as np

# Paths to data files
path_train_data = '../../0raw_data/2010-2011 Solar home electricity data.csv'
path_eval_data = '../../0raw_data/2011-2012 Solar home electricity data.csv'
path_test_data = '../../0raw_data/2012-2013 Solar home electricity data.csv'

# Function to load data
def loadTrainEvalTest(pathTrain, pathEval, pathTest):
    df_load_train = pd.read_csv(pathTrain, header=1, parse_dates=['date'], date_format="%d/%m/%y")
    df_load_eval = pd.read_csv(pathEval, header=1, parse_dates=['date'], date_format="%d/%m/%y")
    df_load_test = pd.read_csv(pathTest, header=1, parse_dates=['date'], date_format="%d/%m/%y")
    return pd.concat([df_load_train, df_load_eval, df_load_test])

# Function to combine controlled and general consumption
def combineControlledAndGeneralConsumption(df):
    if 'CL' in df['Consumption Category'].values:
        combinedLoad = pd.DataFrame(columns=df.columns)
        cl = df[df['Consumption Category'] == 'CL'].set_index(pd.RangeIndex(0, len(df)//2))
        gc = df[df['Consumption Category'] == 'GC'].set_index(pd.RangeIndex(0, len(df)//2))
        clgc = pd.concat([cl, gc])
        for i in range(len(cl)):
            sum = clgc.loc[i].sum()
            sum['date'] = i
            combinedLoad.loc[i] = sum
        final_load = combinedLoad
    else:
        final_load = df[df['Consumption Category'] == 'GC']
    
    load_array = np.array(final_load.drop(columns=['Consumption Category', 'date'])).flatten()
    return pd.DataFrame({"Load": load_array}) 

# Function to process data for a specific customer
def processCustomerData(df, customer):
    customer_df = df[df['Customer'] == customer].drop(columns=['Customer'])
    processed_df = combineControlledAndGeneralConsumption(customer_df)
    
    pv_array = np.array(customer_df[customer_df['Consumption Category'] == 'GG'].drop(columns=['Consumption Category', 'date']).set_index(pd.RangeIndex(0, len(processed_df)))).flatten()
    processed_df.insert(1, "PV", pv_array)
    
    # Rename columns with customer number
    processed_df.columns = [f"{col}_{customer}" for col in processed_df.columns]
    
    return processed_df

# Load data
df = loadTrainEvalTest(path_train_data, path_eval_data, path_test_data)

# Remove unnecessary data
df = df.drop(columns=['Generator Capacity', 'Postcode', 'Row Quality'])

# Initialize an empty DataFrame to store concatenated results
all_customers_df = pd.DataFrame()

# Process data for all customers (1 to 300)
for customer in range(1, 301):
    processed_df = processCustomerData(df, customer)
    if all_customers_df.empty:
        all_customers_df = processed_df
    else:
        all_customers_df = pd.concat([all_customers_df, processed_df], axis=1)

# Save the concatenated DataFrame
#all_customers_df.to_csv("../processed_load_pv_all_customers.csv", sep=',', index=False, encoding='utf-8')


ValueError: Length mismatch: Expected 1096 rows, received array of length 1644

In [2]:
import pandas as pd
import numpy as np

path_train_data = '../../0raw_data/2010-2011 Solar home electricity data.csv'
path_eval_data = '../../0raw_data/2011-2012 Solar home electricity data.csv'
path_test_data = '../../0raw_data/2012-2013 Solar home electricity data.csv'

def loadTrainEvalTest(pathTrain, pathEval, pathTest):
    df_load_train = pd.read_csv(pathTrain, header=1, parse_dates=['date'], date_format="%d/%m/%y")
    df_load_eval = pd.read_csv(pathEval, header=1, parse_dates=['date'], date_format="%d/%m/%y")
    df_load_test = pd.read_csv(pathTest, header=1, parse_dates=['date'], date_format="%d/%m/%y")

    return pd.concat([df_load_train, df_load_eval, df_load_test])

def combineControlledAndGeneralConsumption(df):
    if (df == 'CL').any().any():
        combinedLoad = pd.DataFrame(columns=df.columns)
        cl = df[df['Consumption Category'] == 'CL'].set_index(pd.RangeIndex(0, 1096))
        gc = df[df['Consumption Category'] == 'GC'].set_index(pd.RangeIndex(0, 1096))
        clgc = pd.concat([cl,gc])
        for i in range(0,1096):
            sum = clgc.loc[i].sum()
            sum['date'] = i
            combinedLoad.loc[i] = sum
        final_load = combinedLoad
    else:
        final_load = df[df['Consumption Category']=='GC']
    
    load_array = np.array(final_load.drop(columns=['Consumption Category', 'date'])).flatten()
    return pd.DataFrame({"Load":load_array}) 

# Load data
df = loadTrainEvalTest(path_train_data,path_eval_data,path_test_data)

# Remove unneccessary data
df = df.drop(columns=['Generator Capacity', 'Postcode', 'Row Quality'])

# Choose customer
customer = 1
customer_df = df[df['Customer']==customer].drop(columns=['Customer'])

# Combine controlled and general consumption if needed
proccessed_df = combineControlledAndGeneralConsumption(customer_df)

# Get pv prodcution
pv_array = np.array(customer_df[customer_df['Consumption Category']=='GG'].drop(columns=['Consumption Category', 'date']).set_index(pd.RangeIndex(0, 1096))).flatten()
proccessed_df.insert(1,"PV",pv_array)

# Save df
proccessed_df.to_csv("../processed_load_pv_"+str(customer)+".csv", sep=',', index=False, encoding='utf-8')

In [6]:
# Assuming loadTrainEvalTest is a function that loads the datasets
df = loadTrainEvalTest(path_train_data, path_eval_data, path_test_data)

# Initialize an empty DataFrame for the concatenated results
final_df = pd.DataFrame()

for customer in range(1, 301):
    # Filter data for the current customer and remove unnecessary columns
    customer_df = df[df['Customer'] == customer].drop(columns=['Generator Capacity', 'Postcode', 'Row Quality', 'Customer'])
    
    # Process data (combine controlled and general consumption if needed)
    processed_df = combineControlledAndGeneralConsumption(customer_df)
    
    # Extract PV production data
    pv_array = np.array(processed_df[processed_df['Consumption Category'] == 'GG'].drop(columns=['Consumption Category', 'date']).set_index(pd.RangeIndex(len(processed_df)))).flatten()
    
    # Insert PV data into the DataFrame and rename the load column
    processed_df.insert(1, f"PV_{customer}", pv_array)
    processed_df.rename(columns={'Load': f'Load_{customer}'}, inplace=True)
    
    # If this is the first customer, initialize final_df with processed_df
    if customer == 1:
        final_df = processed_df
    else:
        # For subsequent customers, merge the data on the index (assumes matching time intervals)
        final_df = pd.concat([final_df, processed_df[[f"PV_{customer}", f"Load_{customer}"]]], axis=1)

# Save the concatenated DataFrame to CSV
#final_df.to_csv("../processed_load_pv_all_customers.csv", sep=',', index=False, encoding='utf-8')

KeyError: 'Consumption Category'