# Imports

In [11]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

## Reading file

In [12]:
# Load the raw training charge curves from the project's data directory.
train_path = '../data/chargecurves_train.parquet'
df = pd.read_parquet(train_path, engine='pyarrow')

In [None]:
# Render floats with two decimals so the summary statistics are easy to scan.
pd.options.display.float_format = '{:.2f}'.format
df.describe()


The summary shows that we have 3,960,349 observations from different charging sessions. Each session has a sub_id that runs from 0 to 39, which means that for each car we follow the changes in SOC and power over 40 minutes. Some observations are invalid: SOC can't be more than 100, as a car can't have more than 100% charge, and some power observations are abnormally high. We handle these issues below.

# Handling invalid data

## Remove invalid SOC over 100

In [14]:
# A state of charge above 100 is not a valid reading: replace it with NaN
# so it is treated as a missing value from here on.
soc_too_high = df['soc'] > 100
df['soc'] = df['soc'].mask(soc_too_high)


## Remove invalid power values with power higher than nominal power

In [None]:
# Collect every reading whose power exceeds the charger's nominal power by more than 1
exceeds_nominal = df['power'] > df['nominal_power'] + 1
df_over_nominal = (
    df.loc[exceeds_nominal, ['power', 'nominal_power', 'location_id', 'timestamp', 'id']]
    .assign(difference=lambda rows: rows['power'] - rows['nominal_power'])
)

# Drop whole sessions: every id with at least one such reading is removed
invalid_ids = df_over_nominal['id']
df = df[~df['id'].isin(invalid_ids)]


# Plot the histogram for the difference
fig = px.histogram(df_over_nominal, title='Power over nominal power', x='difference')
fig.show()




A charger can't deliver more power than its nominal_power. Therefore, a power observation that is higher than the nominal power is invalid, and we remove every charging session that contains such an observation. We have set a threshold of 1 over the nominal_power, as we believe a minimal difference won't impact model performance. We can see in the histogram that these differences have a large spread.

# Reshaping the dataFrame

In [16]:
def reshape_dataframe(df):
    """
    Pivot the long charge-curve table into one row per charging session.

    Parameters:
    - df (pd.DataFrame): Long-format data with the columns 'id', 'sub_id',
      'nominal_power', 'location_id', 'timestamp', 'soc' and 'power'.
      The frame is not modified.

    Returns:
    - pd.DataFrame: One row per 'id' with the columns 'id', 'nominal_power',
      'location_id', one 'soc_<sub_id>' and one 'power_<sub_id>' column per
      sub_id, followed by 'timestamp' (first timestamp of the session) and
      'month' (month of that first timestamp).
    """
    # One timestamp per session: the first one found for each id. The month is
    # derived from it, so a session that crosses a month boundary still maps to
    # exactly one row and the caller's DataFrame is left untouched.
    session_start = df.groupby('id')['timestamp'].first().reset_index()
    session_start['month'] = session_start['timestamp'].dt.month

    # Pivot the data to create explicit columns for soc and power
    pivot_df = df.pivot(index=['id', 'nominal_power', 'location_id'],
                        columns='sub_id',
                        values=['soc', 'power']).reset_index()

    # Flatten the two-level column names: ('soc', 3) -> 'soc_3'. The former
    # index columns have a non-numeric second level and keep their plain name.
    pivot_df.columns = [
        f"{col[0]}_{int(col[1])}"
        if isinstance(col[1], (int, float, np.integer, np.floating))
        else col[0]
        for col in pivot_df.columns
    ]

    # Attach the session timestamp and month to the pivoted rows
    return pivot_df.merge(session_start[['id', 'timestamp', 'month']], on='id')


In [None]:
# Pivot the long table with reshape_dataframe and display the result
df_reshaped = reshape_dataframe(df)
df_reshaped

We reshaped the data so that each row is one charging session.

## Filling in missing power and soc values 

In [None]:
def fill_and_remove_missing(df):
    """
    - For the columns 'power_0' to 'power_39' and 'soc_0' to 'soc_39':
      Fills a missing reading with the average of the previous and the next
      reading of the SAME row (i.e. the neighbouring columns), provided both
      neighbours are present. The first and the last reading of a row have
      only one neighbour and are therefore never filled.

    - After that, any remaining missing values in any column will cause the row to be removed.

    Parameters:
    - df (pd.DataFrame): The DataFrame with missing values, one row per
      charging session. It is not modified.

    Returns:
    - pd.DataFrame: The DataFrame with missing values filled and rows with remaining NaNs removed.
    """
    df = df.copy()  # Avoid modifying the original DataFrame

    for prefix in ['power', 'soc']:
        # Reading columns of this kind that exist, in sub_id (time) order
        columns = [f'{prefix}_{i}' for i in range(40) if f'{prefix}_{i}' in df.columns]
        if len(columns) < 3:
            continue  # No reading has both a previous and a next neighbour

        readings = df[columns].to_numpy(dtype=float, copy=True)

        # Average of the left and right neighbour for every interior reading.
        # Computed from the original values, so runs of consecutive gaps stay
        # NaN and the affected rows are dropped below.
        neighbour_mean = (readings[:, :-2] + readings[:, 2:]) / 2

        interior = readings[:, 1:-1]  # View: writing here updates `readings`
        gaps = np.isnan(interior)
        interior[gaps] = neighbour_mean[gaps]

        df[columns] = readings

    # Calculate number of rows before dropping NaNs
    rows_before = df.shape[0]

    # Remove rows with any remaining missing values
    df = df.dropna()

    # Calculate number of rows removed
    rows_removed = rows_before - df.shape[0]

    print(f'Number of rows removed due to missing values: {rows_removed}')

    return df

# Fill the gaps in the reshaped sessions and drop the rows that still contain missing values
df_cleaned = fill_and_remove_missing(df_reshaped)


We handle missing values by computing the average of the previous and the next power or SOC reading. After this is done, most of the charging sessions don't have any missing readings. With the number of remaining sessions being so low, we have decided to remove the charging sessions that still have missing values, for example because of multiple consecutive missing readings.

## Adding average temperature for each month

In [19]:
def add_temp_col(df):
    """
    Replace the 'month' column with a 'temperatur' column.

    Parameters:
    - df (pd.DataFrame): DataFrame with a 'month' column (1-12). It is not modified.

    Returns:
    - pd.DataFrame: A new DataFrame in which every row carries the value from
      the hard-coded monthly temperature table in 'temperatur' and the
      'month' column has been dropped.
    """
    monthly_temperature = pd.DataFrame({
        'month': list(range(1, 13)),
        'temperatur': [-4, -4, 0, 5, 10, 14, 17, 15, 11, 5, 1, -3],
    })
    # Left join keeps every input row, also those with an unknown month
    with_temperature = df.copy().merge(monthly_temperature, on='month', how='left')
    return with_temperature.drop(columns=['month'])

We have found a strong indication of a relationship between the temperature and the way the power behaves. Therefore, we have created a new variable that holds the average temperature of each month.

## Saving the cleaned file for further modeling

In [None]:
# Build the modelling table: add the monthly temperature, then strip the
# session id and timestamp before writing the result to disk.
df_cleaned_for_model = add_temp_col(df_cleaned).drop(columns=['id', 'timestamp'])
df_cleaned_for_model.to_parquet('../data/chargecurves_train_cleaned.parquet', engine='pyarrow')
df_cleaned_for_model

# Visualizations and data exploration

## Distribution of charging sessions over time

In [None]:
# Histogram over the 'timestamp' column of df_cleaned (one row per charging session)
fig = px.histogram(df_cleaned, x="timestamp", title='Number of charging sessions over time')
fig.show()

## Power consumption at different times of day

In [None]:
# Weighted histogram: every session adds its summed power to the bin of its hour
power_cols = [f'power_{step}' for step in range(40)]
df_cleaned['hour'] = df_cleaned['timestamp'].dt.hour
df_cleaned['total_power'] = df_cleaned[power_cols].sum(axis=1)

fig_hours, ax_hours = plt.subplots(figsize=(12, 6))
sns.histplot(data=df_cleaned, x='hour', weights='total_power', bins=24, kde=True, ax=ax_hours)

# Labels, title and ticks
ax_hours.set_xlabel("Hour of the Day")
ax_hours.set_ylabel("Total Power Consumption")
ax_hours.set_title("Total Power Consumption at Different Times of the Day")
ax_hours.set_xticks(range(0, 24))  # One tick per hour
ax_hours.grid(axis='y', linestyle='--', alpha=0.7)

# Render the figure
plt.show()

## Overview of distributions between different nominal chargers

In [None]:
# Map a charger's nominal power to a coarse power class
def categorize_nominal_power(nominal_power):
    """
    Return the power-class label for a nominal power value.

    The upper bounds are inclusive: values up to 100 are 'Low Power', up to
    150 'Medium Power', up to 300 'High Power'. Everything else falls into
    'Very High Power'.
    """
    upper_bounds = (
        (100, 'Low Power 0-100 DC'),
        (150, 'Medium Power 101-150 DC'),
        (300, 'High Power 151-300 DC'),
    )
    for bound, label in upper_bounds:
        if nominal_power <= bound:
            return label
    return 'Very High Power 301+ DC'

# Label every session with the power class of its charger
df_cleaned['nominal_power_category'] = df_cleaned['nominal_power'].apply(categorize_nominal_power)

# Copy of the sessions ordered by nominal power
df_sorted = df_cleaned.sort_values(by='nominal_power')

# Category -> slice colour; the key order doubles as the legend order
category_colors = {
    'Low Power 0-100 DC': 'lightblue',
    'Medium Power 101-150 DC': 'lightskyblue',
    'High Power 151-300 DC': 'yellowgreen',
    'Very High Power 301+ DC': 'darkred',
}

# Pie chart with one slice per power class, sized by the number of sessions
fig_pie = px.pie(
    df_cleaned,
    names='nominal_power_category',
    title="Distribution of Nominal Power Categories",
    labels={"nominal_power_category": "Nominal Power Category"},
    color='nominal_power_category',
    color_discrete_map=category_colors,
    category_orders={"nominal_power_category": list(category_colors)},
)

# Fixed 600 x 600 canvas
fig_pie.update_layout(width=600, height=600)

fig_pie.show()



In [None]:
# Reading columns used for the trends (power from step 1, SOC from step 0)
power_columns = [f'power_{step}' for step in range(1, 40)]  # 'power_1' to 'power_39'
soc_columns = [f'soc_{step}' for step in range(0, 40)]  # 'soc_0' to 'soc_39'

# Long format: one row per reading, with the source column name in 'minute'
long_power_df = df_cleaned.melt(id_vars=[], value_vars=power_columns,
                                var_name='minute', value_name='power')
long_soc_df = df_cleaned.melt(id_vars=[], value_vars=soc_columns,
                              var_name='minute', value_name='soc')

# Turn the column names into integer minutes (e.g. 'power_1' -> 1, 'soc_0' -> 0)
long_power_df['minute'] = long_power_df['minute'].str.extract('(\\d+)').astype(int)
long_soc_df['minute'] = long_soc_df['minute'].str.extract('(\\d+)').astype(int)

# Median power and mean SOC per minute across all sessions
overall_median = long_power_df.groupby('minute')['power'].median().reset_index()
soc_trend = long_soc_df.groupby('minute')['soc'].mean().reset_index()

# Both trends as line traces in one figure
power_trace = go.Scatter(x=overall_median['minute'], y=overall_median['power'],
                         mode='lines', name='Power trend',
                         line={'color': 'blue'})
soc_trace = go.Scatter(x=soc_trend['minute'], y=soc_trend['soc'],
                       mode='lines', name='SOC Trend',
                       line={'color': 'green'})
fig = go.Figure(data=[power_trace, soc_trace])

# Titles and legend
fig.update_layout(
    title="Overall Median Power and SOC Trend Over Time",
    xaxis_title="Minute",
    yaxis_title="Value",
    legend_title="Power vs SOC",
)

fig.show()


## Median power for each season

In [None]:
# Make sure 'timestamp' has a datetime dtype before its .dt accessor is used
# (a no-op conversion if the column already holds datetimes)
df_cleaned['timestamp'] = pd.to_datetime(df_cleaned['timestamp'])

# Translate a month number into the name of its season
def assign_season(month):
    """
    Return the season for a month number.

    12, 1, 2 -> 'Winter'; 3-5 -> 'Spring'; 6-8 -> 'Summer'; every other
    value -> 'Fall'.
    """
    season_by_month = {
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
    }
    return season_by_month.get(month, 'Fall')

# Derive the season of every session from the month of its timestamp
df_cleaned['season'] = df_cleaned['timestamp'].dt.month.apply(assign_season)

# Reading columns (power from step 1, SOC from step 0)
power_columns = [f'power_{step}' for step in range(1, 40)]  # 'power_1' to 'power_39'
soc_columns = [f'soc_{step}' for step in range(0, 40)]  # 'soc_0' to 'soc_39'

# Seasons present in the data, in alphabetical order
different_seasons = sorted(df_cleaned['season'].unique())

# Qualitative palette; colours are reused cyclically if there are more lines than colours
colors = px.colors.qualitative.Set1

fig = go.Figure()

# One median-power line per season
for idx, season in enumerate(different_seasons):
    # Sessions that belong to this season
    in_season = df_cleaned['season'] == season
    filtered_df = df_cleaned[in_season]

    # Long format: one row per power reading, source column name in 'minute'
    long_power_df = filtered_df.melt(id_vars=['season'], value_vars=power_columns,
                                     var_name='minute', value_name='power')

    # Keep only the digits of the column name (e.g. 'power_1' -> '1')
    minute_digits = long_power_df['minute'].astype(str).str.extract(r'(\d+)')
    long_power_df['minute'] = minute_digits

    # Rows without a minute cannot be converted to int, so drop them first
    long_power_df = long_power_df.dropna(subset=['minute'])
    long_power_df['minute'] = long_power_df['minute'].astype(int)

    # Median power per minute within this season
    median_power = long_power_df.groupby('minute')['power'].median().reset_index()

    season_color = colors[idx % len(colors)]
    fig.add_trace(go.Scatter(x=median_power['minute'], y=median_power['power'],
                             mode='lines', name=f'Median Power {season}',
                             line={'color': season_color}))

# SOC readings in long format with integer minutes.
# NOTE(review): long_soc_df is prepared here but not added to the figure in this cell.
long_soc_df = df_cleaned.melt(id_vars=[], value_vars=soc_columns,
                              var_name='minute', value_name='soc')
long_soc_df['minute'] = long_soc_df['minute'].str.extract(r'(\d+)')
long_soc_df = long_soc_df.dropna(subset=['minute'])
long_soc_df['minute'] = long_soc_df['minute'].astype(int)

# Titles and legend
fig.update_layout(
    title="Median Power for Each Season",
    xaxis_title="Minute",
    yaxis_title="Value",
    legend_title="Seasons",
)

fig.show()


## Median power for each month

In [None]:
# Power reading columns that are plotted
power_columns = [f'power_{step}' for step in range(1, 40)]  # 'power_1' to 'power_39'

# Months present in the 'month' column of df_cleaned, ascending
different_months = sorted(df_cleaned['month'].unique())

# Legend label per month
month_names = {
    1: 'January avg. temp. -4',
    2: 'February avg. temp. -4',
    3: 'March avg. temp. 0',
    4: 'April avg. temp. 5',
    5: 'May avg. temp. 10',
    6: 'June avg. temp. 14',
    7: 'July avg. temp. 17',
    8: 'August avg. temp. 15',
    9: 'September avg. temp. 11',
    10: 'October avg. temp. 5',
    11: 'November avg. temp. 1',
    12: 'December avg. temp. -3',
}

# Average temperature per month (same values as in the labels above)
month_temperatures = dict(zip(range(1, 13), [-4, -4, 0, 5, 10, 14, 17, 15, 11, 5, 1, -3]))

fig = go.Figure()

# Temperature range used to normalise a month's temperature to [0, 1]
min_temp = min(month_temperatures.values())
max_temp = max(month_temperatures.values())
temp_span = max_temp - min_temp

# Reversed 'RdBu' diverging scale; a normalised temperature picks an entry from it
reversed_rdbu = px.colors.diverging.RdBu[::-1]
last_color_index = len(px.colors.diverging.RdBu) - 1

# One median-power line per month
for idx, month in enumerate(different_months):
    # Sessions that belong to this month
    filtered_df = df_cleaned[df_cleaned['month'] == month]

    # Long format: one row per power reading, source column name in 'minute'
    long_power_df = filtered_df.melt(id_vars=['month'], value_vars=power_columns,
                                     var_name='minute', value_name='power')

    # Column name -> integer minute (e.g. 'power_1' -> 1)
    long_power_df['minute'] = long_power_df['minute'].astype(str).str.extract(r'(\d+)').astype(int)

    # Median power per minute within this month
    median_power = long_power_df.groupby('minute')['power'].median().reset_index()

    # Position of this month's temperature within the overall range
    avg_temp = month_temperatures[month]
    norm_temp = (avg_temp - min_temp) / temp_span

    # Months with the same average temperature get the same colour
    color = reversed_rdbu[int(norm_temp * last_color_index)]

    fig.add_trace(go.Scatter(x=median_power['minute'], y=median_power['power'],
                             mode='lines', name=month_names[month],
                             line={'color': color}))

# Titles and legend
fig.update_layout(
    title="Median Power for Each Month colored by average temperature",
    xaxis_title="Minute",
    yaxis_title="Power",
    legend_title="Months and average temperature",
)

fig.show()
