In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import pyarrow.parquet as pq
from scipy.stats import f_oneway
from sklearn.model_selection import train_test_split
from statsmodels.stats.multicomp import pairwise_tukeyhsd


In [2]:
# Seed NumPy's global random generator so stochastic steps are repeatable.
np.random.seed(42)

# Lift pandas' display truncation: show every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
# Location of the training parquet file. The default is the original
# machine-specific path; set the CEDAS_TRAIN_PARQUET environment variable to
# run the notebook on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "CEDAS_TRAIN_PARQUET",
    "/Users/asegrethe/Documents/CEDASHackathon2025/src/data/cedas2025_material/data/chargecurves_train.parquet",
)

data = pd.read_parquet(DATA_PATH)
data.head()


In [4]:
# Boolean mask: True for every row whose power reading is above 500.
power = data['power'].gt(500)


Fjerner hvis SOC er mer enn 100, ettersom vi ikke kan ha mer enn 100 %.

In [None]:
# Number of rows whose SOC reading is above 100.
soc = data['soc'].gt(100).sum()
soc

In [None]:
# Sessions (ids) that contain at least one SOC reading above 100.
ids_above_100 = data.loc[data['soc'].gt(100), 'id'].unique()

# Histogram of every SOC reading belonging to those sessions
# (all of their readings, not only the ones above 100).
px.histogram(
    data.loc[data['id'].isin(ids_above_100), 'soc'],
    title='SOC values above 100%',
)

In [7]:
def soc_outliers(data):
    """Find the ids that have out-of-range SOC readings.

    Parameters
    ----------
    data : DataFrame with at least the columns ``'id'`` and ``'soc'``.

    Returns
    -------
    tuple
        ``(ids_above_100, ids_below_0)``: the unique ``id`` values that have
        at least one ``soc`` value above 100, and the unique ``id`` values
        that have at least one ``soc`` value below 0.
    """
    soc = data['soc']
    too_high = data.loc[soc > 100, 'id'].unique()
    too_low = data.loc[soc < 0, 'id'].unique()
    return too_high, too_low

Vanlig power: finnes det unormale verdier som bør fjernes?

In [None]:
# Central tendency and maximum of the measured power.
mean_power = data['power'].mean()
median_power = data['power'].median()
max_power = data['power'].max()

# Largest nominal power in the data set, for comparison with max_power.
nominal_max = data['nominal_power'].max()

# Display all four values together as the cell output.
(mean_power, median_power, max_power, nominal_max)



In [None]:
# Distribution of the power readings before any filtering is applied.
fig = px.histogram(
    data_frame=data,
    x="power",
    nbins=50,
    title="Distribution of Power-values",
)
fig.show()

In [None]:
# Distribution of the nominal_power column.
fig = px.histogram(
    data_frame=data,
    x="nominal_power",
    nbins=50,
    title="Distribution of Nominal Power-values",
)
fig.show()

Outliers: power-verdier over 500 holdes utenfor i histogrammet under.

In [None]:
# Same histogram as before, restricted to rows with power <= 500.
fig = px.histogram(
    data_frame=data.loc[data['power'].le(500)],
    x="power",
    nbins=50,
    title="Distributions of Power (without outliers)",
)
fig.show()

In [12]:
# Keep only rows with power <= 500; every later cell works on this filtered frame.
# NOTE: the comparison is False for NaN, so rows with a missing power value
# are dropped here as well.
data = data.loc[data['power'].le(500)]

Sjekker om power er høyere enn nominal power; i de tilfellene setter vi power ned til nominal power.

In [13]:
def power_nominal(data):
    """Cap ``power`` at ``nominal_power``.

    Parameters
    ----------
    data : pandas.DataFrame, or a single row (Series / mapping)
        Must provide the keys ``'power'`` and ``'nominal_power'``.

    Returns
    -------
    Same kind of object as ``data``.
        * DataFrame input: a new DataFrame in which every ``power`` value
          greater than its row's ``nominal_power`` is replaced by that
          ``nominal_power``. The input frame is left untouched.
        * Row input (e.g. from ``df.apply(power_nominal, axis=1)``): the same
          row object, with ``power`` overwritten in place when it exceeds
          ``nominal_power``.

    Rows where either value is NaN are left unchanged, because the ``>``
    comparison is False for NaN.
    """
    if isinstance(data, pd.DataFrame):
        # A plain `if` on a Series comparison raises ValueError (ambiguous
        # truth value), so whole frames are handled element-wise instead.
        exceeds = data['power'] > data['nominal_power']
        capped_power = data['power'].mask(exceeds, data['nominal_power'])
        return data.assign(power=capped_power)

    # Single row: scalar comparison, same behaviour as before.
    if data['power'] > data['nominal_power']:
        data['power'] = data['nominal_power']
    return data


In [None]:
# Descriptive statistics of `data` as it stands after the power <= 500 filter above.
data.describe()

In [None]:
# All rows that belong to id 1.
data.loc[data['id'] == 1]

## Line plot for id 1

In [None]:
# Power over time for id 1.
px.line(
    data_frame=data.loc[data['id'] == 1],
    x='timestamp',
    y='power',
    title="Charging Curve",
)

In [None]:
# Rows of id 1, kept in `soc_values` and plotted as SOC over time.
soc_values = data.loc[data['id'].eq(1)]
px.line(
    data_frame=soc_values,
    x='timestamp',
    y='soc',
    title="SOC Curve",
)

In [None]:
# Independent copy of the rows for id 1 with `timestamp` parsed to datetime,
# so the original `data` frame is not modified.
filtered_data = data.loc[data['id'] == 1].assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

# SOC and power drawn against the same time axis.
fig = px.line(
    data_frame=filtered_data,
    x='timestamp',
    y=['soc', 'power'],
    title="SOC & Charging Power Overlap",
)
fig.show()

### plot the time and the power

In [None]:
# One row per (nominal_power, location_id) pair: number of non-null ids,
# mean power and mean SOC within that group.
grouped_summary = data.groupby(['nominal_power', 'location_id']).agg(
    count=pd.NamedAgg(column='id', aggfunc='count'),
    avg_power=pd.NamedAgg(column='power', aggfunc='mean'),
    avg_soc=pd.NamedAgg(column='soc', aggfunc='mean'),
)

# Distribution of the per-group mean power.
fig = px.histogram(
    data_frame=grouped_summary,
    x="avg_power",
    title="Histogram of Average Power",
    nbins=30,
)
fig.show()

In [None]:
# Overlaid histograms of the power and soc columns; the traces are made
# semi-transparent so both distributions stay visible.
fig_train = px.histogram(
    data_frame=data,
    x=['power', 'soc'],
    title='power vs soc',
    barmode='overlay',
).update_traces(opacity=0.6)
fig_train.show()


In [None]:
# Per-id number of missing values in each of the columns id, soc and power.
is_missing = data[['id', 'soc', 'power']].isna()
nan_counts = is_missing.groupby(data['id']).sum()

# Ids for which at least one of those columns has more than 15 missing values.
over_limit = nan_counts.gt(15).any(axis=1)
problematic_ids = nan_counts.loc[over_limit].index
problematic_ids.tolist()

Are there "groups of sessions" that look similar in any way?

In [None]:
# Power histogram with one colour per location_id.
fig = px.histogram(
    data_frame=data,
    x="power",
    color="location_id",
    title="Power Distribution by Charging Station",
)
fig.show()

In [None]:
# `f_oneway` is imported in the notebook's import cell.

# One array of power readings per location_id.
groups = [group['power'].values for _, group in data.groupby('location_id')]

# One-way ANOVA across the locations: a small p-value indicates that the
# mean power differs between at least two locations.
anova_result = f_oneway(*groups)
print(f"ANOVA p-value: {anova_result.pvalue}")

In [None]:
# `pairwise_tukeyhsd` is imported in the notebook's import cell.

# Tukey HSD post-hoc test: pairwise comparison of mean power between
# locations at a 5% significance level.
tukey = pairwise_tukeyhsd(endog=data['power'], groups=data['location_id'], alpha=0.05)

# Print the pairwise comparison table.
print(tukey)

In [None]:
# Mean power per location_id, ordered from lowest to highest.
avg_power_by_location = (
    data
    .groupby('location_id')['power']
    .mean()
    .sort_values()
)
print(avg_power_by_location)

## Split the data

In [24]:
# Full (filtered) frame that the three splits are drawn from.
train_data_full = data

# Unique ids. NOTE: the name `id` shadows the built-in id(); it is kept
# unchanged in case later cells refer to it.
id = data['id'].unique()

# Split on ids rather than on individual rows, so that all readings of one id
# land in exactly one of train / validation / test. A row-level split would
# spread readings with the same id across all three sets and leak information
# between them.
# Proportions: 70% train, then the remaining 30% is halved into test and
# validation (15% each). The earlier train_size=0.70 / test_size=0.15
# combination silently discarded 15% of the data.
# random_state matches the notebook seed, so the split does not depend on how
# many random numbers earlier cells have consumed.
train_ids, holdout_ids = train_test_split(
    id, train_size=0.70, test_size=0.30, random_state=42
)
test_ids, validation_ids = train_test_split(
    holdout_ids, train_size=0.5, random_state=42
)

train = train_data_full[train_data_full['id'].isin(train_ids)]
validation = train_data_full[train_data_full['id'].isin(validation_ids)]
test = train_data_full[train_data_full['id'].isin(test_ids)]

# Sanity check: every row ended up in exactly one split.
assert len(train) + len(validation) + len(test) == len(train_data_full)
