In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Read the dataframe
df = pd.read_csv('data/order_export_20250930T20240405.csv')


In [2]:
# For each store, create a time series DataFrame where each column is a product (by name)
# and each cell is the revenue for that product on that day. Fill missing values with 0.

# Convert 'updated' to datetime and extract date only
df['date'] = pd.to_datetime(df['updated']).dt.date

# Get the full date range in the data
all_dates = pd.date_range(df['date'].min(), df['date'].max())

# Get all unique stores
stores = df['store'].unique()

for store in stores:
    # Filter data for the current store
    store_df = df[df['store'] == store]
    # Pivot: index=date, columns=product name, values=sum of line_price
    pivot = store_df.pivot_table(
        index='date',
        columns='name',
        values='line_price',
        aggfunc='sum',
        fill_value=0
    )
    # Reindex to include all dates, fill missing with 0
    pivot = pivot.reindex(all_dates.date, fill_value=0)
    pivot.index.name = 'date'
    # Save to CSV
    filename = f"data/{store.split()[0]}_time_series.csv"
    pivot.reset_index().to_csv(filename, index=False)


In [3]:
#read bosund/data/weather/weather_jakobstad.csv
weather_df = pd.read_csv('bosund/data/weather/weather_jakobstad.csv')

# Drop all columns in df that have only NaN values
weather_df = weather_df.dropna(axis=1, how='all')


In [4]:
weather_df.head()

Unnamed: 0,Time,Air temperature,Dew-point temperature,Gust speed,Pressure (msl),Relative humidity,Wind direction,Wind speed
0,2023-10-03 20:10:00,10.3,9.0,5.5,997.9,92.0,198.0,5.0
1,2023-10-03 20:20:00,10.3,9.1,5.4,997.8,93.0,190.0,4.5
2,2023-10-03 20:30:00,10.3,9.1,4.6,997.7,92.0,195.0,3.8
3,2023-10-03 20:40:00,10.4,9.1,3.4,997.6,92.0,199.0,3.0
4,2023-10-03 20:50:00,10.3,9.2,3.6,997.6,93.0,178.0,3.1


In [5]:
# analyse weather_df
weather_df.describe()
weather_df.info()
weather_df.isnull().sum()
weather_df.isnull().sum().sum()
weather_df.isnull().sum().sum() / len(weather_df)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207919 entries, 0 to 207918
Data columns (total 8 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Time                   207919 non-null  object 
 1   Air temperature        105120 non-null  float64
 2   Dew-point temperature  105120 non-null  float64
 3   Gust speed             207909 non-null  float64
 4   Pressure (msl)         105119 non-null  float64
 5   Relative humidity      105120 non-null  float64
 6   Wind direction         207909 non-null  float64
 7   Wind speed             207909 non-null  float64
dtypes: float64(7), object(1)
memory usage: 12.7+ MB


np.float64(1.9778230945704818)

In [12]:
#drop nan rows in weather_df
weather_df = weather_df.dropna()
# Drop all rows before 2024-04-05
weather_df = weather_df[weather_df['Time'] >= '2024-04-05']
# Instead of filtering for 12:00:00, calculate the daily average for each column
weather_df['Date'] = weather_df['Time'].str[:10]
# Exclude the 'Time' column from averaging, keep only numeric columns
numeric_cols = weather_df.select_dtypes(include='number').columns
weather_df = weather_df.groupby('Date')[numeric_cols].mean().reset_index()
weather_df.rename(columns={'Date': 'Time'}, inplace=True)

weather_df



Unnamed: 0,Time,Air temperature,Dew-point temperature,Gust speed,Pressure (msl),Relative humidity,Wind direction,Wind speed
0,2024-04-05,-2.0,-9.8,10.5,1014.9,55.0,122.0,8.6
1,2024-04-06,-2.4,-7.2,5.8,1011.5,70.0,256.0,4.6
2,2024-04-07,1.7,1.4,8.7,998.2,98.0,178.0,7.8
3,2024-04-08,4.3,1.7,10.2,1002.4,83.0,208.0,9.4
4,2024-04-09,6.0,2.5,5.5,1010.4,78.0,247.0,4.7
...,...,...,...,...,...,...,...,...
541,2025-09-28,13.3,10.0,5.5,1034.9,81.0,206.0,4.9
542,2025-09-29,12.1,6.6,9.7,1034.5,69.0,204.0,8.7
543,2025-09-30,10.9,4.8,10.4,1035.2,66.0,209.0,8.8
544,2025-10-01,10.6,2.6,10.5,1036.3,58.0,192.0,9.0


In [11]:
# Save weather_df to csv
weather_df.to_csv('bosund/data/weather/weather_jakobstad_cleaned.csv', index=False)