# Preprocessing dataset

### Data
Historical Finnish Fuel Data.

From **FINSTAT / Tilastokeskus**, Finland’s national statistical authority.

### Preprocessing
Remove unwanted columns

Indexing

Convert types with `pd.to_datetime` and `pd.to_numeric`

Create separate dataframes for gasoline and diesel

**Export as CSV files**

In [27]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [28]:
# Load the raw fuel-price CSV into a dataframe.
# - skiprows=1 skips the first line of the file (presumably a title line
#   above the real header row; verify against the raw file).
# - encoding='latin1' decodes the file as Latin-1; the Finnish characters
#   in the fuel names (e.g. "ö") show up intact in the displayed output.
full_data = pd.read_csv(
    'raw_data/polttoneste_hinnat1988-2025.csv',
    encoding='latin1',
    skiprows=1,
)
full_data

Unnamed: 0,Kuukausi,Polttoneste,Hinta,"Hinta, vuosimuutos (%)"
0,1988M01,"Moottoribensiini 95 E 10, snt/l",56.0,.
1,1988M01,"Dieselöljy, snt/l",40.0,.
2,1988M01,"Kevyt polttoöljy, snt/l",16.0,.
3,1988M01,"Kevyt polttoöljy, eur/MWh",16.1,.
4,1988M02,"Moottoribensiini 95 E 10, snt/l",56.0,.
...,...,...,...,...
1783,2025M02,"Kevyt polttoöljy, eur/MWh",132.4,-11.4
1784,2025M03,"Moottoribensiini 95 E 10, snt/l",179.0,-6.8
1785,2025M03,"Dieselöljy, snt/l",179.0,-7.7
1786,2025M03,"Kevyt polttoöljy, snt/l",126.0,-12.5


### Preprocessing full data

In [29]:
# Discard the year-over-year change column; only the price itself is kept.
full_data = full_data.drop('Hinta, vuosimuutos (%)', axis='columns')

In [30]:
# Parse the month labels (e.g. "1988M01") into timestamps.
# The format "%YM%m" reads a four-digit year, a literal "M", then the month.
month_labels = full_data['Kuukausi']
dates = pd.to_datetime(month_labels, format="%YM%m")

In [31]:
# Re-index the frame by the parsed dates, then discard the original
# 'Kuukausi' text column, which the new index makes redundant.
full_data = full_data.set_index(dates).drop(columns=['Kuukausi'])

In [32]:
# Make sure the price column ('Hinta') holds numbers. With errors='coerce',
# any value that cannot be parsed becomes NaN instead of raising.
numeric_prices = pd.to_numeric(full_data['Hinta'], errors='coerce')
full_data['Hinta'] = numeric_prices
full_data

Unnamed: 0_level_0,Polttoneste,Hinta
Kuukausi,Unnamed: 1_level_1,Unnamed: 2_level_1
1988-01-01,"Moottoribensiini 95 E 10, snt/l",56.0
1988-01-01,"Dieselöljy, snt/l",40.0
1988-01-01,"Kevyt polttoöljy, snt/l",16.0
1988-01-01,"Kevyt polttoöljy, eur/MWh",16.1
1988-02-01,"Moottoribensiini 95 E 10, snt/l",56.0
...,...,...
2025-02-01,"Kevyt polttoöljy, eur/MWh",132.4
2025-03-01,"Moottoribensiini 95 E 10, snt/l",179.0
2025-03-01,"Dieselöljy, snt/l",179.0
2025-03-01,"Kevyt polttoöljy, snt/l",126.0


### Preprocessing gasoline data

In [33]:
# Keep only the rows whose fuel label is 95 E10 gasoline (snt/l).
is_gasoline = full_data['Polttoneste'] == 'Moottoribensiini 95 E 10, snt/l'
gas_price_data = full_data[is_gasoline]
gas_price_data

Unnamed: 0_level_0,Polttoneste,Hinta
Kuukausi,Unnamed: 1_level_1,Unnamed: 2_level_1
1988-01-01,"Moottoribensiini 95 E 10, snt/l",56.0
1988-02-01,"Moottoribensiini 95 E 10, snt/l",56.0
1988-03-01,"Moottoribensiini 95 E 10, snt/l",56.0
1988-04-01,"Moottoribensiini 95 E 10, snt/l",56.0
1988-05-01,"Moottoribensiini 95 E 10, snt/l",56.3
...,...,...
2024-11-01,"Moottoribensiini 95 E 10, snt/l",175.0
2024-12-01,"Moottoribensiini 95 E 10, snt/l",176.0
2025-01-01,"Moottoribensiini 95 E 10, snt/l",179.0
2025-02-01,"Moottoribensiini 95 E 10, snt/l",178.0


In [34]:
# Every remaining row carries the same fuel label, so drop that column
# and keep only the price.
gas_price_data = gas_price_data.drop('Polttoneste', axis='columns')
gas_price_data.head(1)

Unnamed: 0_level_0,Hinta
Kuukausi,Unnamed: 1_level_1
1988-01-01,56.0


### Preprocessing diesel data

In [35]:
# Keep only the rows whose fuel label is diesel (snt/l).
is_diesel = full_data['Polttoneste'] == 'Dieselöljy, snt/l'
diesel_price_data = full_data[is_diesel]
diesel_price_data.head(3)

Unnamed: 0_level_0,Polttoneste,Hinta
Kuukausi,Unnamed: 1_level_1,Unnamed: 2_level_1
1988-01-01,"Dieselöljy, snt/l",40.0
1988-02-01,"Dieselöljy, snt/l",40.0
1988-03-01,"Dieselöljy, snt/l",40.0


In [36]:
# Every remaining row carries the same fuel label, so drop that column
# and keep only the price.
diesel_price_data = diesel_price_data.drop('Polttoneste', axis='columns')
diesel_price_data.head(1)

Unnamed: 0_level_0,Hinta
Kuukausi,Unnamed: 1_level_1
1988-01-01,40.0


## Export datasets

- dataset with all content

- diesel only

- gasoline only

In [37]:
# Export the full preprocessed dataset (date index included) as CSV.
# Create the output directory first: to_csv raises OSError when the
# target directory does not exist (e.g. on a fresh checkout).
Path('data').mkdir(parents=True, exist_ok=True)
full_data.to_csv('data/full_data.csv', index=True)

In [38]:
# Export the gasoline-only prices (date index included) as CSV.
# Create the output directory first so this cell also works when run
# on its own: to_csv raises OSError if the directory is missing.
Path('data').mkdir(parents=True, exist_ok=True)
gas_price_data.to_csv('data/gasoline_data.csv', index=True)

In [39]:
# Export the diesel-only prices (date index included) as CSV.
# Create the output directory first so this cell also works when run
# on its own: to_csv raises OSError if the directory is missing.
Path('data').mkdir(parents=True, exist_ok=True)
diesel_price_data.to_csv('data/diesel_data.csv', index=True)