## Load each sheet of the Excel workbook into a DataFrame

In [None]:
import pandas as pd

# Location of the raw workbook, relative to this notebook.
file_path = "../../data/raw/inflation_category.xlsx"

# Open the workbook only long enough to list its sheets. The context manager
# closes the underlying file handle, which a bare pd.ExcelFile(...) would
# otherwise keep open for the lifetime of the kernel.
with pd.ExcelFile(file_path) as excel_file:
    sheet_names = excel_file.sheet_names

print("Sheet names:", sheet_names)

In [None]:
# Worksheets to load, listed in the same order as the variables unpacked below.
sheet_names = [
    "Blatt 2",
    "Blatt 3",
    "Blatt 4",
    "Blatt 5",
    "Blatt 6",
    "Blatt 7",
    "Blatt 8",
    "Blatt 9",
    "Blatt 10",
    "Blatt 11",
    "Blatt 12",
    "Blatt 13",
]

# Passing the whole list makes pandas open and parse the workbook once and
# return a {sheet name: DataFrame} dict, instead of reopening the file for
# every sheet.
dataframes = pd.read_excel(file_path, sheet_name=sheet_names)

# One DataFrame per sheet, bound positionally to its own variable.
(
    food,
    bev,
    clothing,
    housing,
    furnishing,
    health,
    transport,
    communications,
    culture,
    education,
    restaurants,
    misc_goods,
) = (dataframes[name] for name in sheet_names)

In [None]:
# Tag every per-sheet frame (in place) with the label of the category it holds.
for frame, label in (
    (food, "Food and non-alcoholic beverages"),
    (bev, "Alcoholic beverages, tobacco and narcotics"),
    (clothing, "Clothing and footwear"),
    (housing, "Housing, water, electricity, gas and other fuels"),
    (furnishing, "Furnishings, household equipment and routine household maintenance"),
    (health, "Health"),
    (transport, "Transport"),
    (communications, "Communications"),
    (culture, "Recreation and culture"),
    (education, "Education"),
    (restaurants, "Restaurants and hotels"),
    (misc_goods, "Miscellaneous goods and services"),
):
    frame["category"] = label
food

## Concat to one dataframe

In [None]:
# Stack the twelve per-category frames on top of each other (row-wise).
# The original row labels of each frame are kept as-is.
category = pd.concat(
    [
        food,
        bev,
        clothing,
        housing,
        furnishing,
        health,
        transport,
        communications,
        culture,
        education,
        restaurants,
        misc_goods,
    ],
    axis="index",
)
category

## Data cleaning (excluding null values)

In [None]:
from clean_category_inf import data_clean

# Year columns 2010 through 2024, as string labels.
columns_df = [str(year) for year in range(2010, 2025)]
df = category

# data_clean is a project helper whose implementation is not visible here;
# it receives the combined frame plus the list of year columns.
category = data_clean(df, columns_df)
category

## Null-values

In [None]:
# Count the missing values in each column of the cleaned frame.
category.isna().sum()

## Data types

In [None]:
# Show the dtype pandas assigned to each column of the cleaned frame.
category.dtypes

## Stack dataframe for MySQL

In [None]:
# Reshape from wide (one column per year) to long (one row per
# country / category / year) so the data fits a relational table.
category_final = (
    category.set_index(["country", "category"])
    .stack()      # remaining columns become an inner index level
    .to_frame()   # Series -> single-column DataFrame
    .reset_index()
)
category_final.columns = ["country", "category", "year", "inflation"]
category_final

In [None]:
# Scale the stacked inflation values by 1/100
# (presumably percent -> fraction; confirm against the source workbook).
category_final["inflation_%"] = category_final["inflation"].div(100)
category_final

In [None]:
# Keep the original code as country_ID, then replace "country" with full names.
# NOTE(review): regex=True makes every key a regular expression that is
# substituted wherever it occurs inside a value, not only on exact matches.
# Confirm that this is intended for the codes produced by data_clean.
category_final["country_ID"] = category_final["country"]
category_final["country"] = category_final["country"].replace(
    {
        "EU": "European_Union",
        "BE": "Belgium",
        "BG": "Bulgaria",
        "DK": "Denmark",
        "DE": "Germany",
        "EE": "Estonia",
        "FI": "Finland",
        "FR": "France",
        "GR": "Greece",
        "IE": "Ireland",
        "IT": "Italy",
        "HR": "Croatia",
        "LV": "Latvia",
        "LT": "Lithuania",
        "LU": "Luxembourg",
        "MT": "Malta",
        "NL": "Netherlands",
        "AT": "Austria",
        "PL": "Poland",
        "PT": "Portugal",
        "RO": "Romania",
        "SE": "Sweden",
        "SK": "Slovakia",
        "SI": "Slovenia",
        "ES": "Spain",
        "CZ": "Czech_Republic",
        "HU": "Hungary",
        "CY": "Cyprus",
    },
    regex=True,
)
category_final

In [None]:
# The year values come from the stacked column labels; cast them to integers
# so the later year_ID arithmetic (year - 2000) works.
category_final["year"] = category_final["year"].astype(int)
category_final.dtypes

In [None]:
# year_ID is the year's offset from 2000 (e.g. 2010 -> 10).
category_final["year_ID"] = category_final["year"].sub(2000)
category_final

In [None]:
# Derive a short category code (C01..C13) from the category label.
# NOTE(review): regex=True treats every key as a pattern rather than an exact
# label, and "Insurance and Finance" (C12) matches none of the labels assigned
# to the frames earlier in this notebook.
category_final["category_ID"] = category_final["category"].replace(
    {
        "Food and non-alcoholic beverages": "C01",
        "Alcoholic beverages, tobacco and narcotics": "C02",
        "Clothing and footwear": "C03",
        "Housing, water, electricity, gas and other fuels": "C04",
        "Furnishings, household equipment and routine household maintenance": "C05",
        "Health": "C06",
        "Transport": "C07",
        "Communications": "C08",
        "Recreation and culture": "C09",
        "Education": "C10",
        "Restaurants and hotels": "C11",
        "Insurance and Finance": "C12",
        "Miscellaneous goods and services": "C13",
    },
    regex=True,
)
category_final

In [None]:
# Build the row key as <country_ID><year_ID>_<category_ID>.
category_final["inflation_cat_ID"] = (
    category_final["country_ID"]
    + category_final["year_ID"].astype(str)
    + "_"
    + category_final["category_ID"]
)
category_final

In [None]:
# Columns that make up the "inflation_cat" entity, in output order.
selected_columns = [
    "inflation_cat_ID",
    "country_ID",
    "year_ID",
    "category_ID",
    "inflation_%",
]

df_inf_cat = category_final.loc[:, selected_columns]
df_inf_cat

In [None]:
#df_inf_cat.to_csv("../../data/database/inflation_cat.csv", index=False, encoding="utf-8", sep=";")

## Create file for cumulative inflation

In [None]:
# Keep only year_ID 20 through 23 inclusive (years 2020-2023, since
# year_ID = year - 2000). This overwrites df_inf_cat with the filtered rows.
df_inf_cat = df_inf_cat[df_inf_cat["year_ID"].between(20, 23)]
df_inf_cat

In [None]:
import numpy as np

# Compound the yearly rates per (country, category): prod(1 + r) - 1.
grouped = df_inf_cat.groupby(["country_ID", "category_ID"])
cumulative_inflation = grouped["inflation_%"].apply(
    lambda rates: np.prod(rates + 1) - 1
)

# Name the value column, then turn the group keys back into ordinary columns.
cumulative_inflation_df = cumulative_inflation.rename(
    "cumulative_inflation_20_23"
).reset_index()
cumulative_inflation_df

In [None]:
#cumulative_inflation_df.to_csv("../../data/database/inflation_cat_cum.csv", index=False, encoding="utf-8", sep=";")