In [5]:
import pandas as pd
import xlrd
import os
import warnings

# Workbooks to load, listed in chronological order; the position in this list
# decides which sheet layout applies below.
file_paths = [
    "Rocznik_2014__GR.xls",
    "Rocznik_2015__GR.xls",
    "Rocznik_2016__GR.xls",
    "Rocznik_2017_GR.xls",
    "Rocznik_2018_GR.xls",
    "Rocznik_2019_GR.xls",
    "Rocznik_2020_GR.xls",
    "Rocznik_2021_GR.xls",
    "Rocznik_2022_GR.xls",
]

# Logical table names, and the sheet ("Tab N") that holds each of them.
# The sheet numbering is not the same in every workbook: the first four files
# in `file_paths` use Tab 13-16, every later file uses Tab 8-11.
table_keys = (
    "wartosci_akcji",
    "najwyzsze_sesyjne_obroty",
    "stopy_zwrotu",
    "najwyzsze_stopy_zwrotu",
)
tabs_first_four_files = ("Tab 13", "Tab 14", "Tab 15", "Tab 16")
tabs_remaining_files = ("Tab 8", "Tab 9", "Tab 10", "Tab 11")

# files_to_tab_names[file][table] -> sheet name inside that workbook
# headers[file][table]            -> `header` argument handed to pd.read_excel
files_to_tab_names = {}
headers = {}
for ind, path in enumerate(file_paths):
    tabs = tabs_first_four_files if ind <= 3 else tabs_remaining_files
    files_to_tab_names[path] = dict(zip(table_keys, tabs))
    # Only "wartosci_akcji" has a header spec; a new list is built per file.
    headers[path] = {"wartosci_akcji": [3]}

In [31]:
def read_excel_files(path, tab_names=None, header_rows=None):
    """Load the "wartosci_akcji" sheet from every configured ``.xls`` workbook in a directory.

    Parameters
    ----------
    path : str
        Directory that is scanned (non-recursively) for ``*.xls`` files.
    tab_names : dict, optional
        ``{file_name: {"wartosci_akcji": sheet_name, ...}}``. Defaults to the
        notebook-level ``files_to_tab_names`` mapping.
    header_rows : dict, optional
        ``{file_name: {"wartosci_akcji": header_spec}}`` where ``header_spec``
        is passed as ``header=`` to ``pd.read_excel``. Defaults to the
        notebook-level ``headers`` mapping.

    Returns
    -------
    dict
        ``{year: {"wartosci_akcji": DataFrame}}``. ``year`` is the second
        ``_``-separated token of the file name, kept as a string. A workbook
        whose configured sheet is missing yields an empty inner dict.

    Notes
    -----
    * Files are visited in sorted name order so the result is reproducible
      (``os.listdir`` makes no ordering guarantee).
    * ``.xls`` files without an entry in ``tab_names`` are skipped with a
      warning instead of failing with ``KeyError``.
    * Only the first 11 columns of the sheet are read.
    """
    # Resolve the notebook-level defaults lazily so explicit mappings can be
    # supplied without those globals having to exist.
    if tab_names is None:
        tab_names = files_to_tab_names
    if header_rows is None:
        header_rows = headers

    excel_data = {}
    for file in sorted(os.listdir(path)):
        if not file.endswith(".xls"):
            continue
        if file not in tab_names:
            # Checked before the workbook is opened: an unrelated spreadsheet
            # in the directory must not abort the whole load.
            warnings.warn(
                f"Skipping {file!r}: no sheet mapping configured for it",
                stacklevel=2,
            )
            continue

        year = file.split('_')[1]  # e.g. "Rocznik_2014__GR.xls" -> "2014"
        sheets = excel_data[year] = {}
        wanted_tab = tab_names[file]["wartosci_akcji"]

        # The context manager closes the workbook handle even if parsing fails.
        with pd.ExcelFile(os.path.join(path, file)) as xls:
            if wanted_tab in xls.sheet_names:
                sheets["wartosci_akcji"] = pd.read_excel(
                    xls,
                    sheet_name=wanted_tab,
                    header=header_rows[file]["wartosci_akcji"],
                    usecols=range(11),
                )
            else:
                warnings.warn(
                    f"{file!r} has no sheet named {wanted_tab!r}; "
                    f"no data loaded for year {year}",
                    stacklevel=2,
                )

    print(excel_data.keys())
    return excel_data
# Load every workbook found in the current working directory; the result is
# keyed by the year string parsed from each file name.
excel_data = read_excel_files("./")

dict_keys(['2020', '2016', '2022', '2018', '2015', '2014', '2021', '2017', '2019'])


In [34]:
# Header fragments that identify the company-name column. They are compared
# against the column label with all whitespace removed, so "Spółka/ Company",
# "Spółka / Company" and "Spółka/Company" all match.
_COMPANY_HEADER_MARKERS = ("Spółka/Company", "Akcje/Shares")

# Columns that carry no per-company information (row counter, empty filler).
_IRRELEVANT_COLUMNS = ["Lp./ No", "Unnamed: 9"]


def _find_company_column(df):
    """Return the first column label that names the company, or None if absent."""
    for col in df.columns:
        # str() guards against non-string labels (e.g. numeric header cells).
        compact = "".join(str(col).split())
        if any(marker in compact for marker in _COMPANY_HEADER_MARKERS):
            return col
    return None


def separate_data_by_company(excel_data):
    """Regroup per-year sheet data into one DataFrame per company.

    Parameters
    ----------
    excel_data : dict
        ``{year: {sheet_key: DataFrame}}``. Every frame needs a company
        column whose header contains ``Spółka/Company`` or ``Akcje/Shares``
        (any spacing around the slash).

    Returns
    -------
    dict
        ``{company: DataFrame}``. Each frame has one column per key of
        ``excel_data`` (sorted) and one row per remaining sheet column.
        Years in which the company does not appear stay NaN.

    Raises
    ------
    ValueError
        If a sheet has no recognisable company column, or if a company
        occurs in more than one row of the same sheet.

    Notes
    -----
    * The input frames are not modified.
    * Company names are used verbatim as keys (including any padding
      whitespace present in the sheet).
    * Row labels of a company's frame are fixed by the first year in which
      it appears; values from later years are aligned to those labels.
    """
    years = list(excel_data)
    company_dfs = {}

    for year, sheets in excel_data.items():
        for sheet_name, df in sheets.items():
            # drop() returns a new frame, so the result must be rebound;
            # errors="ignore" makes the call safe when a column is absent.
            df = df.drop(columns=_IRRELEVANT_COLUMNS, errors="ignore")

            company_col = _find_company_column(df)
            if company_col is None:
                raise ValueError(f"Company column not found in the DataFrame, sheet_name={sheet_name}, year = {year}")

            for company, group in df.groupby(by=company_col):
                if len(group) != 1:
                    raise ValueError(
                        f"Company {company!r} occurs {len(group)} times, expected once, "
                        f"sheet_name={sheet_name}, year = {year}"
                    )
                if company not in company_dfs:
                    # One (initially empty) column per year.
                    company_dfs[company] = pd.DataFrame(columns=years)
                frame = company_dfs[company]
                # The single row becomes this year's column, indexed by the
                # sheet's remaining column labels.
                frame[year] = group.drop(columns=[company_col]).iloc[0]

    return {company: frame.sort_index(axis=1) for company, frame in company_dfs.items()}
# Pivot the per-year sheets into one DataFrame per company (columns = years).
company_data = separate_data_by_company(excel_data)

In [36]:
# Quick inspection: list all company keys, then preview one company's frame.
# The lookup key includes its trailing spaces, matching how the keys are
# printed on the line above.
print(company_data.keys())
print(company_data['08OCTAVA    '].head())

dict_keys(['06MAGNA     ', '08OCTAVA    ', '11BIT       ', '4FUNMEDIA   ', 'ABADONRE    ', 'ABCDATA     ', 'ABMSOLID    ', 'ABPL        ', 'ACAUTOGAZ   ', 'ACTION      ', 'ADIUVO      ', 'AGORA       ', 'AGROTON     ', 'AILLERON    ', 'AIRWAY      ', 'ALCHEMIA    ', 'ALIOR       ', 'ALTA        ', 'ALTUSTFI    ', 'ALUMETAL    ', 'AMBRA       ', 'AMICA       ', 'AMPLI       ', 'AMREST      ', 'APATOR      ', 'APLISENS    ', 'APSENERGY   ', 'ARCHICOM    ', 'ARCTIC      ', 'ARCUS       ', 'ARTERIA     ', 'ARTIFEX     ', 'ASBIS       ', 'ASMGROUP    ', 'ASSECOBS    ', 'ASSECOPOL   ', 'ASSECOSEE   ', 'ASTARTA     ', 'ATAL        ', 'ATENDE      ', 'ATLANTAPL   ', 'ATLANTIS    ', 'ATLASEST    ', 'ATM         ', 'ATMGRUPA    ', 'ATREM       ', 'AUGA        ', 'AUTOPARTN   ', 'AWBUD       ', 'BAHOLDING   ', 'BALTONA     ', 'BBIDEV      ', 'BEDZIN      ', 'BENEFIT     ', 'BERLING     ', 'BEST        ', 'BETACOM     ', 'BIK         ', 'BIOMEDLUB   ', 'BIOTON      ', 'BNPPPL      ', 'BOGDANKA    