In [17]:
import numpy as np
import pandas as pd
import seaborn as sns
import os

In [27]:

def merge_dataframes(df1, df2):
    """
    Merge the first 13 columns of two DataFrames.

    Parameters:
    df1 (DataFrame): First DataFrame.
    df2 (DataFrame): Second DataFrame.

    Returns:
    DataFrame: Merged DataFrame containing the first 13 columns of each DataFrame.

    Example:
        >>> df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        >>> df2 = pd.DataFrame({'A': [7, 8, 9], 'C': [10, 11, 12]})
        >>> merge_dataframes(df1, df2)
           A  B   C
        0  1  4  10
        1  2  5  11
        2  3  6  12
    """
    # Determine the number of columns to merge (minimum between the two DataFrames)
    for col in df2.columns:
        if df2[col].dtype == 'datetime64[ns]':
            df2[col] = df2[col].dt.strftime('%Y-%m-%d')

    for col in df1.columns:
        if df1[col].dtype == 'datetime64[ns]':
            df1[col] = df1[col].dt.strftime('%Y-%m-%d')

    common_columns = df1.columns.intersection(df2.columns)

    merged_df = df1.merge(df2, on=common_columns.tolist(), how='outer')

    return merged_df

def merge_dataframes_with_previous_files(file_paths, column_patterns):
    """
    Merge the first 13 columns of DataFrames from multiple Excel files with two sheets each.
    The column names from the first DataFrame are kept.

    Parameters:
    file_paths (list): List of file paths to Excel files.

    Returns:
    DataFrame: Merged DataFrame containing the first 13 columns of each DataFrame from all files.
    """
    merged_df = None
    merged_df_temp = None
    
    # Iterate over each file path
    for file_path in file_paths:
        print(f"\nLoading file : {file_path}")
        # Load Excel file into separate DataFrames
        # Fetch only first sheet as it is RES 
        df = pd.read_excel(file_path, header=1, sheet_name=[0], parse_dates=False)
        print(f"Number of sheets detected: {len(df.keys())}")

        # Merge in sheet level
        for sheet_name, df_sheet in df.items():
            # Filter according to pattern columns to keep
            df_sheet = df_sheet.filter(regex='|'.join(column_patterns))
            df_sheet['file'] = file_path
            print(f"Number of columns detected in {sheet_name} : {len(df_sheet.columns)}")
            if merged_df_temp is None:
                # Merge the Sheet DataFrames
                merged_df_temp = df_sheet
            else:
                merged_df_temp = merge_dataframes(merged_df_temp, df_sheet)

        # Merge in file level
        # If merged_df is None, assign it the merged DataFrame, else merge with the previous merged DataFrame
        if merged_df is None:
            merged_df = merged_df_temp
        else:
            merged_df = merge_dataframes(merged_df, merged_df_temp)

    return merged_df

### Licenses Fetch

In [28]:

# Define the file paths and column patterns
# Define the directory containing Excel files
directory = "../../data/licenses/"
# Print the current working directory
print(f"Current working directory: {os.getcwd()}")
# Get the directory of the current file
current_directory = os.getcwd()
directory = os.path.join(current_directory, directory)
file_paths = [os.path.join(directory, file) for file in os.listdir(directory) if file.endswith(".xlsx")]
column_patterns = ['ΑΙΤΗΣΗ', 'ΜΗΤΡΩΟ', 'ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ.*', 'ΕΤΑΙΡΕΙΑ', 'ΗΜΕΡ.*ΕΚΔ.*ΠΑΡΑΓ.*', 'ΗΜΕΡ.*ΛΗΞΗ.*ΠΑΡΑΓ.*', 'ΠΕΡΙΦΕΡΕΙΑ', 'ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ', 'ΔΗΜΟ.* ', 'ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ', 'ΘΕΣΗ', '.*(MW)', 'ΤΕΧΝΟΛ.*'] 

# Sort file paths based on their names to process them in order
file_paths.sort()

licenses_df = merge_dataframes_with_previous_files(file_paths, column_patterns)

Current working directory: /home/marios/projects/RAE_Forecasting/src/notebooks

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/Vevaioseis-APE.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-OKTΩΒΡΙΟΣ-2023.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΙΑΝΟΥΑΡΙΟΣ-2023.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path


Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΙΟΥΛΙΟΣ-2023.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΙΟΥΝΙΟΣ-2023.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


Number of sheets detected: 1
Number of columns detected in 0 : 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')



Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΜΑIOΣ-2023.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')



Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΜΑΡΤΙΟΣ-2023.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')



Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΜΑΡΤΙΟΣ-2024.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')



Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΝΟΕΜΒΡΙΟΣ-2023.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path



Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΣΕΠΤΕΜΒΡΙΟΣ-2023-1.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path



Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΦΕΒΡΟΥΑΡΙΟΣ-2023.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 13


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')



Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΦΕΒΡΟΥΑΡΙΟΣ-2024.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')



Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/Ν.4685-ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΑΥΓΟΥΣΤΟΣ-2021.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path



Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/Ν.4685-ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΔΕΚΕΜΒΡΙΟΣ-2022.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/Ν.4685-ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΜΑΡΤΙΟΣ-2021-v1.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/Ν.4685-ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΜΑΪΟΣ-2021.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path



Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/Ν.4685-ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΟΚΤΩΒΡΙΟΣ-2021.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path



Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/Ν.4685-ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΣΕΠΤΕΜΒΡΙΟΣ-2022.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/Ν.4685_1.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΠΙΝΑΚΑΣ-ΑΙΤΙΟΛΟΓΗΜΕΝΩΝ-ΕΝΣΤΑΣΕΩΝ_ΑΝΑΡΤΗΣΗ_ΤΕΛΙΚΟ.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


In [20]:
licenses_df

Unnamed: 0,ΑΙΤΗΣΗ,ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ,ΕΤΑΙΡΕΙΑ,ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ,ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΠΕΡΙΦΕΡΕΙΑ,ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ,ΔΗΜΟΣ,ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ,ΘΕΣΗ,ΜΕΓΙΣΤΗ ΙΣΧΥΣ (MW),ΤΕΧΝΟΛΟΓΙΑ,file,AΡΙΘΜΟΣ ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ,ΗΜΕΡ/ΝΙΑ ΕΚΔ. ΑΔ. ΠΑΡΑΓΩΓΗΣ,ΙΣΧΥΣ (MW)
0,514,2001-02-19 00:00:00,ΛΑΚΜΟΣ ΕΝΕΡΓΕΙΑΚΗ ΑΕ,ΑΔ-00001,2001-05-29 00:00:00,2026-05-29 00:00:00,ΗΠΕΙΡΟΥ,ΙΩΑΝΝΙΝΩΝ,ΜΕΤΣΟΒΟΥ,ΕΓΝΑΤΙΑΣ,ΓΚΟΥΡΑ ΜΙΚΡΟ ΠΕΡΙΣΤΕΡΙ,2.155,ΜΥΗΕ,/home/marios/projects/RAE_Forecasting/src/note...,,,
1,67,2001-02-13 00:00:00,ΑΡΑΜΠΑΤΖΗΣ Β.Γ. ΑΕ,ΑΔ-00002,2001-05-29 00:00:00,2026-05-29 00:00:00,ΚΕΝΤΡΙΚΗΣ ΜΑΚΕΔΟΝΙΑΣ,ΠΕΛΛΑΣ,ΑΛΜΩΠΙΑΣ,ΑΡΙΔΑΙΑΣ,ΤΟΥΠΛΙΤΣΑ ΠΡΟΜΑΧΩΝΑ,0.830,ΜΥΗΕ,/home/marios/projects/RAE_Forecasting/src/note...,,,
2,38,2001-02-08 00:00:00,ΚΑΣΤΡΙ ΕΥΒΟΙΑΣ ΑΕ,ΑΔ-00022,2001-06-18 00:00:00,2041-03-22 00:00:00,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΒΟΙΑΣ,ΚΑΡΥΣΤΟΥ,ΚΑΡΥΣΤΟΥ,ΚΑΣΤΡΙ,5.000,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,,
3,439,2001-02-19 00:00:00,ΥΔΡΟΗΛΕΚΤΡΙΚΗ ΕΥΡΥΤΑΝΙΑΣ ΑΕ,ΑΔ-00023,2001-06-18 00:00:00,2026-06-18 00:00:00,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΡΥΤΑΝΙΑΣ,ΑΓΡΑΦΩΝ,ΑΓΡΑΦΩΝ,ΜΟΝΑΣΤΗΡΑΚΙ,3.000,ΜΥΗΕ,/home/marios/projects/RAE_Forecasting/src/note...,,,
4,65,2001-02-12 00:00:00,ΠΟΛΥΠΟΤΑΜΟΣ ΑΙΟΛΙΚΗ ΕΝΕΡΓΕΙΑ ΑΕ,ΑΔ-00024,2001-06-18 00:00:00,2026-06-18 00:00:00,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΒΟΙΑΣ,ΚΑΡΥΣΤΟΥ,ΣΤΥΡΕΩΝ,ΓΚΕΡΚΙ-ΠΥΡΓΑΡΙ ΠΟΛΥΠΟΤΑΜΟΥ,12.000,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2191143,,,GREEN VELOCITY 2,,,,ΘΕΣΣΑΛΙΑΣ,ΤΡΙΚΑΛΩΝ,,"ΑΙΘΗΚΩΝ,ΑΣΠΡΟΠΟΤΑΜΟΥ",ΝΕΡΑΙΔΑ,,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,2020-09-26,14.4
2191144,,,GREEN VELOCITY 2,,,,ΘΕΣΣΑΛΙΑΣ,ΤΡΙΚΑΛΩΝ,,"ΑΣΠΡΟΠΟΤΑΜΟΥ,ΚΛΕΙΝΟΒΟΥ",ΚΑΛΤΣΑ,,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,2020-09-26,14.4
2191145,,,WIND ENERGY FACILITY ΜΟΝΟΠΡΟΣΩΠΗ ΙΔΙΩΤΙΚΗ ΚΕΦΑ...,,,,ΑΝ ΜΑΚΕΔΟΝΙΑΣ ΘΡΑΚΗΣ,ΡΟΔΟΠΗΣ,,ΦΙΛΛΥΡΑΣ,ΚΑΤΩ ΒΡΑΧΟΣ,,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,2020-09-28,3.0
2191146,,,WIND ENERGY FACILITY ΜΟΝΟΠΡΟΣΩΠΗ ΙΔΙΩΤΙΚΗ ΚΕΦΑ...,,,,Δ ΜΑΚΕΔΟΝΙΑΣ,ΚΟΖΑΝΗΣ,,ΒΛΑΣΤΗΣ,ΜΠΕΚΡΕΒΙΝΙΚΟΣ,,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,2020-09-28,3.0


In [29]:
# Define the file paths and column patterns
# Define the directory containing Excel files
directory = "../../data/special_projects/"
# Print the current working directory
print(f"Current working directory: {os.getcwd()}")
# Get the directory of the current file
current_directory = os.getcwd()
directory = os.path.join(current_directory, directory)
file_paths = [os.path.join(directory, file) for file in os.listdir(directory) if file.endswith(".xlsx")]
column_patterns = ['ΑΙΤΗΣΗ', 'ΜΗΤΡΩΟ', 'ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ.*', 'ΕΤΑΙΡΕΙΑ', 'ΗΜΕΡ.*ΕΚΔ.*ΠΑΡΑΓ.*', 'ΗΜΕΡ.*ΛΗΞΗ.*ΠΑΡΑΓ.*', 'ΠΕΡΙΦΕΡΕΙΑ', 'ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ', 'ΔΗΜΟ.* ', 'ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ', 'ΘΕΣΗ', '.*(MW)', 'ΤΕΧΝΟΛ.*'] 

# Sort file paths based on their names to process them in order
file_paths.sort()

special_proj_df = merge_dataframes_with_previous_files(file_paths, column_patterns)

Current working directory: /home/marios/projects/RAE_Forecasting/src/notebooks

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202012.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202103.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202105.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202108.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202110.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasti

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202303.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202305.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202306.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202307.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202309.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202310.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202311.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202402.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202403.xlsx
Number of sheets detected: 1
Number of columns detected in 0 : 14


In [30]:
licenses_df.head()

Unnamed: 0,ΑΙΤΗΣΗ,ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ,ΕΤΑΙΡΕΙΑ,ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ,ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΠΕΡΙΦΕΡΕΙΑ,ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ,ΔΗΜΟΣ,ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ,ΘΕΣΗ,ΜΕΓΙΣΤΗ ΙΣΧΥΣ (MW),ΤΕΧΝΟΛΟΓΙΑ,file,AΡΙΘΜΟΣ ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ,ΗΜΕΡ/ΝΙΑ ΕΚΔ. ΑΔ. ΠΑΡΑΓΩΓΗΣ,ΙΣΧΥΣ (MW)
0,514,2001-02-19 00:00:00,ΛΑΚΜΟΣ ΕΝΕΡΓΕΙΑΚΗ ΑΕ,ΑΔ-00001,2001-05-29 00:00:00,2026-05-29 00:00:00,ΗΠΕΙΡΟΥ,ΙΩΑΝΝΙΝΩΝ,ΜΕΤΣΟΒΟΥ,ΕΓΝΑΤΙΑΣ,ΓΚΟΥΡΑ ΜΙΚΡΟ ΠΕΡΙΣΤΕΡΙ,2.155,ΜΥΗΕ,/home/marios/projects/RAE_Forecasting/src/note...,,,
1,67,2001-02-13 00:00:00,ΑΡΑΜΠΑΤΖΗΣ Β.Γ. ΑΕ,ΑΔ-00002,2001-05-29 00:00:00,2026-05-29 00:00:00,ΚΕΝΤΡΙΚΗΣ ΜΑΚΕΔΟΝΙΑΣ,ΠΕΛΛΑΣ,ΑΛΜΩΠΙΑΣ,ΑΡΙΔΑΙΑΣ,ΤΟΥΠΛΙΤΣΑ ΠΡΟΜΑΧΩΝΑ,0.83,ΜΥΗΕ,/home/marios/projects/RAE_Forecasting/src/note...,,,
2,38,2001-02-08 00:00:00,ΚΑΣΤΡΙ ΕΥΒΟΙΑΣ ΑΕ,ΑΔ-00022,2001-06-18 00:00:00,2041-03-22 00:00:00,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΒΟΙΑΣ,ΚΑΡΥΣΤΟΥ,ΚΑΡΥΣΤΟΥ,ΚΑΣΤΡΙ,5.0,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,,
3,439,2001-02-19 00:00:00,ΥΔΡΟΗΛΕΚΤΡΙΚΗ ΕΥΡΥΤΑΝΙΑΣ ΑΕ,ΑΔ-00023,2001-06-18 00:00:00,2026-06-18 00:00:00,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΡΥΤΑΝΙΑΣ,ΑΓΡΑΦΩΝ,ΑΓΡΑΦΩΝ,ΜΟΝΑΣΤΗΡΑΚΙ,3.0,ΜΥΗΕ,/home/marios/projects/RAE_Forecasting/src/note...,,,
4,65,2001-02-12 00:00:00,ΠΟΛΥΠΟΤΑΜΟΣ ΑΙΟΛΙΚΗ ΕΝΕΡΓΕΙΑ ΑΕ,ΑΔ-00024,2001-06-18 00:00:00,2026-06-18 00:00:00,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΒΟΙΑΣ,ΚΑΡΥΣΤΟΥ,ΣΤΥΡΕΩΝ,ΓΚΕΡΚΙ-ΠΥΡΓΑΡΙ ΠΟΛΥΠΟΤΑΜΟΥ,12.0,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,,


In [31]:
special_proj_df.head()

Unnamed: 0,ΑΙΤΗΣΗ,ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ,ΕΤΑΙΡΕΙΑ,ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ,ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΠΕΡΙΦΕΡΕΙΑ,ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ,ΔΗΜΟΣ,ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ,ΘΕΣΗ,ΙΣΧΥΣ (MW),ΤΕΧΝΟΛΟΓΙΑ,file
0,B-00279,2001-02-19,ΔΕΗ ΑΝΑΝΕΩΣΙΜΕΣ ΑΕ,ΑΔ-00488,2003-03-04,2028-03-04,ΒΟΡΕΙΟΥ ΑΙΓΑΙΟΥ,ΛΕΣΒΟΥ,ΛΕΣΒΟΥ,ΠΕΤΡΑΣ,ΣΤΥΨΗ/ΠΕΡΙΟΧΗ ΑΡΓΕΝΟΥ,8.0,ΓΕΩΘΕΡΜΙΑ,/home/marios/projects/RAE_Forecasting/src/note...
1,Γ-00798,2004-11-18,ΤΕΡΝΑ ΕΝΕΡΓΕΙΑΚΗ ΑΙ ΓΙΩΡΓΗΣ ΑΕ,ΑΔ-00911,2006-03-23,2021-03-23,ΑΤΤΙΚΗΣ,ΝΗΣΩΝ,ΥΔΡΑΣ,ΥΔΡΑΣ,ΝΗΣΙΔΑ ΑΓΙΟΣ ΓΕΩΡΓΙΟΣ,45.0,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...
2,Γ-01329,2006-01-24,ΑΙΟΛΙΚΑ ΠΑΡΚΑ ΠΑΛΙΟΠΥΡΓΟΣ ΑΕ,ΑΔ-00981,2006-10-02,2021-10-02,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΒΟΙΑΣ,ΚΑΡΥΣΤΟΥ,ΚΑΡΥΣΤΟΥ & ΚΑΦΗΡΕΩΣ,ΠΑΛΙΟΠΥΡΓΟΣ,13.8,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...
3,Γ-01089,2005-07-22,ΤΕΡΝΑ ΕΝΕΡΓΕΙΑΚΗ ΑΙ ΓΙΩΡΓΗΣ ΑΕ,ΑΔ-00989,2006-10-16,2021-10-16,ΑΤΤΙΚΗΣ,ΝΗΣΩΝ,ΥΔΡΑΣ,ΥΔΡΑΣ,ΝΗΣΙΔΑ ΑΓΙΟΣ ΓΕΩΡΓΙΟΣ,24.0,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...
4,Γ-01328,2006-01-24,ΑΙΟΛΙΚΑ ΠΑΡΚΑ ΠΛΑΤΑΝΟΣ ΜΟΝΟΠΡΟΣΩΠΗ ΑΕ,ΑΔ-01257,2009-06-16,2034-06-16,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΒΟΙΑΣ,ΚΑΡΥΣΤΟΥ,ΜΑΡΜΑΡΙΟΥ,ΠΛΑΤΑΝΟΣ,13.8,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...


In [32]:
licenses_df['file'].str.split('/').str[-1].str.split('.').str[0]

0                                            Vevaioseis-APE
1                                            Vevaioseis-APE
2                                            Vevaioseis-APE
3                                            Vevaioseis-APE
4                                            Vevaioseis-APE
                                 ...                       
2191143    ΠΙΝΑΚΑΣ-ΑΙΤΙΟΛΟΓΗΜΕΝΩΝ-ΕΝΣΤΑΣΕΩΝ_ΑΝΑΡΤΗΣΗ_ΤΕΛΙΚΟ
2191144    ΠΙΝΑΚΑΣ-ΑΙΤΙΟΛΟΓΗΜΕΝΩΝ-ΕΝΣΤΑΣΕΩΝ_ΑΝΑΡΤΗΣΗ_ΤΕΛΙΚΟ
2191145    ΠΙΝΑΚΑΣ-ΑΙΤΙΟΛΟΓΗΜΕΝΩΝ-ΕΝΣΤΑΣΕΩΝ_ΑΝΑΡΤΗΣΗ_ΤΕΛΙΚΟ
2191146    ΠΙΝΑΚΑΣ-ΑΙΤΙΟΛΟΓΗΜΕΝΩΝ-ΕΝΣΤΑΣΕΩΝ_ΑΝΑΡΤΗΣΗ_ΤΕΛΙΚΟ
2191147    ΠΙΝΑΚΑΣ-ΑΙΤΙΟΛΟΓΗΜΕΝΩΝ-ΕΝΣΤΑΣΕΩΝ_ΑΝΑΡΤΗΣΗ_ΤΕΛΙΚΟ
Name: file, Length: 2191148, dtype: object

In [None]:

# # Define the directory paths and column patterns
# directory_paths = ["../data/licenses/", "../data/contracts/"]
# column_patterns = ['ΑΙΤΗΣΗ', 'ΜΗΤΡΩΟ', 'ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ.*', 'ΕΤΑΙΡΕΙΑ', 'ΗΜΕΡ.*ΕΚΔ.*ΠΑΡΑΓ.*', 'ΗΜΕΡ.*ΛΗΞΗ.*ΠΑΡΑΓ.*', 'ΠΕΡΙΦΕΡΕΙΑ', 'ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ', 'ΔΗΜΟ.* ', 'ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ', 'ΘΕΣΗ', '.*(MW)', 'ΤΕΧΝΟΛ.*'] 

# # Iterate over each directory path
# for directory_path in directory_paths:
#     # Print the current working directory
#     print(f"Current working directory: {os.getcwd()}")
#     # Get the directory of the current file
#     current_directory = os.path.dirname(os.path.abspath(__file__))
#     # Define the directory containing Excel files
#     directory = os.path.join(current_directory, directory_path)
#     file_paths = [os.path.join(directory, file) for file in os.listdir(directory) if file.endswith(".xlsx")]

# # Sort file paths based on their names to process them in order
# file_paths.sort()

# merged_df = merge_dataframes_with_previous_files(file_paths, column_patterns)
