In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
from pandas.tseries.offsets import DateOffset

In [2]:

def merge_dataframes(df1, df2):
    """
    Merge the first 13 columns of two DataFrames.

    Parameters:
    df1 (DataFrame): First DataFrame.
    df2 (DataFrame): Second DataFrame.

    Returns:
    DataFrame: Merged DataFrame containing the first 13 columns of each DataFrame.

    Example:
        >>> df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        >>> df2 = pd.DataFrame({'A': [7, 8, 9], 'C': [10, 11, 12]})
        >>> merge_dataframes(df1, df2)
           A  B   C
        0  1  4  10
        1  2  5  11
        2  3  6  12
    """
    # Determine the number of columns to merge (minimum between the two DataFrames)
    for col in df2.columns:
        if df2[col].dtype == 'datetime64[ns]':
            df2[col] = df2[col].dt.strftime('%Y-%m-%d')

    for col in df1.columns:
        if df1[col].dtype == 'datetime64[ns]':
            df1[col] = df1[col].dt.strftime('%Y-%m-%d')

    common_columns = df1.columns.intersection(df2.columns)

    merged_df = df1.merge(df2, on=common_columns.tolist(), how='outer')

    return merged_df

def merge_dataframes_with_previous_files(file_paths, column_patterns):
    """
    Merge the first 13 columns of DataFrames from multiple Excel files with two sheets each.
    The column names from the first DataFrame are kept.

    Parameters:
    file_paths (list): List of file paths to Excel files.

    Returns:
    DataFrame: Merged DataFrame containing the first 13 columns of each DataFrame from all files.
    """
    merged_df = None
    

    # Iterate over each file path
    for file_path in file_paths:
        merged_df_temp = None
        print(f"\nLoading file : {file_path}")
        # Load Excel file into separate DataFrames
        # Fetch only first sheet as it is RES 
        df = pd.read_excel(file_path, header=1, sheet_name=[0], parse_dates=False)
        print(f"Number of sheets detected: {len(df.keys())}")

        # Merge in sheet level
        for sheet_name, df_sheet in df.items():
            print(f"Number of rows detected: {len(df_sheet)}")
            # Filter according to pattern columns to keep
            df_sheet = df_sheet.filter(regex='|'.join(column_patterns))
            df_sheet['file'] = file_path
            print(f"Number of columns detected in {sheet_name} : {len(df_sheet.columns)}")
            if merged_df_temp is None:
                # Merge the Sheet DataFrames
                merged_df_temp = df_sheet
            else:
                merged_df_temp = merge_dataframes(merged_df_temp, df_sheet)

        # Merge in file level
        # If merged_df is None, assign it the merged DataFrame, else merge with the previous merged DataFrame
        if merged_df is None:
            merged_df = merged_df_temp
        else:
            merged_df = merge_dataframes(merged_df, merged_df_temp)
        
        print(f"Number of rows detected after merge: {len(merged_df)}")

    return merged_df

def fillna_column(df, column_name, fillna_column_name):
    """
    Fill missing values in a column by using values from another column.

    Args:
        df (pandas.DataFrame): The DataFrame containing the columns.
        column_name (str): The name of the column to fill missing values.
        fillna_column_name (str): The name of the column to use for filling missing values.

    Returns:
        pandas.DataFrame: The DataFrame with missing values filled in the specified column.
    """
    df[column_name] = df[column_name].fillna(df[fillna_column_name])
    df.drop(fillna_column_name, axis=1, inplace=True)
    return df

def convert_to_datetime(df, columns=None):
    """
    Convert date-like columns in a DataFrame to datetime type.

    Parameters:
    df (DataFrame): Input DataFrame.
    columns (list of str, optional): List of columns to convert. 
                                     If None, attempts to convert all object type columns.

    Returns:
    DataFrame: DataFrame with date-like columns converted to datetime type.
    """
    # Copy the DataFrame to avoid modifying the original DataFrame
    df_copy = df.copy()
    
    # If no specific columns are provided, select all object type columns
    if columns is None:
        columns = df_copy.columns[df_copy.dtypes == 'object']
    
    # Iterate through each specified column
    for col in columns:
        print(f"Attempting column: {col}")
        # Check if the column can be converted to datetime
        try:
            df_copy[col] = pd.to_datetime(df_copy[col], errors='coerce')
            print(f"Column {col} transformed successfully.\n")
        except Exception as e:
            print(f"Column {col} is incompatible with datetime: {e}\n")
            # If conversion fails, skip to the next column
            continue
            
    return df_copy


def calculate_rsi(df, group_by_col, type_col='ΤΕΧΝΟΛΟΓΙΑ'):
    """
    Calculate the Regional Specialization Index (RSI) for a specified column.
    
    Parameters:
    df (pd.DataFrame): The input dataframe containing the data.
    group_by_col (str): The column to group by (e.g., 'Region', 'Company').
    type_col (str): The column indicating the type of RES (default is 'Technology').
    
    Returns:
    pd.DataFrame: A dataframe with RSI values.
    """
    # Count permits by type and the specified group
    permits_by_type_group = df.groupby([group_by_col, type_col]).size().reset_index(name='ΑΔΕΙΕΣ')

    # Count total permits by the specified group
    total_permits_by_group = df.groupby(group_by_col).size().reset_index(name='Total Permits in Group')

    # Merge to get P_ir and P_r
    permits_by_type_group = permits_by_type_group.merge(total_permits_by_group, on=group_by_col)

    # Count total permits by type nationally
    permits_by_type_nation = df.groupby(type_col).size().reset_index(name='Total Permits by Type')

    # Count total permits nationally
    total_permits_nation = len(df)

    # Merge to get P_in and P_n
    permits_by_type_group = permits_by_type_group.merge(permits_by_type_nation, on=type_col)

    # Calculate RSI
    permits_by_type_group['RSI'] = (permits_by_type_group['ΑΔΕΙΕΣ'] / permits_by_type_group['Total Permits in Group']) / (permits_by_type_group['Total Permits by Type'] / total_permits_nation)

    return permits_by_type_group[[group_by_col, type_col, 'RSI']]


def add_rsi(df, groups=['ΠΕΡΙΦΕΡΕΙΑ', 'ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ', 'ΔΗΜΟΣ ', 'ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ', 'ΘΕΣΗ']):
    """
    Add the Regional Specialization Index (RSI) to a DataFrame.

    Parameters:
    df (pd.DataFrame): Input DataFrame.
    groups (list of str, optional): List of columns to group by.

    Returns:
    pd.DataFrame: DataFrame with RSI values added.
    """
    # Copy the DataFrame to avoid modifying the original DataFrame
    df_copy = df.copy()
    
    # Calculate and add RSI for each group
    for group in groups:
        rsi_df = calculate_rsi(df_copy, group)
        # Merge the RSI values with the original DataFrame
        df_copy = df_copy.merge(rsi_df, on=[group, 'ΤΕΧΝΟΛΟΓΙΑ'], how='left', suffixes=('', f'_{group}'))
    
    return df_copy

### Licenses Fetch

In [3]:

# Define the file paths and column patterns
# Define the directory containing Excel files
directory = "../../data/licenses/"
# Print the current working directory
print(f"Current working directory: {os.getcwd()}")
# Get the directory of the current file
current_directory = os.getcwd()
directory = os.path.join(current_directory, directory)
file_paths = [os.path.join(directory, file) for file in os.listdir(directory) if file.endswith(".xlsx")]
column_patterns = ['ΑΙΤΗΣΗ', 'ΜΗΤΡΩΟ', 'ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ.*', 'ΕΤΑΙΡΕΙΑ', 'ΗΜΕΡ.*ΕΚΔ.*ΠΑΡΑΓ.*', 'ΗΜΕΡ.*ΛΗΞΗ.*ΠΑΡΑΓ.*', 'ΠΕΡΙΦΕΡΕΙΑ', 'ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ', 'ΔΗΜΟ.* ', 'ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ', 'ΘΕΣΗ', '.*(MW)', 'ΤΕΧΝΟΛ.*'] 

# Sort file paths based on their names to process them in order
file_paths.sort()

licenses_df = merge_dataframes_with_previous_files(file_paths, column_patterns)

Current working directory: /home/marios/projects/RAE_Forecasting/src/notebooks

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/Vevaioseis-APE.xlsx
Number of sheets detected: 1
Number of rows detected: 2840
Number of columns detected in 0 : 14
Number of rows detected after merge: 2840

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-OKTΩΒΡΙΟΣ-2023.xlsx
Number of sheets detected: 1
Number of rows detected: 5469
Number of columns detected in 0 : 14
Number of rows detected after merge: 8309

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΙΑΝΟΥΑΡΙΟΣ-2023.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path


Number of sheets detected: 1
Number of rows detected: 5231
Number of columns detected in 0 : 14
Number of rows detected after merge: 13540

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΙΟΥΛΙΟΣ-2023.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


Number of sheets detected: 1
Number of rows detected: 5403
Number of columns detected in 0 : 14
Number of rows detected after merge: 18943

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΙΟΥΝΙΟΣ-2023.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


Number of sheets detected: 1
Number of rows detected: 5390
Number of columns detected in 0 : 14
Number of rows detected after merge: 24333

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΜΑIOΣ-2023.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


Number of sheets detected: 1
Number of rows detected: 5336
Number of columns detected in 0 : 14
Number of rows detected after merge: 29669

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΜΑΡΤΙΟΣ-2023.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


Number of sheets detected: 1
Number of rows detected: 5316
Number of columns detected in 0 : 14
Number of rows detected after merge: 34985

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΜΑΡΤΙΟΣ-2024.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


Number of sheets detected: 1
Number of rows detected: 5211
Number of columns detected in 0 : 14
Number of rows detected after merge: 40196

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΝΟΕΜΒΡΙΟΣ-2023.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


Number of sheets detected: 1
Number of rows detected: 5410
Number of columns detected in 0 : 14
Number of rows detected after merge: 45606

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΣΕΠΤΕΜΒΡΙΟΣ-2023-1.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path


Number of sheets detected: 1
Number of rows detected: 5395
Number of columns detected in 0 : 14
Number of rows detected after merge: 51001

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΦΕΒΡΟΥΑΡΙΟΣ-2023.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path


Number of sheets detected: 1
Number of rows detected: 5250
Number of columns detected in 0 : 13
Number of rows detected after merge: 56251

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΦΕΒΡΟΥΑΡΙΟΣ-2024.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


Number of sheets detected: 1
Number of rows detected: 5333
Number of columns detected in 0 : 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


Number of rows detected after merge: 61584

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/Ν.4685-ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΑΥΓΟΥΣΤΟΣ-2021.xlsx
Number of sheets detected: 1
Number of rows detected: 4839
Number of columns detected in 0 : 14
Number of rows detected after merge: 66423

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/Ν.4685-ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΔΕΚΕΜΒΡΙΟΣ-2022.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path


Number of sheets detected: 1
Number of rows detected: 4819
Number of columns detected in 0 : 14
Number of rows detected after merge: 71242

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/Ν.4685-ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΜΑΡΤΙΟΣ-2021-v1.xlsx
Number of sheets detected: 1
Number of rows detected: 3324
Number of columns detected in 0 : 14
Number of rows detected after merge: 74566

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/Ν.4685-ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΜΑΪΟΣ-2021.xlsx
Number of sheets detected: 1
Number of rows detected: 4560
Number of columns detected in 0 : 14
Number of rows detected after merge: 79126

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/Ν.4685-ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΟΚΤΩΒΡΙΟΣ-2021.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path


Number of sheets detected: 1
Number of rows detected: 5184
Number of columns detected in 0 : 14


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path


Number of rows detected after merge: 84310

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/Ν.4685-ΜΗΤΡΩΟ-ΑΔΕΙΩΝ-ΕΡΓΑ-ΑΠΕ-ΣΕΠΤΕΜΒΡΙΟΣ-2022.xlsx
Number of sheets detected: 1
Number of rows detected: 4819
Number of columns detected in 0 : 14
Number of rows detected after merge: 89129

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/Ν.4685_1.xlsx
Number of sheets detected: 1
Number of rows detected: 5210
Number of columns detected in 0 : 14
Number of rows detected after merge: 94339

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/licenses/ΠΙΝΑΚΑΣ-ΑΙΤΙΟΛΟΓΗΜΕΝΩΝ-ΕΝΣΤΑΣΕΩΝ_ΑΝΑΡΤΗΣΗ_ΤΕΛΙΚΟ.xlsx
Number of sheets detected: 1
Number of rows detected: 61
Number of columns detected in 0 : 9
Number of rows detected after merge: 94400


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


In [4]:
licenses_df

Unnamed: 0,ΑΙΤΗΣΗ,ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ,ΕΤΑΙΡΕΙΑ,ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ,ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΠΕΡΙΦΕΡΕΙΑ,ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ,ΔΗΜΟΣ,ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ,ΘΕΣΗ,ΜΕΓΙΣΤΗ ΙΣΧΥΣ (MW),ΤΕΧΝΟΛΟΓΙΑ,file,AΡΙΘΜΟΣ ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ,ΗΜΕΡ/ΝΙΑ ΕΚΔ. ΑΔ. ΠΑΡΑΓΩΓΗΣ,ΙΣΧΥΣ (MW)
0,514,2001-02-19 00:00:00,ΛΑΚΜΟΣ ΕΝΕΡΓΕΙΑΚΗ ΑΕ,ΑΔ-00001,2001-05-29 00:00:00,2026-05-29 00:00:00,ΗΠΕΙΡΟΥ,ΙΩΑΝΝΙΝΩΝ,ΜΕΤΣΟΒΟΥ,ΕΓΝΑΤΙΑΣ,ΓΚΟΥΡΑ ΜΙΚΡΟ ΠΕΡΙΣΤΕΡΙ,2.155,ΜΥΗΕ,/home/marios/projects/RAE_Forecasting/src/note...,,,
1,67,2001-02-13 00:00:00,ΑΡΑΜΠΑΤΖΗΣ Β.Γ. ΑΕ,ΑΔ-00002,2001-05-29 00:00:00,2026-05-29 00:00:00,ΚΕΝΤΡΙΚΗΣ ΜΑΚΕΔΟΝΙΑΣ,ΠΕΛΛΑΣ,ΑΛΜΩΠΙΑΣ,ΑΡΙΔΑΙΑΣ,ΤΟΥΠΛΙΤΣΑ ΠΡΟΜΑΧΩΝΑ,0.830,ΜΥΗΕ,/home/marios/projects/RAE_Forecasting/src/note...,,,
2,38,2001-02-08 00:00:00,ΚΑΣΤΡΙ ΕΥΒΟΙΑΣ ΑΕ,ΑΔ-00022,2001-06-18 00:00:00,2041-03-22 00:00:00,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΒΟΙΑΣ,ΚΑΡΥΣΤΟΥ,ΚΑΡΥΣΤΟΥ,ΚΑΣΤΡΙ,5.000,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,,
3,439,2001-02-19 00:00:00,ΥΔΡΟΗΛΕΚΤΡΙΚΗ ΕΥΡΥΤΑΝΙΑΣ ΑΕ,ΑΔ-00023,2001-06-18 00:00:00,2026-06-18 00:00:00,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΡΥΤΑΝΙΑΣ,ΑΓΡΑΦΩΝ,ΑΓΡΑΦΩΝ,ΜΟΝΑΣΤΗΡΑΚΙ,3.000,ΜΥΗΕ,/home/marios/projects/RAE_Forecasting/src/note...,,,
4,65,2001-02-12 00:00:00,ΠΟΛΥΠΟΤΑΜΟΣ ΑΙΟΛΙΚΗ ΕΝΕΡΓΕΙΑ ΑΕ,ΑΔ-00024,2001-06-18 00:00:00,2026-06-18 00:00:00,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΒΟΙΑΣ,ΚΑΡΥΣΤΟΥ,ΣΤΥΡΕΩΝ,ΓΚΕΡΚΙ-ΠΥΡΓΑΡΙ ΠΟΛΥΠΟΤΑΜΟΥ,12.000,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94395,,,GREEN VELOCITY 2,,,,ΘΕΣΣΑΛΙΑΣ,ΤΡΙΚΑΛΩΝ,,"ΑΙΘΗΚΩΝ,ΑΣΠΡΟΠΟΤΑΜΟΥ",ΝΕΡΑΙΔΑ,,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,2020-09-26,14.4
94396,,,GREEN VELOCITY 2,,,,ΘΕΣΣΑΛΙΑΣ,ΤΡΙΚΑΛΩΝ,,"ΑΣΠΡΟΠΟΤΑΜΟΥ,ΚΛΕΙΝΟΒΟΥ",ΚΑΛΤΣΑ,,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,2020-09-26,14.4
94397,,,WIND ENERGY FACILITY ΜΟΝΟΠΡΟΣΩΠΗ ΙΔΙΩΤΙΚΗ ΚΕΦΑ...,,,,ΑΝ ΜΑΚΕΔΟΝΙΑΣ ΘΡΑΚΗΣ,ΡΟΔΟΠΗΣ,,ΦΙΛΛΥΡΑΣ,ΚΑΤΩ ΒΡΑΧΟΣ,,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,2020-09-28,3.0
94398,,,WIND ENERGY FACILITY ΜΟΝΟΠΡΟΣΩΠΗ ΙΔΙΩΤΙΚΗ ΚΕΦΑ...,,,,Δ ΜΑΚΕΔΟΝΙΑΣ,ΚΟΖΑΝΗΣ,,ΒΛΑΣΤΗΣ,ΜΠΕΚΡΕΒΙΝΙΚΟΣ,,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,2020-09-28,3.0


In [5]:
# Define the file paths and column patterns
# Define the directory containing Excel files
directory = "../../data/special_projects/"
# Print the current working directory
print(f"Current working directory: {os.getcwd()}")
# Get the directory of the current file
current_directory = os.getcwd()
directory = os.path.join(current_directory, directory)
file_paths = [os.path.join(directory, file) for file in os.listdir(directory) if file.endswith(".xlsx")]
column_patterns = ['ΑΙΤΗΣΗ', 'ΜΗΤΡΩΟ', 'ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ.*', 'ΕΤΑΙΡΕΙΑ', 'ΗΜΕΡ.*ΕΚΔ.*ΠΑΡΑΓ.*', 'ΗΜΕΡ.*ΛΗΞΗ.*ΠΑΡΑΓ.*', 'ΠΕΡΙΦΕΡΕΙΑ', 'ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ', 'ΔΗΜΟ.* ', 'ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ', 'ΘΕΣΗ', '.*(MW)', 'ΤΕΧΝΟΛ.*'] 

# Sort file paths based on their names to process them in order
file_paths.sort()

special_proj_df = merge_dataframes_with_previous_files(file_paths, column_patterns)

Current working directory: /home/marios/projects/RAE_Forecasting/src/notebooks

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202012.xlsx
Number of sheets detected: 1
Number of rows detected: 186
Number of columns detected in 0 : 14
Number of rows detected after merge: 186

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202103.xlsx
Number of sheets detected: 1
Number of rows detected: 185
Number of columns detected in 0 : 14
Number of rows detected after merge: 371

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202105.xlsx
Number of sheets detected: 1
Number of rows detected: 184
Number of columns detected in 0 : 14
Number of rows detected after merge: 555

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202108.xlsx
Number of sheets detected: 1
Number of rows detected: 184
Number of columns detec

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

Number of sheets detected: 1
Number of rows detected: 206
Number of columns detected in 0 : 14
Number of rows detected after merge: 1948

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202303.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

Number of sheets detected: 1
Number of rows detected: 206
Number of columns detected in 0 : 14
Number of rows detected after merge: 2154

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202305.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


Number of sheets detected: 1
Number of rows detected: 206
Number of columns detected in 0 : 14
Number of rows detected after merge: 2360

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202306.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


Number of sheets detected: 1
Number of rows detected: 204
Number of columns detected in 0 : 14
Number of rows detected after merge: 2564

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202307.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')


Number of sheets detected: 1
Number of rows detected: 204
Number of columns detected in 0 : 14
Number of rows detected after merge: 2768

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202309.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sheet['file'] = file_path
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[col] = df2[col].dt.strftime('%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

Number of sheets detected: 1
Number of rows detected: 207
Number of columns detected in 0 : 14
Number of rows detected after merge: 2975

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202310.xlsx
Number of sheets detected: 1
Number of rows detected: 207
Number of columns detected in 0 : 14
Number of rows detected after merge: 3182

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202311.xlsx
Number of sheets detected: 1
Number of rows detected: 201
Number of columns detected in 0 : 14
Number of rows detected after merge: 3383

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202402.xlsx
Number of sheets detected: 1
Number of rows detected: 200
Number of columns detected in 0 : 14
Number of rows detected after merge: 3583

Loading file : /home/marios/projects/RAE_Forecasting/src/notebooks/../../data/special_projects/202403.xlsx
Number of sheets det

In [6]:
orig_licenses_df = licenses_df.copy()

In [7]:
licenses_df

Unnamed: 0,ΑΙΤΗΣΗ,ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ,ΕΤΑΙΡΕΙΑ,ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ,ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΠΕΡΙΦΕΡΕΙΑ,ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ,ΔΗΜΟΣ,ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ,ΘΕΣΗ,ΜΕΓΙΣΤΗ ΙΣΧΥΣ (MW),ΤΕΧΝΟΛΟΓΙΑ,file,AΡΙΘΜΟΣ ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ,ΗΜΕΡ/ΝΙΑ ΕΚΔ. ΑΔ. ΠΑΡΑΓΩΓΗΣ,ΙΣΧΥΣ (MW)
0,514,2001-02-19 00:00:00,ΛΑΚΜΟΣ ΕΝΕΡΓΕΙΑΚΗ ΑΕ,ΑΔ-00001,2001-05-29 00:00:00,2026-05-29 00:00:00,ΗΠΕΙΡΟΥ,ΙΩΑΝΝΙΝΩΝ,ΜΕΤΣΟΒΟΥ,ΕΓΝΑΤΙΑΣ,ΓΚΟΥΡΑ ΜΙΚΡΟ ΠΕΡΙΣΤΕΡΙ,2.155,ΜΥΗΕ,/home/marios/projects/RAE_Forecasting/src/note...,,,
1,67,2001-02-13 00:00:00,ΑΡΑΜΠΑΤΖΗΣ Β.Γ. ΑΕ,ΑΔ-00002,2001-05-29 00:00:00,2026-05-29 00:00:00,ΚΕΝΤΡΙΚΗΣ ΜΑΚΕΔΟΝΙΑΣ,ΠΕΛΛΑΣ,ΑΛΜΩΠΙΑΣ,ΑΡΙΔΑΙΑΣ,ΤΟΥΠΛΙΤΣΑ ΠΡΟΜΑΧΩΝΑ,0.830,ΜΥΗΕ,/home/marios/projects/RAE_Forecasting/src/note...,,,
2,38,2001-02-08 00:00:00,ΚΑΣΤΡΙ ΕΥΒΟΙΑΣ ΑΕ,ΑΔ-00022,2001-06-18 00:00:00,2041-03-22 00:00:00,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΒΟΙΑΣ,ΚΑΡΥΣΤΟΥ,ΚΑΡΥΣΤΟΥ,ΚΑΣΤΡΙ,5.000,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,,
3,439,2001-02-19 00:00:00,ΥΔΡΟΗΛΕΚΤΡΙΚΗ ΕΥΡΥΤΑΝΙΑΣ ΑΕ,ΑΔ-00023,2001-06-18 00:00:00,2026-06-18 00:00:00,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΡΥΤΑΝΙΑΣ,ΑΓΡΑΦΩΝ,ΑΓΡΑΦΩΝ,ΜΟΝΑΣΤΗΡΑΚΙ,3.000,ΜΥΗΕ,/home/marios/projects/RAE_Forecasting/src/note...,,,
4,65,2001-02-12 00:00:00,ΠΟΛΥΠΟΤΑΜΟΣ ΑΙΟΛΙΚΗ ΕΝΕΡΓΕΙΑ ΑΕ,ΑΔ-00024,2001-06-18 00:00:00,2026-06-18 00:00:00,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΒΟΙΑΣ,ΚΑΡΥΣΤΟΥ,ΣΤΥΡΕΩΝ,ΓΚΕΡΚΙ-ΠΥΡΓΑΡΙ ΠΟΛΥΠΟΤΑΜΟΥ,12.000,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94395,,,GREEN VELOCITY 2,,,,ΘΕΣΣΑΛΙΑΣ,ΤΡΙΚΑΛΩΝ,,"ΑΙΘΗΚΩΝ,ΑΣΠΡΟΠΟΤΑΜΟΥ",ΝΕΡΑΙΔΑ,,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,2020-09-26,14.4
94396,,,GREEN VELOCITY 2,,,,ΘΕΣΣΑΛΙΑΣ,ΤΡΙΚΑΛΩΝ,,"ΑΣΠΡΟΠΟΤΑΜΟΥ,ΚΛΕΙΝΟΒΟΥ",ΚΑΛΤΣΑ,,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,2020-09-26,14.4
94397,,,WIND ENERGY FACILITY ΜΟΝΟΠΡΟΣΩΠΗ ΙΔΙΩΤΙΚΗ ΚΕΦΑ...,,,,ΑΝ ΜΑΚΕΔΟΝΙΑΣ ΘΡΑΚΗΣ,ΡΟΔΟΠΗΣ,,ΦΙΛΛΥΡΑΣ,ΚΑΤΩ ΒΡΑΧΟΣ,,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,2020-09-28,3.0
94398,,,WIND ENERGY FACILITY ΜΟΝΟΠΡΟΣΩΠΗ ΙΔΙΩΤΙΚΗ ΚΕΦΑ...,,,,Δ ΜΑΚΕΔΟΝΙΑΣ,ΚΟΖΑΝΗΣ,,ΒΛΑΣΤΗΣ,ΜΠΕΚΡΕΒΙΝΙΚΟΣ,,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...,,2020-09-28,3.0


In [8]:
licenses_df = fillna_column(licenses_df, 'ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ', 'AΡΙΘΜΟΣ ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ')
licenses_df = fillna_column(licenses_df, 'ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ', 'ΗΜΕΡ/ΝΙΑ ΕΚΔ. ΑΔ. ΠΑΡΑΓΩΓΗΣ')
licenses_df = fillna_column(licenses_df, 'ΜΕΓΙΣΤΗ ΙΣΧΥΣ (MW)', 'ΙΣΧΥΣ (MW)')

licenses_df.columns = special_proj_df.columns

In [9]:
licenses_df.isna().sum()

ΑΙΤΗΣΗ                             62
ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ        79
ΕΤΑΙΡΕΙΑ                            4
ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ           5315
ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ        4
ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ      65
ΠΕΡΙΦΕΡΕΙΑ                          4
ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ                4
ΔΗΜΟΣ                              65
ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ                   12
ΘΕΣΗ                                4
ΙΣΧΥΣ (MW)                          4
ΤΕΧΝΟΛΟΓΙΑ                          4
file                                0
dtype: int64

In [10]:
special_proj_df.head()

Unnamed: 0,ΑΙΤΗΣΗ,ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ,ΕΤΑΙΡΕΙΑ,ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ,ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΠΕΡΙΦΕΡΕΙΑ,ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ,ΔΗΜΟΣ,ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ,ΘΕΣΗ,ΙΣΧΥΣ (MW),ΤΕΧΝΟΛΟΓΙΑ,file
0,B-00279,2001-02-19,ΔΕΗ ΑΝΑΝΕΩΣΙΜΕΣ ΑΕ,ΑΔ-00488,2003-03-04,2028-03-04,ΒΟΡΕΙΟΥ ΑΙΓΑΙΟΥ,ΛΕΣΒΟΥ,ΛΕΣΒΟΥ,ΠΕΤΡΑΣ,ΣΤΥΨΗ/ΠΕΡΙΟΧΗ ΑΡΓΕΝΟΥ,8.0,ΓΕΩΘΕΡΜΙΑ,/home/marios/projects/RAE_Forecasting/src/note...
1,Γ-00798,2004-11-18,ΤΕΡΝΑ ΕΝΕΡΓΕΙΑΚΗ ΑΙ ΓΙΩΡΓΗΣ ΑΕ,ΑΔ-00911,2006-03-23,2021-03-23,ΑΤΤΙΚΗΣ,ΝΗΣΩΝ,ΥΔΡΑΣ,ΥΔΡΑΣ,ΝΗΣΙΔΑ ΑΓΙΟΣ ΓΕΩΡΓΙΟΣ,45.0,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...
2,Γ-01329,2006-01-24,ΑΙΟΛΙΚΑ ΠΑΡΚΑ ΠΑΛΙΟΠΥΡΓΟΣ ΑΕ,ΑΔ-00981,2006-10-02,2021-10-02,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΒΟΙΑΣ,ΚΑΡΥΣΤΟΥ,ΚΑΡΥΣΤΟΥ & ΚΑΦΗΡΕΩΣ,ΠΑΛΙΟΠΥΡΓΟΣ,13.8,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...
3,Γ-01089,2005-07-22,ΤΕΡΝΑ ΕΝΕΡΓΕΙΑΚΗ ΑΙ ΓΙΩΡΓΗΣ ΑΕ,ΑΔ-00989,2006-10-16,2021-10-16,ΑΤΤΙΚΗΣ,ΝΗΣΩΝ,ΥΔΡΑΣ,ΥΔΡΑΣ,ΝΗΣΙΔΑ ΑΓΙΟΣ ΓΕΩΡΓΙΟΣ,24.0,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...
4,Γ-01328,2006-01-24,ΑΙΟΛΙΚΑ ΠΑΡΚΑ ΠΛΑΤΑΝΟΣ ΜΟΝΟΠΡΟΣΩΠΗ ΑΕ,ΑΔ-01257,2009-06-16,2034-06-16,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΒΟΙΑΣ,ΚΑΡΥΣΤΟΥ,ΜΑΡΜΑΡΙΟΥ,ΠΛΑΤΑΝΟΣ,13.8,ΑΙΟΛΙΚΑ,/home/marios/projects/RAE_Forecasting/src/note...


In [11]:
all_ape_data = pd.concat([licenses_df, special_proj_df], ignore_index=True)
all_ape_data['file'] = all_ape_data['file'].str.split('/').str[-1].str.split('.').str[0]

In [12]:
all_ape_data

Unnamed: 0,ΑΙΤΗΣΗ,ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ,ΕΤΑΙΡΕΙΑ,ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ,ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΠΕΡΙΦΕΡΕΙΑ,ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ,ΔΗΜΟΣ,ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ,ΘΕΣΗ,ΙΣΧΥΣ (MW),ΤΕΧΝΟΛΟΓΙΑ,file
0,514,2001-02-19 00:00:00,ΛΑΚΜΟΣ ΕΝΕΡΓΕΙΑΚΗ ΑΕ,ΑΔ-00001,2001-05-29 00:00:00,2026-05-29 00:00:00,ΗΠΕΙΡΟΥ,ΙΩΑΝΝΙΝΩΝ,ΜΕΤΣΟΒΟΥ,ΕΓΝΑΤΙΑΣ,ΓΚΟΥΡΑ ΜΙΚΡΟ ΠΕΡΙΣΤΕΡΙ,2.155,ΜΥΗΕ,Vevaioseis-APE
1,67,2001-02-13 00:00:00,ΑΡΑΜΠΑΤΖΗΣ Β.Γ. ΑΕ,ΑΔ-00002,2001-05-29 00:00:00,2026-05-29 00:00:00,ΚΕΝΤΡΙΚΗΣ ΜΑΚΕΔΟΝΙΑΣ,ΠΕΛΛΑΣ,ΑΛΜΩΠΙΑΣ,ΑΡΙΔΑΙΑΣ,ΤΟΥΠΛΙΤΣΑ ΠΡΟΜΑΧΩΝΑ,0.830,ΜΥΗΕ,Vevaioseis-APE
2,38,2001-02-08 00:00:00,ΚΑΣΤΡΙ ΕΥΒΟΙΑΣ ΑΕ,ΑΔ-00022,2001-06-18 00:00:00,2041-03-22 00:00:00,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΒΟΙΑΣ,ΚΑΡΥΣΤΟΥ,ΚΑΡΥΣΤΟΥ,ΚΑΣΤΡΙ,5.000,ΑΙΟΛΙΚΑ,Vevaioseis-APE
3,439,2001-02-19 00:00:00,ΥΔΡΟΗΛΕΚΤΡΙΚΗ ΕΥΡΥΤΑΝΙΑΣ ΑΕ,ΑΔ-00023,2001-06-18 00:00:00,2026-06-18 00:00:00,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΡΥΤΑΝΙΑΣ,ΑΓΡΑΦΩΝ,ΑΓΡΑΦΩΝ,ΜΟΝΑΣΤΗΡΑΚΙ,3.000,ΜΥΗΕ,Vevaioseis-APE
4,65,2001-02-12 00:00:00,ΠΟΛΥΠΟΤΑΜΟΣ ΑΙΟΛΙΚΗ ΕΝΕΡΓΕΙΑ ΑΕ,ΑΔ-00024,2001-06-18 00:00:00,2026-06-18 00:00:00,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΒΟΙΑΣ,ΚΑΡΥΣΤΟΥ,ΣΤΥΡΕΩΝ,ΓΚΕΡΚΙ-ΠΥΡΓΑΡΙ ΠΟΛΥΠΟΤΑΜΟΥ,12.000,ΑΙΟΛΙΚΑ,Vevaioseis-APE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98176,Γ-011373_Α,2020-12-16,ΑΙΟΛΙΚΗ ΘΕΟΔΩΡΩΝ ΑΝΩΝΥΜΗ ΒΙΟΜΗΧΑΝΙΚΗ ΚΑΙ ΕΝΕΡ...,ΑΔ-04888,2022-11-04,2047-11-04,ΠΕΛΟΠΟΝΝΗΣΟΥ,ΑΡΚΑΔΙΑΣ,"ΜΕΓΑΛΟΠΟΛΗΣ,ΓΟΡΤΥΝΙΑΣ,ΤΡΙΠΟΛΗΣ","ΜΕΓΑΛΟΠΟΛΗΣ,ΤΡΙΚΟΛΩΝΩΝ,ΦΑΛΑΝΘΟΥ","ΜΠΛΕΣΙΒΟΣ, ΨΑΡΟΒΟΥΝΙ, ΛΙΜΝΕΣ ΡΟΔΙΑΣ, ΔΙΑΚΟΠΙ, ...",187.110,ΑΙΟΛΙΚΑ,202403
98177,Γ-011380_Α,2020-12-16,ΑΙΟΛΙΚΗ ΘΕΟΔΩΡΩΝ ΑΝΩΝΥΜΗ ΒΙΟΜΗΧΑΝΙΚΗ ΚΑΙ ΕΝΕΡ...,ΑΔ-04889,2022-11-04,2047-11-04,"ΔΥΤΙΚΗΣ ΕΛΛΑΔΑΣ,ΠΕΛΟΠΟΝΝΗΣΟΥ","ΗΛΕΙΑΣ,ΑΡΚΑΔΙΑΣ,ΜΕΣΣΗΝΙΑΣ","ΑΝΔΡΙΤΣΑΙΝΑΣ - ΚΡΕΣΤΕΝΩΝ,ΜΕΓΑΛΟΠΟΛΗΣ,ΟΙΧΑΛΙΑΣ,...","ΑΛΙΦΕΙΡΑΣ,ΑΝΔΡΙΤΣΑΙΝΗΣ,ΓΟΡΤΥΝΟΣ,ΕΙΡΑΣ,ΖΑΧΑΡΩΣ,...","ΓΥΜΝΟΡΡΑΧΗ, ΡΟΣΚΙΟ, ΜΠΡΙΝΙΑ, ΜΠΑΝΤΑΒΑΣ, ΚΟΥΚΟΥ...",417.600,ΑΙΟΛΙΚΑ,202403
98178,Γ-011383_Α,2020-12-16,ΑΙΟΛΙΚΗ ΘΕΟΔΩΡΩΝ ΑΝΩΝΥΜΗ ΒΙΟΜΗΧΑΝΙΚΗ ΚΑΙ ΕΝΕΡ...,ΑΔ-04890,2022-11-04,2047-11-04,"ΠΕΛΟΠΟΝΝΗΣΟΥ,ΔΥΤΙΚΗΣ ΕΛΛΑΔΑΣ","ΑΡΚΑΔΙΑΣ,ΑΧΑΪΑΣ,ΚΟΡΙΝΘΙΑΣ","ΓΟΡΤΥΝΙΑΣ,ΚΑΛΑΒΡΥΤΩΝ,ΤΡΙΠΟΛΗΣ,ΕΡΥΜΑΝΘΟΥ,ΣΙΚΥΩΝΙΩΝ","ΒΥΤΙΝΑΣ,ΚΑΛΑΒΡΥΤΩΝ,ΚΛΕΙΤΟΡΙΑΣ, ΚΛΕΙΤΟΡΟΣ,ΚΟΝ...","ΛΑΚΩΜΑΤΑ, ΠΑΝΑΓΙΑ, ΣΑΙΤΑΣ, ΔΡΑΚΟΒΟΥΝΙ, ΒΥΘΟΥΛΑ...",280.665,ΑΙΟΛΙΚΑ,202403
98179,Γ-013272,2021-06-09,VOLTON ΕΛΛΗΝΙΚΗ ΕΝΕΡΓΕΙΑΚΗ ΑΝΩΝΥΜΗ ΕΤΑΙΡΕΙΑ,ΑΔ-04891,2022-11-04,2047-11-04,ΠΕΛΟΠΟΝΝΗΣΟΥ,"ΑΡΓΟΛΙΔΑΣ, ΑΡΚΑΔΙΑΣ","ΑΡΓΟΥΣ -ΜΥΚΗΝΩΝ, ΤΡΙΠΟΛΗΣ, ΒΟΡΕΙΑΣ ΚΥΝΟΥΡΙΑΣ, ...","ΑΡΓΟΥΣ, ΒΑΛΤΕΤΣΙΟΥ, ΒΟΡΕΙΑΣ ΚΥΝΟΥΡΙΑΣ, ΛΥΡΚΕΙΑ...",ΑΓΙΟΣ ΠΕΤΡΟΣ-ΒΑΛΤΕΤΣΙ-ΝΕΣΤΑΝΗ-ΤΣΕΜΠΕΡΟΥ,298.200,ΑΙΟΛΙΚΑ,202403


In [13]:
all_ape_data.isna().sum()

ΑΙΤΗΣΗ                             62
ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ       101
ΕΤΑΙΡΕΙΑ                           26
ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ           5337
ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ       26
ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ      87
ΠΕΡΙΦΕΡΕΙΑ                         26
ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ               26
ΔΗΜΟΣ                              87
ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ                   34
ΘΕΣΗ                               26
ΙΣΧΥΣ (MW)                         26
ΤΕΧΝΟΛΟΓΙΑ                         26
file                                0
dtype: int64

### Preprocessing

In [14]:
all_ape_data_nodup = all_ape_data.drop(columns='file').drop_duplicates()

In [15]:
all_ape_data_nodup = convert_to_datetime(all_ape_data_nodup, columns=['ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ', 'ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ', 'ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ'])

Attempting column: ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ
Column ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ transformed successfully.

Attempting column: ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ
Column ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ transformed successfully.

Attempting column: ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ
Column ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ transformed successfully.



#### Investigation

In [16]:
all_ape_data_nodup.value_counts().head(30)

ΑΙΤΗΣΗ    ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ  ΕΤΑΙΡΕΙΑ                                                                              ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ  ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ  ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ  ΠΕΡΙΦΕΡΕΙΑ            ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ  ΔΗΜΟΣ                  ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ   ΘΕΣΗ                                                                       ΙΣΧΥΣ (MW)  ΤΕΧΝΟΛΟΓΙΑ  
Γ-013090  2021-08-06                    SUNSTAR ΙΔΙΩΤΙΚΗ ΚΕΦΑΛΑΙΟΥΧΙΚΗ ΕΤΑΙΡΕΙΑ                                              ΑΔ-09024                2021-10-13                    2046-10-13                     ΘΕΣΣΑΛΙΑΣ             ΛΑΡΙΣΑΣ               ΚΙΛΕΛΕΡ,ΦΑΡΣΑΛΩΝ       ΕΝΙΠΠΕΑ,ΚΡΑΝΝΩΝΟΣ  ΚΥΠΑΡΙΣΣΟΣ                                                                 255.000     ΦΩΤΟΒΟΛΤΑΪΚΑ    5
Γ-012305  2021-08-02                    ΑΓ ΚΩΝΣΤΑΝΤΙΝΟΣ 1 Ι Κ Ε                                                              ΑΔ-08675                2021-06-29                    2046-06-29              

In [17]:
all_ape_data_nodup.isna().sum()

ΑΙΤΗΣΗ                             62
ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ        71
ΕΤΑΙΡΕΙΑ                            4
ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ           5314
ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ       15
ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ      85
ΠΕΡΙΦΕΡΕΙΑ                          4
ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ                4
ΔΗΜΟΣ                              65
ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ                    9
ΘΕΣΗ                                4
ΙΣΧΥΣ (MW)                          4
ΤΕΧΝΟΛΟΓΙΑ                          4
dtype: int64

In [18]:
all_ape_data_nodup["ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ"].value_counts()

ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ
ΑΔ-02383    11
ΑΔ-01900    11
ΑΔ-06102    10
ΑΔ-06723    10
ΑΔ-06006    10
            ..
ΑΔ-08638     1
ΑΔ-08637     1
ΑΔ-08618     1
ΑΔ-08617     1
ΑΔ-08306     1
Name: count, Length: 6768, dtype: int64

#### RSI Calculation

In [19]:
all_ape_data_nodup_rsi = add_rsi(all_ape_data_nodup)

In [20]:
all_ape_data_nodup_rsi

Unnamed: 0,ΑΙΤΗΣΗ,ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ,ΕΤΑΙΡΕΙΑ,ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ,ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΠΕΡΙΦΕΡΕΙΑ,ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ,ΔΗΜΟΣ,ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ,ΘΕΣΗ,ΙΣΧΥΣ (MW),ΤΕΧΝΟΛΟΓΙΑ,RSI,RSI_ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ,RSI_ΔΗΜΟΣ,RSI_ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ,RSI_ΘΕΣΗ
0,514,2001-02-19,ΛΑΚΜΟΣ ΕΝΕΡΓΕΙΑΚΗ ΑΕ,ΑΔ-00001,2001-05-29,2026-05-29,ΗΠΕΙΡΟΥ,ΙΩΑΝΝΙΝΩΝ,ΜΕΤΣΟΒΟΥ,ΕΓΝΑΤΙΑΣ,ΓΚΟΥΡΑ ΜΙΚΡΟ ΠΕΡΙΣΤΕΡΙ,2.155,ΜΥΗΕ,3.584713,4.546388,2.459849,5.373287,9.019446
1,67,2001-02-13,ΑΡΑΜΠΑΤΖΗΣ Β.Γ. ΑΕ,ΑΔ-00002,2001-05-29,2026-05-29,ΚΕΝΤΡΙΚΗΣ ΜΑΚΕΔΟΝΙΑΣ,ΠΕΛΛΑΣ,ΑΛΜΩΠΙΑΣ,ΑΡΙΔΑΙΑΣ,ΤΟΥΠΛΙΤΣΑ ΠΡΟΜΑΧΩΝΑ,0.830,ΜΥΗΕ,1.191248,6.667842,8.607129,8.531909,9.019446
2,38,2001-02-08,ΚΑΣΤΡΙ ΕΥΒΟΙΑΣ ΑΕ,ΑΔ-00022,2001-06-18,2041-03-22,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΒΟΙΑΣ,ΚΑΡΥΣΤΟΥ,ΚΑΡΥΣΤΟΥ,ΚΑΣΤΡΙ,5.000,ΑΙΟΛΙΚΑ,7.230080,5.585120,6.619634,2.944487,3.527086
3,439,2001-02-19,ΥΔΡΟΗΛΕΚΤΡΙΚΗ ΕΥΡΥΤΑΝΙΑΣ ΑΕ,ΑΔ-00023,2001-06-18,2026-06-18,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΡΥΤΑΝΙΑΣ,ΑΓΡΑΦΩΝ,ΑΓΡΑΦΩΝ,ΜΟΝΑΣΤΗΡΑΚΙ,3.000,ΜΥΗΕ,0.614962,5.595114,4.538448,4.919698,6.313612
4,65,2001-02-12,ΠΟΛΥΠΟΤΑΜΟΣ ΑΙΟΛΙΚΗ ΕΝΕΡΓΕΙΑ ΑΕ,ΑΔ-00024,2001-06-18,2026-06-18,ΣΤΕΡΕΑΣ ΕΛΛΑΔΟΣ,ΕΥΒΟΙΑΣ,ΚΑΡΥΣΤΟΥ,ΣΤΥΡΕΩΝ,ΓΚΕΡΚΙ-ΠΥΡΓΑΡΙ ΠΟΛΥΠΟΤΑΜΟΥ,12.000,ΑΙΟΛΙΚΑ,7.230080,5.585120,6.619634,6.046433,8.565779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39419,Γ-06245,2017-06-12,F-GRID ΝΕΜΕΣΙΣ ΑΙΟΛΙΚΗ ΙΔΙΩΤΙΚΗ ΚΕΦΑΛΑΙΟΥΧΙΚΗ ...,ΑΔ-04523,2021-07-29,2046-07-29,ΣΤ ΕΛΛΑΔΑΣ,ΕΥΒΟΙΑΣ,ΚΥΜΗΣ - ΑΛΙΒΕΡΙΟΥ,"ΚΟΝΙΣΤΡΩΝ,ΤΑΜΥΝΕΩΝ",ΜΑΚΡΥΑ ΛΕΝΙΑ,25.000,ΑΙΟΛΙΚΑ,1.593403,2.256876,2.720094,1.929145,1.929145
39420,Γ-05876,2014-09-10,ΤΕΡΝΑ ΕΝΕΡΓΕΙΑΚΗ ΑΝΩΝΥΜΗ ΒΙΟΜΗΧΑΝΙΚΗ ΕΜΠΟΡΙΚΗ ...,ΑΔ-04525,2021-07-29,2046-07-29,ΣΤ ΕΛΛΑΔΑΣ,ΕΥΒΟΙΑΣ,"ΚΥΜΗΣ - ΑΛΙΒΕΡΙΟΥ, ΚΑΡΥΣΤΟΥ",ΑΥΛΩΝΟΣ - ΔΥΣΤΥΩΝ - ΜΑΡΜΑΡΙΟΥ - ΣΤΥΡΕΩΝ,ΑΣΠΡΗ ΠΕΤΡΑ - ΠΥΡΓΑΡΙ - ΑΧΛΑΔΙΕΣ,111.500,ΑΙΟΛΙΚΑ,1.593403,2.256876,1.929145,2.893717,2.893717
39421,Γ-05948,2015-06-05,ΤΕΡΝΑ ΕΝΕΡΓΕΙΑΚΗ ΑΝΩΝΥΜΗ ΒΙΟΜΗΧΑΝΙΚΗ ΕΜΠΟΡΙΚΗ ...,ΑΔ-04526,2021-07-29,2046-07-29,ΣΤ ΕΛΛΑΔΑΣ,ΕΥΒΟΙΑΣ,ΚΥΜΗΣ - ΑΛΙΒΕΡΙΟΥ,ΔΥΣΤΥΩΝ,ΑΓΙΑ ΤΡΙΑΣ,4.200,ΑΙΟΛΙΚΑ,1.593403,2.256876,2.720094,2.723498,0.890374
39422,Γ-02883,2007-10-09,ΕΥΒΟΪΚΟΣ ΑΝΕΜΟΣ ΑΝΩΝΥΜΗ ΕΤΑΙΡΕΙΑ ΠΑΡΑΓΩΓΗΣ ΗΛΕ...,ΑΔ-04529,2021-07-15,2046-07-15,ΣΤ ΕΛΛΑΔΑΣ,ΕΥΒΟΙΑΣ,ΔΙΡΦΥΩΝ - ΜΕΣΣΑΠΙΩΝ,ΔΙΡΦΥΩΝ,ΨΗΛΟΣ ΒΡΑΧΟΣ ΜΕΣΣΑΠΙΩΝ,25.300,ΑΙΟΛΙΚΑ,1.593403,2.256876,2.066941,2.893717,2.893717


### Interpretation
- RSI > 1: The region is more specialized in that type of RES compared to the national average.
- RSI < 1: The region is less specialized in that type of RES compared to the national average.
- RSI = 1: The region has an average specialization in that type of RES compared to the national average.
<br><br>
By calculating and analyzing the RSI, you can identify which regions or companies are particularly focused on specific types of renewable energy, providing valuable insights into regional strengths and specialization in the renewable energy sector.

#### Add diff between issue and expiration

In [21]:
all_ape_data_nodup_rsi["ΔΙΑΡΚΕΙΑ"] = (all_ape_data_nodup_rsi['ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ'] - all_ape_data_nodup_rsi['ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ']).dt.days

In [22]:
all_ape_data_nodup_rsi.head(2)

Unnamed: 0,ΑΙΤΗΣΗ,ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ,ΕΤΑΙΡΕΙΑ,ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ,ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ,ΠΕΡΙΦΕΡΕΙΑ,ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ,ΔΗΜΟΣ,ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ,ΘΕΣΗ,ΙΣΧΥΣ (MW),ΤΕΧΝΟΛΟΓΙΑ,RSI,RSI_ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ,RSI_ΔΗΜΟΣ,RSI_ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ,RSI_ΘΕΣΗ,ΔΙΑΡΚΕΙΑ
0,514,2001-02-19,ΛΑΚΜΟΣ ΕΝΕΡΓΕΙΑΚΗ ΑΕ,ΑΔ-00001,2001-05-29,2026-05-29,ΗΠΕΙΡΟΥ,ΙΩΑΝΝΙΝΩΝ,ΜΕΤΣΟΒΟΥ,ΕΓΝΑΤΙΑΣ,ΓΚΟΥΡΑ ΜΙΚΡΟ ΠΕΡΙΣΤΕΡΙ,2.155,ΜΥΗΕ,3.584713,4.546388,2.459849,5.373287,9.019446,9131.0
1,67,2001-02-13,ΑΡΑΜΠΑΤΖΗΣ Β.Γ. ΑΕ,ΑΔ-00002,2001-05-29,2026-05-29,ΚΕΝΤΡΙΚΗΣ ΜΑΚΕΔΟΝΙΑΣ,ΠΕΛΛΑΣ,ΑΛΜΩΠΙΑΣ,ΑΡΙΔΑΙΑΣ,ΤΟΥΠΛΙΤΣΑ ΠΡΟΜΑΧΩΝΑ,0.83,ΜΥΗΕ,1.191248,6.667842,8.607129,8.531909,9.019446,9131.0


### Time series feature engineering

In [48]:
# Aggregate MW data by month
monthly_aggregation = all_ape_data_nodup_rsi.groupby(pd.Grouper(key='ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ', freq='M')).agg({
    'ΙΣΧΥΣ (MW)': ['count','sum', 'mean', 'min', 'max'],
    'RSI': ['mean', 'min', 'max'],
    'RSI_ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ': ['mean', 'min', 'max'],
    'RSI_ΔΗΜΟΣ ': ['mean', 'min', 'max'],
    'RSI_ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ': ['mean', 'min', 'max'],
    'RSI_ΘΕΣΗ': ['mean', 'min', 'max'],
})

monthly_aggregation = monthly_aggregation.dropna()

# Extend DataFrame with future dates
future_dates = pd.date_range(start=monthly_aggregation.index[-1], periods=12, freq='M') + pd.DateOffset(months=1)
future_index = pd.DatetimeIndex(future_dates)
future_data = pd.DataFrame(index=future_index)

# Concatenate current and future data
monthly_aggregation = pd.concat([monthly_aggregation, future_data])

In [49]:
# Flatten the multi-index columns
monthly_aggregation.columns = ['_'.join(col).strip() for col in monthly_aggregation.columns.values]

In [50]:
monthly_aggregation

Unnamed: 0,ΙΣΧΥΣ (MW)_count,ΙΣΧΥΣ (MW)_sum,ΙΣΧΥΣ (MW)_mean,ΙΣΧΥΣ (MW)_min,ΙΣΧΥΣ (MW)_max,RSI_mean,RSI_min,RSI_max,RSI_ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ_mean,RSI_ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ_min,RSI_ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ_max,RSI_ΔΗΜΟΣ _mean,RSI_ΔΗΜΟΣ _min,RSI_ΔΗΜΟΣ _max,RSI_ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ_mean,RSI_ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ_min,RSI_ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ_max,RSI_ΘΕΣΗ_mean,RSI_ΘΕΣΗ_min,RSI_ΘΕΣΗ_max
1988-12-31,2.0,17.00,8.500000,8.500,8.500,0.614962,0.614962,0.614962,0.973781,0.973781,0.973781,0.943573,0.943573,0.943573,1.288492,1.288492,1.288492,9.019446,9.019446,9.019446
1990-12-31,2.0,0.33,0.165000,0.165,0.165,0.718299,0.718299,0.718299,2.121334,2.121334,2.121334,2.893717,2.893717,2.893717,2.893717,2.893717,2.893717,2.893717,2.893717,2.893717
1991-12-31,6.0,2.27,0.378333,0.300,0.450,0.478968,0.113554,0.718299,1.586117,0.568794,2.143494,4.459848,2.095450,9.019446,4.725937,2.264648,9.019446,4.397692,1.279913,9.019446
1992-12-31,14.0,47.51,3.393571,0.700,10.800,0.752677,0.605050,1.191248,2.482484,2.016114,4.040385,2.464288,2.073111,3.616935,3.276151,0.826776,9.019446,3.663932,2.159490,9.019446
1993-12-31,4.0,13.20,3.300000,0.600,6.000,1.574075,1.191248,1.956902,1.619850,1.219773,2.019928,2.394163,2.066941,2.721385,3.255486,2.130097,4.380874,5.956582,2.893717,9.019446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-30,,,,,,,,,,,,,,,,,,,,
2024-11-30,,,,,,,,,,,,,,,,,,,,
2024-12-30,,,,,,,,,,,,,,,,,,,,
2025-01-31,,,,,,,,,,,,,,,,,,,,


In [36]:
# def calculate_rolling_metrics(data, window_sizes=[3, 6, 12]):
#     rolling_metrics = pd.DataFrame(index=data.index)

#     for window_size in window_sizes:
#         rolling_metrics[f'MW_mean_rolling_{window_size}'] = data.rolling(window=window_size).mean()
#         rolling_metrics[f'MW_std_rolling_{window_size}'] = data.rolling(window=window_size).std()
#         rolling_metrics[f'MW_min_rolling_{window_size}'] = data.rolling(window=window_size).min()
#         rolling_metrics[f'MW_max_rolling_{window_size}'] = data.rolling(window=window_size).max()
#         rolling_metrics[f'MW_ewm_{window_size}'] = data.ewm(span=window_size).mean()

#     return rolling_metrics

# def calculate_autocorrelation(data, lags=[1, 2, 3]):
#     autocorr = pd.DataFrame(index=data.index)

#     for lag in lags:
#         autocorr[f'MW_autocorr_lag_{lag}'] = data.autocorr(lag=lag)

#     return autocorr

# def create_lagged_variables(data, lag_lengths=[1, 2, 3]):
#     lagged_data = pd.DataFrame(index=data.index)

#     for lag_length in lag_lengths:
#         lagged_data[f'MW_sum_lagged_{lag_length}'] = data.shift(lag_length)

#     return lagged_data

# def calculate_seasonal_aggregations(data):
#     seasonal_aggregations = pd.DataFrame(index=data.index)

#     seasonal_aggregations['MW_mean_monthly'] = data.groupby(data.index.month).transform('mean')

#     return seasonal_aggregations

# def extract_time_based_features(data):
#     time_features = pd.DataFrame(index=data.index)

#     time_features['month'] = data.index.month
#     time_features['year'] = data.index.year

#     return time_features

# window_sizes = [i for i in range(1, 36)]
# lags = [i for i in range(1, 36)]

# rolling_metrics = calculate_rolling_metrics(monthly_aggregation['ΙΣΧΥΣ (MW)_sum'], window_sizes=window_sizes)
# autocorr = calculate_autocorrelation(monthly_aggregation['ΙΣΧΥΣ (MW)_sum'], lags=lags)
# lagged_data = create_lagged_variables(monthly_aggregation['ΙΣΧΥΣ (MW)_sum'], lag_lengths=lags)
# seasonal_aggregations = calculate_seasonal_aggregations(monthly_aggregation['ΙΣΧΥΣ (MW)_sum'])
# time_features = extract_time_based_features(monthly_aggregation)

# # Concatenate all the calculated features
# final_result = pd.concat([monthly_aggregation, rolling_metrics, autocorr, lagged_data, seasonal_aggregations, time_features], axis=1)

# final_result.tail(30)

In [53]:
from statsmodels.tsa.seasonal import seasonal_decompose

def calculate_rolling_metrics(data, mw_prefix='MW_', window_sizes=[3, 6, 12]):
    rolling_metrics = pd.DataFrame(index=data.index)

    for window_size in window_sizes:
        for feature in data.filter(like=mw_prefix).columns:
            rolling_metrics[f'{feature}_mean_rolling_{window_size}'] = data[feature].rolling(window=window_size).mean()
            rolling_metrics[f'{feature}_std_rolling_{window_size}'] = data[feature].rolling(window=window_size).std()
            rolling_metrics[f'{feature}_min_rolling_{window_size}'] = data[feature].rolling(window=window_size).min()
            rolling_metrics[f'{feature}_max_rolling_{window_size}'] = data[feature].rolling(window=window_size).max()
            rolling_metrics[f'{feature}_skew_rolling_{window_size}'] = data[feature].rolling(window=window_size).skew()
            rolling_metrics[f'{feature}_ewm_{window_size}'] = data[feature].ewm(span=window_size).mean()

    return rolling_metrics

def calculate_autocorrelation(data, mw_prefix='MW_', lags=[1, 2, 3]):
    autocorr = pd.DataFrame(index=data.index)

    for lag in lags:
        for feature in data.filter(like=mw_prefix).columns:
            autocorr[f'{feature}_autocorr_lag_{lag}'] = data[feature].autocorr(lag=lag)

    return autocorr

def create_lagged_variables(data, mw_prefix='MW_', lag_lengths=[1, 2, 3]):
    lagged_data = pd.DataFrame(index=data.index)

    for lag_length in lag_lengths:
        for feature in data.filter(like=mw_prefix).columns:
            lagged_data[f'{feature}_lagged_{lag_length}'] = data[feature].shift(lag_length)

    return lagged_data

def calculate_seasonal_aggregations(data, mw_prefix='MW_'):
    seasonal_aggregations = pd.DataFrame(index=data.index)

    for feature in data.filter(like=mw_prefix).columns:
        seasonal_aggregations[f'{feature}_mean_monthly'] = data[feature].groupby(data.index.month).transform('mean')

    return seasonal_aggregations

def extract_time_based_features(data):
    time_features = pd.DataFrame(index=data.index)

    time_features['month'] = data.index.month
    time_features['year'] = data.index.year

    return time_features

def calculate_expanding_metrics(data, mw_prefix='MW_'):
    expanding_metrics = pd.DataFrame(index=data.index)

    for feature in data.filter(like=mw_prefix).columns:
        expanding_metrics[f'{feature}_expanding_mean'] = data[feature].expanding().mean()
        expanding_metrics[f'{feature}_expanding_std'] = data[feature].expanding().std()

    return expanding_metrics

def calculate_seasonal_decomposition(data, mw_prefix='MW_'):
    decomposition = pd.DataFrame(index=data.index)

    for feature in data.filter(like=mw_prefix).columns:
        result = seasonal_decompose(data[feature], model='additive')
        decomposition[f'{feature}_trend'] = result.trend
        decomposition[f'{feature}_seasonal'] = result.seasonal
        decomposition[f'{feature}_residual'] = result.resid

    return decomposition


# window_sizes = [i for i in range(1, 36)]
# lags = [i for i in range(1, 36)]

mw_prefix = 'MW_'  # Assuming your MW features have this prefix in their column names
rsi_prefix = 'RSI_'  # Assuming your RSI features have this prefix in their column names

rolling_metrics_mw = calculate_rolling_metrics(monthly_aggregation, mw_prefix=mw_prefix)
autocorr_mw = calculate_autocorrelation(monthly_aggregation, mw_prefix=mw_prefix)
lagged_data_mw = create_lagged_variables(monthly_aggregation, mw_prefix=mw_prefix)
seasonal_aggregations_mw = calculate_seasonal_aggregations(monthly_aggregation, mw_prefix=mw_prefix)
expanding_metrics_mw = calculate_expanding_metrics(monthly_aggregation, mw_prefix=mw_prefix)
decomposition = calculate_seasonal_decomposition(monthly_aggregation, mw_prefix=mw_prefix)

rolling_metrics_rsi = calculate_rolling_metrics(monthly_aggregation, mw_prefix=rsi_prefix)
autocorr_rsi = calculate_autocorrelation(monthly_aggregation, mw_prefix=rsi_prefix)
lagged_data_rsi = create_lagged_variables(monthly_aggregation, mw_prefix=rsi_prefix)
seasonal_aggregations_rsi = calculate_seasonal_aggregations(monthly_aggregation, mw_prefix=rsi_prefix)
expanding_metrics_rsi = calculate_expanding_metrics(monthly_aggregation, mw_prefix=rsi_prefix)


# For time-based features, you can directly call the function without prefixes
time_features = extract_time_based_features(monthly_aggregation)

# You can concatenate the results for MW and RSI features if needed
final_result = pd.concat([rolling_metrics_mw, autocorr_mw, lagged_data_mw, seasonal_aggregations_mw, expanding_metrics_mw, decomposition,
                          rolling_metrics_rsi, autocorr_rsi, lagged_data_rsi, seasonal_aggregations_rsi, expanding_metrics_rsi,
                          time_features], axis=1)

final_result


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Unnamed: 0,RSI_mean_mean_rolling_3,RSI_mean_std_rolling_3,RSI_mean_min_rolling_3,RSI_mean_max_rolling_3,RSI_mean_skew_rolling_3,RSI_mean_ewm_3,RSI_min_mean_rolling_3,RSI_min_std_rolling_3,RSI_min_min_rolling_3,RSI_min_max_rolling_3,...,RSI_ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ_max_expanding_mean,RSI_ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ_max_expanding_std,RSI_ΘΕΣΗ_mean_expanding_mean,RSI_ΘΕΣΗ_mean_expanding_std,RSI_ΘΕΣΗ_min_expanding_mean,RSI_ΘΕΣΗ_min_expanding_std,RSI_ΘΕΣΗ_max_expanding_mean,RSI_ΘΕΣΗ_max_expanding_std,month,year
1988-12-31,,,,,,0.614962,,,,,...,1.288492,,9.019446,,9.019446,,9.019446,,12,1988
1990-12-31,,,,,,0.683853,,,,,...,2.091105,1.135065,5.956582,4.331545,5.956582,4.331545,5.956582,4.331545,12,1990
1991-12-31,0.604076,0.120036,0.478968,0.718299,-0.404750,0.566776,0.482272,0.323472,0.113554,0.718299,...,4.400552,4.079807,5.436952,3.192364,4.397692,4.083081,6.977537,3.536692,12,1991
1992-12-31,0.649981,0.149096,0.478968,0.752677,-1.629025,0.665923,0.478968,0.321483,0.113554,0.718299,...,5.555276,4.053406,4.993697,2.753184,3.838142,3.516639,7.488014,3.062865,12,1992
1993-12-31,0.935240,0.569923,0.478968,1.574075,1.293568,1.134647,0.636617,0.539540,0.113554,1.191248,...,5.320395,3.549425,5.186274,2.422901,3.649257,3.074647,7.794300,2.739509,12,1993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-30,,,,,,1.425447,,,,,...,114.637306,513.442142,25.751692,107.683124,3.530059,13.421973,295.494708,1298.855886,10,2024
2024-11-30,,,,,,1.425447,,,,,...,114.637306,513.442142,25.751692,107.683124,3.530059,13.421973,295.494708,1298.855886,11,2024
2024-12-30,,,,,,1.425447,,,,,...,114.637306,513.442142,25.751692,107.683124,3.530059,13.421973,295.494708,1298.855886,12,2024
2025-01-31,,,,,,1.425447,,,,,...,114.637306,513.442142,25.751692,107.683124,3.530059,13.421973,295.494708,1298.855886,1,2025


In [54]:
final_result.columns.values

array(['RSI_mean_mean_rolling_3', 'RSI_mean_std_rolling_3',
       'RSI_mean_min_rolling_3', 'RSI_mean_max_rolling_3',
       'RSI_mean_skew_rolling_3', 'RSI_mean_ewm_3',
       'RSI_min_mean_rolling_3', 'RSI_min_std_rolling_3',
       'RSI_min_min_rolling_3', 'RSI_min_max_rolling_3',
       'RSI_min_skew_rolling_3', 'RSI_min_ewm_3',
       'RSI_max_mean_rolling_3', 'RSI_max_std_rolling_3',
       'RSI_max_min_rolling_3', 'RSI_max_max_rolling_3',
       'RSI_max_skew_rolling_3', 'RSI_max_ewm_3',
       'RSI_ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ_mean_mean_rolling_3',
       'RSI_ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ_mean_std_rolling_3',
       'RSI_ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ_mean_min_rolling_3',
       'RSI_ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ_mean_max_rolling_3',
       'RSI_ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ_mean_skew_rolling_3',
       'RSI_ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ_mean_ewm_3',
       'RSI_ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ_min_mean_rolling_3',
       'RSI_ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ_min_std_rolling_3',
       'RSI_ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ_min_min_rolling_3',
       'RSI_ΠΕΡΙΦΕΡΕΙ

In [38]:
import plotly.graph_objs as go

# Create a trace for each MW column
traces = []
for column in ['ΙΣΧΥΣ (MW)_count', 'ΙΣΧΥΣ (MW)_sum', 'ΙΣΧΥΣ (MW)_mean', 'ΙΣΧΥΣ (MW)_min', 'ΙΣΧΥΣ (MW)_max']:
    trace = go.Scatter(x=final_result.index, y=final_result[column], mode='lines', name=column)
    traces.append(trace)

# Create layout
layout = go.Layout(title='MW Time Series',
                   xaxis=dict(title='Date'),
                   yaxis=dict(title='MW'))

# Create figure
fig = go.Figure(data=traces, layout=layout)

# Plot figure
fig.show()


In [29]:
# df= all_ape_data_nodup.drop(columns=['ΕΤΑΙΡΕΙΑ','ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ', 'ΠΕΡΙΦΕΡΕΙΑ', 'ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ', 'ΔΗΜΟΣ ', 'ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ', 'ΘΕΣΗ', 'ΤΕΧΝΟΛΟΓΙΑ']).copy()

# # Convert date columns to datetime
# date_columns = ['ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ', 'ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ', 'ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ']


# # Extend the DataFrame to include future dates
# forecast_horizon = 360  # Number of days to forecast
# # Convert the DataFrame index to datetime
# df.index = pd.to_datetime(df['ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ'])

# df = df.drop(columns=["ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ"]).sort_values(by='ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ', ascending=True)

# # Now get the last_date
# last_date = df.index[-1]

# # Now this should work
# future_dates = [last_date + pd.DateOffset(days=x) for x in range(1, forecast_horizon + 1)]

# print(future_dates)
# # Create a DataFrame for future dates
# future_df = pd.DataFrame(index=future_dates, columns=df.columns)
# df = pd.concat([df, future_df])


# print(df.columns)

# # Example of different aggregation functions for different columns
# df = df.groupby(df.index).agg({
#     'ΑΙΤΗΣΗ': 'count',  # Counting the number of entries per date
#     'ΙΣΧΥΣ (MW)': ['sum', 'mean', 'max', 'min']  # Applying multiple aggregations to MW values
# })

# # # Flatten the column multi-index
# # # df.columns = ['_'.join(col).strip() for col in df_aggregated.columns.values]
# # df.rename(columns={
# #     'ΑΙΤΗΣΗ_count': 'count_ΑΙΤΗΣΗ',
# #     'ΙΣΧΥΣ (MW)_sum': 'total_ΙΣΧΥΣ (MW)',
# #     'ΙΣΧΥΣ (MW)_mean': 'mean_ΙΣΧΥΣ (MW)',
# #     'ΙΣΧΥΣ (MW)_max': 'max_ΙΣΧΥΣ (MW)',
# #     'ΙΣΧΥΣ (MW)_min': 'min_ΙΣΧΥΣ (MW)'
# # }, inplace=True)


# print(df.columns)

# # Extract date features
# for col in date_columns:
#     df[f'{col}_month'] = df[col].dt.month
#     df[f'{col}_day'] = df[col].dt.day
#     df[f'{col}_dayofweek'] = df[col].dt.dayofweek


# days_to_lag = 200

# for lag in range(0,days_to_lag):
#     # Create lag features for the target variable
#     df[f'ΙΣΧΥΣ (MW)_lag{lag}'] = df['ΙΣΧΥΣ (MW)'].shift(lag)

# # Compute rolling statistics for the target variable
# df['ΙΣΧΥΣ (MW)_rolling_mean'] = df['ΙΣΧΥΣ (MW)'].rolling(window=30).mean()
# df['ΙΣΧΥΣ (MW)_rolling_std'] = df['ΙΣΧΥΣ (MW)'].rolling(window=30).std()

# # Drop rows with NaN values created by lagging/rolling
# df = df.dropna()

# # Verify the DataFrame
# print(df.info())
# print(df.head())

In [30]:
# df

In [31]:
# import pandas as pd

# # Assuming 'all_ape_data_nodup' is your initial DataFrame
# df = all_ape_data_nodup.drop(columns=[
#     'ΕΤΑΙΡΕΙΑ', 'ΑΡ. ΜΗΤΡΩΟΥ ΑΔΕΙΩΝ ΡΑΕ', 'ΠΕΡΙΦΕΡΕΙΑ', 'ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ', 
#     'ΔΗΜΟΣ ', 'ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ', 'ΘΕΣΗ', 'ΤΕΧΝΟΛΟΓΙΑ', 'file'
# ]).copy()

# # Convert date columns to datetime
# date_columns = ['ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ ΑΙΤΗΣΗΣ', 'ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ', 'ΗΜΕΡΟΜΗΝΙΑ ΛΗΞΗΣ ΑΔ.ΠΑΡΑΓΩΓΗΣ']
# for col in date_columns:
#     df[col] = pd.to_datetime(df[col], errors='coerce')

# # Set the index to 'ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ' and sort
# df.set_index('ΗΜΕΡΟΜΗΝΙΑ ΕΚΔ. ΑΔ.ΠΑΡΑΓΩΓΗΣ', inplace=True)
# df.sort_index(inplace=True)

# # Extract date features before aggregation
# for col in date_columns:
#     df[f'{col}_month'] = df[col].dt.month
#     df[f'{col}_day'] = df[col].dt.day
#     df[f'{col}_dayofweek'] = df[col].dt.dayofweek

# # Aggregate the data by date
# df_aggregated = df.groupby(df.index).agg({
#     'ΑΙΤΗΣΗ': 'count',  # Counting the number of entries per date
#     'ΙΣΧΥΣ (MW)': ['sum', 'mean', 'max', 'min']  # Applying multiple aggregations to MW values
# })

# # Flatten the column multi-index
# df_aggregated.columns = ['_'.join(col).strip() for col in df_aggregated.columns.values]
# df_aggregated.rename(columns={
#     'ΑΙΤΗΣΗ_count': 'count_ΑΙΤΗΣΗ',
#     'ΙΣΧΥΣ (MW)_sum': 'total_ΙΣΧΥΣ (MW)',
#     'ΙΣΧΥΣ (MW)_mean': 'mean_ΙΣΧΥΣ (MW)',
#     'ΙΣΧΥΣ (MW)_max': 'max_ΙΣΧΥΣ (MW)',
#     'ΙΣΧΥΣ (MW)_min': 'min_ΙΣΧΥΣ (MW)'
# }, inplace=True)

# # Extend the DataFrame to include future dates
# forecast_horizon = 360  # Number of days to forecast
# last_date = df_aggregated.index[-1]
# future_dates = [last_date + pd.DateOffset(days=x) for x in range(1, forecast_horizon + 1)]
# future_df = pd.DataFrame(index=future_dates, columns=df_aggregated.columns)
# df_aggregated = pd.concat([df_aggregated, future_df])

# # Generate lag features
# days_to_lag = 200
# for lag in range(1, days_to_lag + 1):
#     df_aggregated[f'ΙΣΧΥΣ (MW)_lag{lag}'] = df_aggregated['total_ΙΣΧΥΣ (MW)'].shift(lag)

# # Compute rolling statistics
# df_aggregated['ΙΣΧΥΣ (MW)_rolling_mean'] = df_aggregated['total_ΙΣΧΥΣ (MW)'].rolling(window=30).mean()
# df_aggregated['ΙΣΧΥΣ (MW)_rolling_std'] = df_aggregated['total_ΙΣΧΥΣ (MW)'].rolling(window=30).std()

# # Drop rows with NaN values created by lagging/rolling
# df_aggregated.dropna(inplace=True)

# # Verify the DataFrame
# print(df_aggregated.info())
# print(df_aggregated.head())


In [32]:

# # Define the directory paths and column patterns
# directory_paths = ["../data/licenses/", "../data/contracts/"]
# column_patterns = ['ΑΙΤΗΣΗ', 'ΜΗΤΡΩΟ', 'ΗΜΕΡΟΜΗΝΙΑ ΥΠΟΒΟΛΗΣ.*', 'ΕΤΑΙΡΕΙΑ', 'ΗΜΕΡ.*ΕΚΔ.*ΠΑΡΑΓ.*', 'ΗΜΕΡ.*ΛΗΞΗ.*ΠΑΡΑΓ.*', 'ΠΕΡΙΦΕΡΕΙΑ', 'ΠΕΡΙΦΕΡΕΙΑΚΗ ΕΝΟΤΗΤΑ', 'ΔΗΜΟ.* ', 'ΔΗΜΟΤΙΚΗ ΕΝΟΤΗΤΑ', 'ΘΕΣΗ', '.*(MW)', 'ΤΕΧΝΟΛ.*'] 

# # Iterate over each directory path
# for directory_path in directory_paths:
#     # Print the current working directory
#     print(f"Current working directory: {os.getcwd()}")
#     # Get the directory of the current file
#     current_directory = os.path.dirname(os.path.abspath(__file__))
#     # Define the directory containing Excel files
#     directory = os.path.join(current_directory, directory_path)
#     file_paths = [os.path.join(directory, file) for file in os.listdir(directory) if file.endswith(".xlsx")]

# # Sort file paths based on their names to process them in order
# file_paths.sort()

# merged_df = merge_dataframes_with_previous_files(file_paths, column_patterns)
