In [18]:
import pandas as pd
import os
import re

def extract_currency_tables(file_path):
    # Read the first sheet of the Excel file
    df = pd.read_excel(file_path, sheet_name=0, header=None)
    # Limit to columns A:J (indices 0 to 9)
    df = df.iloc[:, :10]
    
    # Initialize variables
    num_rows, num_cols = df.shape
    processing = False
    currency_tables = {}
    current_currency = None
    start_row = None
    terms = None
    term_indices = None

    for idx in range(num_rows):
        row = df.iloc[idx]
        row_values = row.fillna('').astype(str).str.strip().tolist()
        row_lower = [s.lower() for s in row_values]
        row_string = ' '.join(row_lower)

        # Stop processing if 'Накопительный счет' is found
        if 'накопительный счет' in row_string:
            print(f"'Накопительный счет' found at row {idx}. Stopping processing.")
            break

        # Check for 'Безопциональный' to start processing
        if 'безопциональный' in row_string:
            processing = True
            current_currency = None
            start_row = None
            continue

        if not processing:
            continue

        # Check for currency
        currency_match = None
        for cell in row_lower:
            m = re.match(r'фл\s*(\w+)', cell)
            if m:
                currency_match = m
                break
        if currency_match:
            current_currency = currency_match.group(1).capitalize()
            print(f"Currency '{current_currency}' found at row {idx}.")
            start_row = None
            terms = None
            term_indices = None
            continue

        if current_currency:
            # Look for 'Срок' row to get terms
            if 'срок' in row_lower:
                term_indices = [i for i, cell in enumerate(row_lower) if cell not in ['', 'срок']]
                terms = [row_values[i] for i in term_indices]
                print(f"Terms found for currency '{current_currency}' at row {idx}: {terms}")
                # Record the start row of the data (next row)
                start_row = idx + 1
                continue

            # Check for 'Среднее', 'ТС на', or empty row to end processing of current table
            if 'среднее' in row_string or 'тс на' in row_string or not any(row_values):
                if start_row is not None and terms:
                    end_row = idx
                    # Extract the table for current currency
                    currency_df = df.iloc[start_row:end_row, :]
                    # Store the DataFrame and terms in the dictionary
                    currency_tables[current_currency] = {
                        'terms': terms,
                        'term_indices': term_indices,
                        'data': currency_df.reset_index(drop=True)
                    }
                    print(f"Extracted data for currency '{current_currency}' from row {start_row} to {end_row}.")
                    # Reset for next currency
                    current_currency = None
                    start_row = None
                    terms = None
                    term_indices = None
                continue

    return currency_tables

def standardize_term(term):
    """Standardize term to days."""
    term = term.lower().replace(',', '.').strip()
    if 'мес' in term:
        number = re.findall(r'\d+\.?\d*', term)
        if number:
            return str(int(float(number[0]) * 30))
    elif 'год' in term or 'лет' in term:
        number = re.findall(r'\d+\.?\d*', term)
        if number:
            return str(int(float(number[0]) * 365))
    else:
        number = re.findall(r'\d+', term)
        if number:
            return number[0]
    return term  # Return the original term if it can't be standardized


def process_currency_data(currency_tables, date):
    all_data = []
    for currency, info in currency_tables.items():
        terms = [standardize_term(term) for term in info['terms']]
        term_indices = info['term_indices']
        data_df = info['data']
        num_rows, num_cols = data_df.shape

        for idx in range(num_rows):
            row = data_df.iloc[idx]
            row_values = row.fillna('').astype(str).str.strip().tolist()
            # Adjusted index for bank name
            bank_name = row_values[2]  # Bank names are in column index 2
            if bank_name == '':
                continue  # Skip rows without bank names
            # Clean bank name
            bank_name = re.sub(r'\s*\(.*?\)', '', bank_name).strip()
            rates = []
            for i in term_indices:
                # Ensure index is within bounds
                if i >= len(row_values):
                    rates.append(None)
                    continue
                rate_cell = row_values[i].strip()
                rate_clean = re.sub(r'[^\d.,%]', '', rate_cell).replace(',', '.')
                # Handle rates like '19%12', take numeric part before '%'
                if '%' in rate_clean:
                    rate_clean = rate_clean.split('%')[0]
                try:
                    rate_value = float(rate_clean)
                    # Scale rates if they are percentages greater than 1
                    if rate_value > 1:
                        rate_value /= 100
                    rates.append(rate_value)
                except ValueError:
                    rates.append(None)

            # Add data for each term
            for term, rate in zip(terms, rates):
                if rate is not None:
                    all_data.append({
                        'Date': date,
                        'Bank': bank_name,
                        'Currency': currency,
                        'Term': term,
                        'Rate': rate
                    })
    return pd.DataFrame(all_data)


def extract_deposit_rates_from_file(file_path):
    # Extract date from the filename
    filename = os.path.basename(file_path)
    date_match = re.search(r'\d{2}\.\d{2}\.\d{4}', filename)
    date = date_match.group(0) if date_match else 'Date not found'
    
    # Extract currency tables
    currency_tables = extract_currency_tables(file_path)
    
    # Process currency data to get the final DataFrame
    result_df = process_currency_data(currency_tables, date)
    return result_df

# Example usage
file_path = '/Users/mark/Desktop/Новая папка/примерный вид 14.09.2022.xlsx'
final_data = extract_deposit_rates_from_file(file_path)
print(final_data)


Currency 'Рубли' found at row 2.
Terms found for currency 'Рубли' at row 3: ['31', '91', '181', '365', '548', '730', '1100']
Extracted data for currency 'Рубли' from row 4 to 7.
Currency 'Доллары' found at row 11.
Terms found for currency 'Доллары' at row 12: ['31', '91', '181', '365', '548', '730', '1100']
Extracted data for currency 'Доллары' from row 13 to 16.
Currency 'Юань' found at row 27.
Terms found for currency 'Юань' at row 28: ['31', '3 мес.', '6 мес', '1 год', '1,5 года', '2 года', '3 года']
Extracted data for currency 'Юань' from row 29 to 32.
Currency 'Евро' found at row 36.
Terms found for currency 'Евро' at row 37: ['31', '3 мес.', '6 мес', '1 год', '1,5 года', '2 года', '3 года']
Extracted data for currency 'Евро' from row 38 to 41.
'Накопительный счет' found at row 48. Stopping processing.
          Date    Bank Currency  Term  Rate
0   14.09.2022  Банк 1    Рубли    31  0.21
1   14.09.2022  Банк 1    Рубли    91  0.21
2   14.09.2022  Банк 1    Рубли   181  0.21
3   1

In [19]:
final_data

Unnamed: 0,Date,Bank,Currency,Term,Rate
0,14.09.2022,Банк 1,Рубли,31,0.21
1,14.09.2022,Банк 1,Рубли,91,0.21
2,14.09.2022,Банк 1,Рубли,181,0.21
3,14.09.2022,Банк 1,Рубли,365,0.21
4,14.09.2022,Банк 1,Рубли,548,0.21
5,14.09.2022,Банк 1,Рубли,730,0.21
6,14.09.2022,Банк 1,Рубли,1100,0.21
7,14.09.2022,Банк 2,Рубли,91,0.19
8,14.09.2022,Банк 2,Рубли,181,0.19
9,14.09.2022,Банк 2,Рубли,365,0.19


In [22]:
import pandas as pd
import os
import re
import pickle

def extract_currency_tables(file_path):
    """
    Extracts currency tables from the Excel file.
    """
    # Read the first sheet of the Excel file
    df = pd.read_excel(file_path, sheet_name=0, header=None)
    # Limit to columns A:J (indices 0 to 9)
    df = df.iloc[:, :10]
    
    # Initialize variables
    num_rows, num_cols = df.shape
    processing = False
    currency_tables = {}
    current_currency = None
    start_row = None
    terms = None
    term_indices = None

    for idx in range(num_rows):
        row = df.iloc[idx]
        row_values = row.fillna('').astype(str).str.strip().tolist()
        row_lower = [s.lower() for s in row_values]
        row_string = ' '.join(row_lower)

        # Stop processing if 'Накопительный счет' is found
        if 'накопительный счет' in row_string:
            print(f"'Накопительный счет' found at row {idx}. Stopping processing.")
            break

        # Check for 'Безопциональный' to start processing
        if 'безопциональный' in row_string:
            processing = True
            current_currency = None
            start_row = None
            continue

        if not processing:
            continue

        # Check for currency
        currency_match = None
        for cell in row_lower:
            m = re.match(r'фл\s*(\w+)', cell)
            if m:
                currency_match = m
                break
        if currency_match:
            current_currency = currency_match.group(1).capitalize()
            print(f"Currency '{current_currency}' found at row {idx}.")
            start_row = None
            terms = None
            term_indices = None
            continue

        if current_currency:
            # Look for 'Срок' row to get terms
            if 'срок' in row_lower:
                term_indices = [i for i, cell in enumerate(row_lower) if cell not in ['', 'срок']]
                terms = [row_values[i] for i in term_indices]
                print(f"Terms found for currency '{current_currency}' at row {idx}: {terms}")
                # Record the start row of the data (next row)
                start_row = idx + 1
                continue

            # Check for 'Среднее', 'ТС на', or empty row to end processing of current table
            if 'среднее' in row_string or 'тс на' in row_string or not any(row_values):
                if start_row is not None and terms:
                    end_row = idx
                    # Extract the table for current currency
                    currency_df = df.iloc[start_row:end_row, :]
                    # Store the DataFrame and terms in the dictionary
                    currency_tables[current_currency] = {
                        'terms': terms,
                        'term_indices': term_indices,
                        'data': currency_df.reset_index(drop=True)
                    }
                    print(f"Extracted data for currency '{current_currency}' from row {start_row} to {end_row}.")
                    # Reset for next currency
                    current_currency = None
                    start_row = None
                    terms = None
                    term_indices = None
                continue

    return currency_tables

def standardize_term(term):
    """
    Standardizes terms to days.
    """
    term = term.lower().replace(',', '.').strip()
    if 'мес' in term:
        number = re.findall(r'\d+\.?\d*', term)
        if number:
            return str(int(float(number[0]) * 30))
    elif 'год' in term or 'лет' in term:
        number = re.findall(r'\d+\.?\d*', term)
        if number:
            return str(int(float(number[0]) * 365))
    else:
        number = re.findall(r'\d+', term)
        if number:
            return number[0]
    return term  # Return the original term if it can't be standardized

def process_currency_data(currency_tables, date):
    """
    Processes currency tables to extract 'Bank', 'Term', and 'Rate'.
    """
    all_data = []
    for currency, info in currency_tables.items():
        terms = [standardize_term(term) for term in info['terms']]
        term_indices = info['term_indices']
        data_df = info['data']
        num_rows, num_cols = data_df.shape

        for idx in range(num_rows):
            row = data_df.iloc[idx]
            row_values = row.fillna('').astype(str).str.strip().tolist()
            # Find the bank name in the first few columns (e.g., indices 0 to 2)
            bank_name = ''
            for bank_idx in range(0, 3):
                if bank_idx < len(row_values) and row_values[bank_idx]:
                    bank_name = row_values[bank_idx]
                    break
            if bank_name == '':
                continue  # Skip rows without bank names
            # Clean bank name
            bank_name = re.sub(r'\s*\(.*?\)', '', bank_name).strip()
            rates = []
            for i in term_indices:
                if i >= len(row_values):
                    rates.append(None)
                    continue
                rate_cell = row_values[i].strip()
                rate_clean = re.sub(r'[^\d.,%]', '', rate_cell).replace(',', '.')
                if '%' in rate_clean:
                    rate_clean = rate_clean.split('%')[0]
                try:
                    rate_value = float(rate_clean)
                    if rate_value > 1:
                        rate_value /= 100
                    rates.append(rate_value)
                except ValueError:
                    rates.append(None)

            # Add data for each term
            for term, rate in zip(terms, rates):
                if rate is not None:
                    all_data.append({
                        'Date': date,
                        'Bank': bank_name,
                        'Currency': currency,
                        'Term': term,
                        'Rate': rate
                    })
    return pd.DataFrame(all_data)

def extract_deposit_rates_from_file(file_path):
    """
    Extracts deposit rates from a single file.
    """
    # Extract date from the filename
    filename = os.path.basename(file_path)
    date_match = re.search(r'\d{2}\.\d{2}\.\d{4}', filename)
    if date_match:
        date = date_match.group(0)
    else:
        print(f"Date not found in filename {filename}. Skipping file.")
        return pd.DataFrame()
    
    # Extract currency tables
    try:
        currency_tables = extract_currency_tables(file_path)
    except Exception as e:
        print(f"Error processing file {filename}: {e}")
        return pd.DataFrame()
    
    # Process currency data to get the final DataFrame
    result_df = process_currency_data(currency_tables, date)
    return result_df

def process_folder(folder_path):
    """
    Processes all Excel files in a folder, allowing you to review and accept/reject data from each file.
    Returns a DataFrame with the collected data.
    """
    all_data = []
    # Iterate over all Excel files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.xlsx') or filename.endswith('.xls'):
            file_path = os.path.join(folder_path, filename)
            print(f"\nProcessing file: {filename}")
            # Extract data from the file
            data = extract_deposit_rates_from_file(file_path)
            if data.empty:
                print(f"No data extracted from {filename} or an error occurred.")
                continue
            # Display the extracted data for review
            print(f"Extracted data from {filename}:")
            print(data)
            # Prompt user to accept or reject the data
            while True:
                user_input = input(f"Do you want to include data from {filename}? (y/n): ").strip().lower()
                if user_input == 'y':
                    all_data.append(data)
                    print(f"Data from {filename} accepted.")
                    break
                elif user_input == 'n':
                    print(f"Data from {filename} rejected.")
                    break
                else:
                    print("Invalid input. Please enter 'y' or 'n'.")
    # Concatenate all accepted data
    if all_data:
        result_df = pd.concat(all_data, ignore_index=True)
    else:
        result_df = pd.DataFrame()
    return result_df

def save_data(data_df, excel_file_path, dict_file_path):
    """
    Saves the data DataFrame to an Excel file and as a pickle file.
    """
    # Save DataFrame to Excel
    data_df.to_excel(excel_file_path, index=False)
    print(f"Data saved to Excel file: {excel_file_path}")
    # Save DataFrame as a dictionary to a pickle file
    data_dict = data_df.to_dict('records')
    with open(dict_file_path, 'wb') as f:
        pickle.dump(data_dict, f)
    print(f"Data saved as dictionary to file: {dict_file_path}")

def load_data(dict_file_path):
    """
    Loads data from a pickle file and returns it as a DataFrame.
    """
    with open(dict_file_path, 'rb') as f:
        data_dict = pickle.load(f)
    data_df = pd.DataFrame(data_dict)
    return data_df

# Example usage:
if __name__ == "__main__":
    folder_path = '/Users/mark/Desktop/Новая папка'  # Replace with your folder path
    # Process all files in the folder
    collected_data = process_folder(folder_path)
    # Save the data
    if not collected_data.empty:
        excel_file_path = '/Users/mark/Desktop/Новая папка 2/collected_data.xlsx'  # Replace with your desired Excel file path
        dict_file_path = '/Users/mark/Desktop/Новая папка 2/collected_data.pkl'    # Replace with your desired pickle file path
        save_data(collected_data, excel_file_path, dict_file_path)
    else:
        print("No data was collected.")

    # Later, to add new data from new files:
    # # Load existing data
    # existing_data_df = load_data(dict_file_path)
    # # Process new files (ensure that only new files are in the folder or adjust the code to handle only new files)
    # new_data_df = process_folder(folder_path)
    # # Combine data
    # combined_data_df = pd.concat([existing_data_df, new_data_df], ignore_index=True)
    # # Save updated data
    # save_data(combined_data_df, excel_file_path, dict_file_path)



Processing file: ~$23.09.2024.xlsx
Error processing file ~$23.09.2024.xlsx: Excel file format cannot be determined, you must specify an engine manually.
No data extracted from ~$23.09.2024.xlsx or an error occurred.

Processing file: примерный вид 14.09.2023.xlsx
Currency 'Рубли' found at row 2.
Terms found for currency 'Рубли' at row 3: ['31', '91', '181', '365', '548', '730', '1100']
Extracted data for currency 'Рубли' from row 4 to 7.
Currency 'Доллары' found at row 11.
Terms found for currency 'Доллары' at row 12: ['31', '91', '181', '365', '548', '730', '1100']
Extracted data for currency 'Доллары' from row 13 to 16.
Currency 'Юань' found at row 27.
Terms found for currency 'Юань' at row 28: ['31', '3 мес.', '6 мес', '1 год', '1,5 года', '2 года', '3 года']
Extracted data for currency 'Юань' from row 29 to 32.
Currency 'Евро' found at row 36.
Terms found for currency 'Евро' at row 37: ['31', '3 мес.', '6 мес', '1 год', '1,5 года', '2 года', '3 года']
Extracted data for currency 

Do you want to include data from примерный вид 14.09.2023.xlsx? (y/n):  y


Data from примерный вид 14.09.2023.xlsx accepted.

Processing file: примерный вид 14.09.2022.xlsx
Currency 'Рубли' found at row 2.
Terms found for currency 'Рубли' at row 3: ['31', '91', '181', '365', '548', '730', '1100']
Extracted data for currency 'Рубли' from row 4 to 7.
Currency 'Доллары' found at row 11.
Terms found for currency 'Доллары' at row 12: ['31', '91', '181', '365', '548', '730', '1100']
Extracted data for currency 'Доллары' from row 13 to 16.
Currency 'Юань' found at row 27.
Terms found for currency 'Юань' at row 28: ['31', '3 мес.', '6 мес', '1 год', '1,5 года', '2 года', '3 года']
Extracted data for currency 'Юань' from row 29 to 32.
Currency 'Евро' found at row 36.
Terms found for currency 'Евро' at row 37: ['31', '3 мес.', '6 мес', '1 год', '1,5 года', '2 года', '3 года']
Extracted data for currency 'Евро' from row 38 to 41.
'Накопительный счет' found at row 48. Stopping processing.
Extracted data from примерный вид 14.09.2022.xlsx:
          Date    Bank Curren

Do you want to include data from примерный вид 14.09.2022.xlsx? (y/n):  y


Data from примерный вид 14.09.2022.xlsx accepted.

Processing file: 23.09.2024.xlsx
Currency 'Рубли' found at row 2.
Terms found for currency 'Рубли' at row 3: ['31', '91', '181', '365', '548', '730', '1100']
Extracted data for currency 'Рубли' from row 4 to 7.
Currency 'Доллары' found at row 11.
Terms found for currency 'Доллары' at row 12: ['31', '91', '181', '365', '548', '730', '1100']
Extracted data for currency 'Доллары' from row 13 to 16.
Currency 'Юань' found at row 27.
Terms found for currency 'Юань' at row 28: ['31', '3 мес.', '6 мес', '1 год', '1,5 года', '2 года', '3 года']
Extracted data for currency 'Юань' from row 29 to 32.
Currency 'Евро' found at row 36.
Terms found for currency 'Евро' at row 37: ['31', '3 мес.', '6 мес', '1 год', '1,5 года', '2 года', '3 года']
Extracted data for currency 'Евро' from row 38 to 41.
'Накопительный счет' found at row 48. Stopping processing.
Extracted data from 23.09.2024.xlsx:
          Date    Bank Currency  Term     Rate
0   23.09.20

Do you want to include data from 23.09.2024.xlsx? (y/n):  y


Data from 23.09.2024.xlsx accepted.
Data saved to Excel file: /Users/mark/Desktop/Новая папка 2/collected_data.xlsx
Data saved as dictionary to file: /Users/mark/Desktop/Новая папка 2/collected_data.pkl


In [23]:
for_new_files_folder_path = '/Users/mark/Desktop/Новая папка — копия'
existing_data_df = load_data(dict_file_path)
# Process new files (ensure that only new files are in the folder or adjust the code to handle only new files)
new_data_df = process_folder(for_new_files_folder_path)
# Combine data
combined_data_df = pd.concat([existing_data_df, new_data_df], ignore_index=True)
# Save updated data
save_data(combined_data_df, excel_file_path, dict_file_path)



Processing file: ~$23.09.2024.xlsx
Error processing file ~$23.09.2024.xlsx: Excel file format cannot be determined, you must specify an engine manually.
No data extracted from ~$23.09.2024.xlsx or an error occurred.

Processing file: 23.09.2019.xlsx
Currency 'Рубли' found at row 2.
Terms found for currency 'Рубли' at row 3: ['31', '91', '181', '365', '548', '730', '1100']
Extracted data for currency 'Рубли' from row 4 to 7.
Currency 'Доллары' found at row 11.
Terms found for currency 'Доллары' at row 12: ['31', '91', '181', '365', '548', '730', '1100']
Extracted data for currency 'Доллары' from row 13 to 16.
Currency 'Юань' found at row 27.
Terms found for currency 'Юань' at row 28: ['31', '3 мес.', '6 мес', '1 год', '1,5 года', '2 года', '3 года']
Extracted data for currency 'Юань' from row 29 to 32.
Currency 'Евро' found at row 36.
Terms found for currency 'Евро' at row 37: ['31', '3 мес.', '6 мес', '1 год', '1,5 года', '2 года', '3 года']
Extracted data for currency 'Евро' from row

Do you want to include data from 23.09.2019.xlsx? (y/n):  y


Data from 23.09.2019.xlsx accepted.
Data saved to Excel file: /Users/mark/Desktop/Новая папка 2/collected_data.xlsx
Data saved as dictionary to file: /Users/mark/Desktop/Новая папка 2/collected_data.pkl


In [None]:
import pandas as pd
import os
import re
import pickle
from datetime import datetime

def extract_currency_tables(file_path):
    """
    Extracts currency tables from the Excel file.
    """
    # Read the first sheet of the Excel file
    df = pd.read_excel(file_path, sheet_name=0, header=None)
    # Limit to columns A:J (indices 0 to 9)
    df = df.iloc[:, :10]
    
    # Initialize variables
    num_rows, num_cols = df.shape
    processing = False
    currency_tables = {}
    current_currency = None
    start_row = None
    terms = None
    term_indices = None

    for idx in range(num_rows):
        row = df.iloc[idx]
        row_values = row.fillna('').astype(str).str.strip().tolist()
        row_lower = [s.lower() for s in row_values]
        row_string = ' '.join(row_lower)

        # Stop processing if 'Накопительный счет' is found
        if 'накопительный счет' in row_string:
            print(f"'Накопительный счет' found at row {idx}. Stopping processing.")
            break

        # Check for 'Безопциональный' to start processing
        if 'безопциональный' in row_string:
            processing = True
            current_currency = None
            start_row = None
            continue

        if not processing:
            continue

        # Check for currency
        currency_match = None
        for cell in row_lower:
            m = re.match(r'фл\s*(\w+)', cell)
            if m:
                currency_match = m
                break
        if currency_match:
            current_currency = currency_match.group(1).capitalize()
            print(f"Currency '{current_currency}' found at row {idx}.")
            start_row = None
            terms = None
            term_indices = None
            continue

        if current_currency:
            # Look for 'Срок' row to get terms
            if 'срок' in row_lower:
                term_indices = [i for i, cell in enumerate(row_lower) if cell not in ['', 'срок']]
                terms = [row_values[i] for i in term_indices]
                print(f"Terms found for currency '{current_currency}' at row {idx}: {terms}")
                # Record the start row of the data (next row)
                start_row = idx + 1
                continue

            # Check for 'Среднее', 'ТС на', or empty row to end processing of current table
            if 'среднее' in row_string or 'тс на' in row_string or not any(row_values):
                if start_row is not None and terms:
                    end_row = idx
                    # Extract the table for current currency
                    currency_df = df.iloc[start_row:end_row, :]
                    # Store the DataFrame and terms in the dictionary
                    currency_tables[current_currency] = {
                        'terms': terms,
                        'term_indices': term_indices,
                        'data': currency_df.reset_index(drop=True)
                    }
                    print(f"Extracted data for currency '{current_currency}' from row {start_row} to {end_row}.")
                    # Reset for next currency
                    current_currency = None
                    start_row = None
                    terms = None
                    term_indices = None
                continue

    return currency_tables

def standardize_term(term):
    """
    Standardizes terms to days.
    """
    term = term.lower().replace(',', '.').strip()
    if 'мес' in term:
        number = re.findall(r'\d+\.?\d*', term)
        if number:
            return str(int(float(number[0]) * 30))
    elif 'год' in term or 'лет' in term:
        number = re.findall(r'\d+\.?\d*', term)
        if number:
            return str(int(float(number[0]) * 365))
    else:
        number = re.findall(r'\d+', term)
        if number:
            return number[0]
    return term  # Return the original term if it can't be standardized

def is_bank_name(cell_value):
    """
    Determines if a cell value is likely a bank name.
    """
    cell_value_lower = cell_value.lower()
    # Define a pattern for bank names, e.g., starting with 'банк'
    if re.match(r'банк', cell_value_lower):
        return True
    # Add other conditions if necessary
    return False

def process_currency_data(currency_tables, date):
    """
    Processes currency tables to extract 'Bank', 'Term', and 'Rate'.
    """
    all_data = []
    for currency, info in currency_tables.items():
        terms = [standardize_term(term) for term in info['terms']]
        term_indices = info['term_indices']
        data_df = info['data']
        num_rows, num_cols = data_df.shape

        # Attempt to identify bank name column
        bank_name_column = None

        # Check the first few rows and columns for bank names
        for idx in range(num_rows):
            row = data_df.iloc[idx]
            row_values = row.fillna('').astype(str).str.strip().tolist()
            for bank_idx in range(0, 5):  # Adjust the range as needed
                if bank_idx < len(row_values):
                    cell_value = row_values[bank_idx]
                    if cell_value and is_bank_name(cell_value):
                        bank_name_column = bank_idx
                        break
            if bank_name_column is not None:
                break

        # If bank name column not found, default to column 0
        if bank_name_column is None:
            bank_name_column = 0

        for idx in range(num_rows):
            row = data_df.iloc[idx]
            row_values = row.fillna('').astype(str).str.strip().tolist()
            if bank_name_column < len(row_values):
                bank_name = row_values[bank_name_column]
            else:
                bank_name = ''
            if bank_name == '':
                continue  # Skip rows without bank names
            # Clean bank name
            bank_name = re.sub(r'\s*\(.*?\)', '', bank_name).strip()
            rates = []
            for i in term_indices:
                if i >= len(row_values):
                    rates.append(None)
                    continue
                rate_cell = row_values[i].strip()
                rate_clean = re.sub(r'[^\d.,%]', '', rate_cell).replace(',', '.')
                if '%' in rate_clean:
                    rate_clean = rate_clean.split('%')[0]
                try:
                    rate_value = float(rate_clean)
                    if rate_value > 1:
                        rate_value /= 100
                    rates.append(rate_value)
                except ValueError:
                    rates.append(None)

            # Add data for each term
            for term, rate in zip(terms, rates):
                if rate is not None:
                    all_data.append({
                        'Date': date,
                        'Bank': bank_name,
                        'Currency': currency,
                        'Term': term,
                        'Rate': rate
                    })
    return pd.DataFrame(all_data)

def extract_deposit_rates_from_file(file_path):
    """
    Extracts deposit rates from a single file.
    """
    # Extract date from the filename
    filename = os.path.basename(file_path)
    date_match = re.search(r'\d{2}\.\d{2}\.\d{2,4}', filename)
    if date_match:
        date_str = date_match.group(0)
        # Parse date, trying both '%d.%m.%Y' and '%d.%m.%y'
        try:
            date_obj = datetime.strptime(date_str, '%d.%m.%Y')
        except ValueError:
            try:
                date_obj = datetime.strptime(date_str, '%d.%m.%y')
            except ValueError:
                print(f"Date format not recognized in filename {filename}. Skipping file.")
                return pd.DataFrame()
        date = date_obj.strftime('%d.%m.%Y')  # Format date as 'DD.MM.YYYY'
    else:
        print(f"Date not found in filename {filename}. Skipping file.")
        return pd.DataFrame()
    
    # Extract currency tables
    try:
        currency_tables = extract_currency_tables(file_path)
    except Exception as e:
        print(f"Error processing file {filename}: {e}")
        return pd.DataFrame()
    
    # Process currency data to get the final DataFrame
    result_df = process_currency_data(currency_tables, date)
    return result_df

def process_folder(folder_path):
    """
    Processes all Excel files in a folder, allowing you to review and accept/reject data from each file.
    Returns a DataFrame with the collected data.
    """
    all_data = []
    # Iterate over all Excel files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.xlsx') or filename.endswith('.xls'):
            file_path = os.path.join(folder_path, filename)
            print(f"\nProcessing file: {filename}")
            # Extract data from the file
            data = extract_deposit_rates_from_file(file_path)
            if data.empty:
                print(f"No data extracted from {filename} or an error occurred.")
                continue
            # Display the extracted data for review
            print(f"Extracted data from {filename}:")
            print(data)
            # Prompt user to accept or reject the data
            while True:
                user_input = input(f"Do you want to include data from {filename}? (y/n): ").strip().lower()
                if user_input == 'y':
                    all_data.append(data)
                    print(f"Data from {filename} accepted.")
                    break
                elif user_input == 'n':
                    print(f"Data from {filename} rejected.")
                    break
                else:
                    print("Invalid input. Please enter 'y' or 'n'.")
    # Concatenate all accepted data
    if all_data:
        result_df = pd.concat(all_data, ignore_index=True)
    else:
        result_df = pd.DataFrame()
    return result_df

def save_data(data_df, excel_file_path, dict_file_path):
    """
    Saves the data DataFrame to an Excel file and as a pickle file.
    """
    # Save DataFrame to Excel
    data_df.to_excel(excel_file_path, index=False)
    print(f"Data saved to Excel file: {excel_file_path}")
    # Save DataFrame as a dictionary to a pickle file
    data_dict = data_df.to_dict('records')
    with open(dict_file_path, 'wb') as f:
        pickle.dump(data_dict, f)
    print(f"Data saved as dictionary to file: {dict_file_path}")

def load_data(dict_file_path):
    """
    Loads data from a pickle file and returns it as a DataFrame.
    """
    with open(dict_file_path, 'rb') as f:
        data_dict = pickle.load(f)
    data_df = pd.DataFrame(data_dict)
    return data_df

# Example usage:
if __name__ == "__main__":
    folder_path = '/path/to/your/folder'  # Replace with your folder path
    # Process all files in the folder
    collected_data = process_folder(folder_path)
    # Save the data
    if not collected_data.empty:
        excel_file_path = '/path/to/save/collected_data.xlsx'  # Replace with your desired Excel file path
        dict_file_path = '/path/to/save/collected_data.pkl'    # Replace with your desired pickle file path
        save_data(collected_data, excel_file_path, dict_file_path)
    else:
        print("No data was collected.")

    # Later, to add new data from new files:
    # Load existing data
    existing_data_df = load_data(dict_file_path)
    # Process new files (ensure that only new files are in the folder or adjust the code to handle only new files)
    new_data_df = process_folder(folder_path)
    # Combine data
    combined_data_df = pd.concat([existing_data_df, new_data_df], ignore_index=True)
    # Save updated data
    save_data(combined_data_df, excel_file_path, dict_file_path)
