In [1]:
import os
import pandas as pd

def combine_files(file1, file2, combined_file_path):
    try:
        # Try reading the files as Excel first
        try:
            df1 = pd.read_excel(file1, engine='xlrd')
        except Exception as e:
            print(f"Error reading {file1} as Excel: {e}")
            # Attempt to read as HTML if it's not an Excel file
            try:
                df1 = pd.read_html(file1)[0]
            except Exception as e:
                # If it's not HTML, attempt to read as CSV (in case it's CSV or tab-separated)
                try:
                    df1 = pd.read_csv(file1, low_memory=False)  # Handling mixed types by disabling low memory
                except Exception as e:
                    print(f"Failed to read {file1} as CSV or HTML: {e}")
                    return
        
        try:
            df2 = pd.read_excel(file2, engine='xlrd')
        except Exception as e:
            print(f"Error reading {file2} as Excel: {e}")
            # Attempt to read as HTML if it's not an Excel file
            try:
                df2 = pd.read_html(file2)[0]
            except Exception as e:
                # If it's not HTML, attempt to read as CSV
                try:
                    df2 = pd.read_csv(file2, low_memory=False)  # Handling mixed types by disabling low memory
                except Exception as e:
                    print(f"Failed to read {file2} as CSV or HTML: {e}")
                    return

        # Combine them by concatenating rows
        df_combined = pd.concat([df1, df2], ignore_index=True)
        
        # Check if it's in CSV or HTML format, then save accordingly
        if isinstance(df_combined, pd.DataFrame):
            df_combined.to_csv(combined_file_path, index=False)  # Save as CSV
            print(f"Combined file saved as {combined_file_path}")
        else:
            print(f"Unsupported format for combined saving")
            
    except Exception as e:
        print(f"Failed to combine {file1} and {file2}: {e}")

def convert_and_combine_files(folder_path):
    # List to store file paths of the Excel files to be combined
    files_to_combine = []
    
    # Walk through the directory to find .xls, .xlsx, .csv, or .html files
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.xls') or file.endswith('.xlsx') or file.endswith('.csv') or file.endswith('.html'):
                # Define the full path to the file
                file_path = os.path.join(root, file)
                
                # Store the file path for later combination
                files_to_combine.append(file_path)

    # If there are at least two files, combine them
    if len(files_to_combine) >= 2:
        files_to_combine.sort()  # Sort the list of files (optional, can be based on date or name)
        file1 = files_to_combine[0]  # First file
        file2 = files_to_combine[1]  # Second file
        
        # Define the output file path for the combined file
        combined_file = os.path.join(folder_path, '07012025.csv')  # Saving as CSV by default
        
        # Combine the two files
        combine_files(file1, file2, combined_file)
    else:
        print("Not enough files found to combine.")

# Use a raw string for the folder path
folder_path = r'F:\DATA TEAM\Process NSE\Cumulative File'  # Use your folder path
convert_and_combine_files(folder_path)


Error reading F:\DATA TEAM\Process NSE\Cumulative File\890_07012025_110207763525548783_HARMIT.xls as Excel: Unsupported format, or corrupt file: Expected BOF record; found b'\r\n\r\n<TAB'
Error reading F:\DATA TEAM\Process NSE\Cumulative File\890_07012025_11110350099493241353_HARMIT.xls as Excel: Unsupported format, or corrupt file: Expected BOF record; found b'\r\n\r\n<TAB'
Combined file saved as F:\DATA TEAM\Process NSE\Cumulative File\07012025.csv


In [1]:
import os
import pandas as pd

def combine_files(file_list, combined_file_path):
    try:
        # List to store dataframes
        df_list = []
        
        # Iterate over all files in the list and read them
        for file in file_list:
            try:
                # Try reading the file as Excel first
                df = pd.read_excel(file, engine='xlrd')
            except Exception as e:
                print(f"Error reading {file} as Excel: {e}")
                # Attempt to read as HTML if it's not an Excel file
                try:
                    df = pd.read_html(file)[0]
                except Exception as e:
                    # If it's not HTML, attempt to read as CSV
                    try:
                        df = pd.read_csv(file, low_memory=False)  # Handling mixed types by disabling low memory
                    except Exception as e:
                        print(f"Failed to read {file} as CSV or HTML: {e}")
                        continue  # Skip this file if it can't be read

            # Add the dataframe to the list
            df_list.append(df)
        
        # Combine all dataframes in the list by concatenating rows
        if df_list:
            df_combined = pd.concat(df_list, ignore_index=True)
            
            # Check if it's a DataFrame, then save accordingly
            df_combined.to_csv(combined_file_path, index=False)  # Save as CSV
            print(f"Combined file saved as {combined_file_path}")
        else:
            print("No valid files were read to combine.")
            
    except Exception as e:
        print(f"Failed to combine files: {e}")

def convert_and_combine_files(folder_path):
    # List to store file paths of the Excel files to be combined
    files_to_combine = []
    
    # Walk through the directory to find .xls, .xlsx, .csv, or .html files
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.xls') or file.endswith('.xlsx') or file.endswith('.csv') or file.endswith('.html'):
                # Define the full path to the file
                file_path = os.path.join(root, file)
                
                # Store the file path for later combination
                files_to_combine.append(file_path)

    # If there are at least four files, combine them
    if len(files_to_combine) >= 4:
        files_to_combine.sort()  # Sort the list of files (optional, can be based on date or name)
        
        # Define the output file path for the combined file
        combined_file = os.path.join(folder_path, '17012025.csv')  # Saving as CSV by default
        
        # Combine the four files
        combine_files(files_to_combine[:4], combined_file)  # Use the first 4 files
    else:
        print("Not enough files found to combine.")

# Use a raw string for the folder path
folder_path = r'F:\DATA TEAM\Process NSE\Cumulative File'  # Use your folder path
convert_and_combine_files(folder_path)


Error reading F:\DATA TEAM\Process NSE\Cumulative File\1) 890_18012025_11163236512775145218_HARMIT.xls as Excel: Unsupported format, or corrupt file: Expected BOF record; found b'\r\n\r\n<TAB'
Error reading F:\DATA TEAM\Process NSE\Cumulative File\3) 890_18012025_1121019161214465838_HARMIT.xls as Excel: Unsupported format, or corrupt file: Expected BOF record; found b'\r\n\r\n<TAB'
Error reading F:\DATA TEAM\Process NSE\Cumulative File\890_18012025_11272062840443677583_HARMIT.xls as Excel: Unsupported format, or corrupt file: Expected BOF record; found b'\r\n\r\n<TAB'
Error reading F:\DATA TEAM\Process NSE\Cumulative File\890_18012025_11314321936082863319_HARMIT.xls as Excel: Unsupported format, or corrupt file: Expected BOF record; found b'\r\n\r\n<TAB'
Combined file saved as F:\DATA TEAM\Process NSE\Cumulative File\17012025.csv
