# Take the data from each file
# Delete the first 8 rows
# Delete every row whose column A value is not a date
# Make sure all dates are in the same format
# Order everything by date
# Keep only one header
# Goal: make sure there are no missing months


In [5]:
import os
import pandas as pd
from datetime import datetime
from Constants_for_smaller_files import stations_mapping

def clean_excel_files(directory, station_name):
    """Clean every ``.xlsx`` workbook in *directory* and merge them by date.

    For each workbook (Excel lock files starting with ``~$`` are ignored):

    * the first 8 data rows are discarded;
    * cells equal to 8888 or 9999 are replaced by missing values
      (presumably these are missing-data codes -- confirm with the data
      provider);
    * columns B..J are coerced to numbers and linearly interpolated;
    * column A is parsed as a ``%d-%m-%Y`` date and rows that do not parse
      are dropped;
    * column A is renamed to ``Date``, columns B..J get the names
      ``Tn``, ``Tx``, ``Tavg``, ``RH_avg``, ``RR``, ``ss``, ``ff_x``,
      ``ddd_x``, ``ff_avg`` suffixed with ``(station_name)``, and column K
      is removed.

    Workbooks that cannot be read, or that have fewer than 11 columns, are
    skipped with a printed message instead of aborting the whole run.

    Args:
        directory: Folder that contains the station's ``.xlsx`` files.
        station_name: Label embedded in the output column names.

    Returns:
        A DataFrame sorted by ``Date`` with one row per date and a fresh
        0..n-1 index, or ``None`` when no workbook could be used. When the
        same date occurs more than once, the row from the alphabetically
        first file (and the earliest row within that file) is kept.

    Raises:
        OSError: If *directory* cannot be listed.
    """
    # Names for spreadsheet columns B..J, in positional order.
    variable_names = ('Tn', 'Tx', 'Tavg', 'RH_avg', 'RR', 'ss',
                      'ff_x', 'ddd_x', 'ff_avg')
    # Date column + the nine variables + column K (which is discarded).
    required_columns = len(variable_names) + 2

    all_data = []
    # Sorted so that the processing order (and therefore which duplicate
    # date wins below) does not depend on the operating system.
    files = sorted(
        f for f in os.listdir(directory)
        if f.endswith('.xlsx') and not f.startswith('~$')
    )

    for file in files:
        file_path = os.path.join(directory, file)
        try:
            df = pd.read_excel(file_path, engine='openpyxl')
        except Exception as e:
            print(f"Skipping file {file} due to error: {e}")
            continue

        # The positional renaming below needs all 11 columns; without this
        # guard a short sheet raised IndexError and stopped every station.
        if df.shape[1] < required_columns:
            print(f"Skipping file {file} due to error: expected at least "
                  f"{required_columns} columns, found {df.shape[1]}")
            continue

        # Remove rows 1-8
        df = df.iloc[8:]

        # Replace the 8888 / 9999 placeholder values with missing values.
        df = df.mask((df == 8888) | (df == 9999), pd.NA)

        # Columns B to J (0-based positions 1 to 9) hold the numeric data.
        columns_to_convert = df.columns[1:10]
        df[columns_to_convert] = df[columns_to_convert].apply(pd.to_numeric, errors='coerce')
        df[columns_to_convert] = df[columns_to_convert].interpolate(axis=0)

        # Parse column A as a date; anything that is not a date becomes NaT
        # and the row is dropped.
        date_column = df.columns[0]
        df[date_column] = pd.to_datetime(df[date_column], format='%d-%m-%Y', errors='coerce')
        df = df.dropna(subset=[date_column])

        # Discard column K before the frame is stored, rather than mutating
        # it in place after it has already been appended.
        df = df.drop(columns=[df.columns[10]])

        new_names = ['Date'] + [f'{name}({station_name})' for name in variable_names]
        df = df.rename(columns=dict(zip(df.columns, new_names)))

        all_data.append(df)

    if not all_data:
        print("No valid Excel files found.")
        return None

    # Every stored frame already has a 'Date' column, so no extra check is
    # needed before sorting.
    merged_df = pd.concat(all_data, ignore_index=True)

    # A stable sort keeps rows with equal dates in file order, which makes
    # the duplicate that survives deterministic.
    merged_df = merged_df.sort_values(by='Date', kind='stable')
    merged_df = merged_df.drop_duplicates(subset=['Date'])

    # Reset last so the returned index has no gaps left by removed rows.
    return merged_df.reset_index(drop=True)


def main():
    """Clean every station in ``stations_mapping`` and save one workbook each.

    ``stations_mapping`` is iterated as ``station_name -> directory``. For
    each entry the directory is cleaned with ``clean_excel_files`` and, when
    a result is returned, written to ``cleaned_data_<station_name>.xlsx`` in
    the current working directory (without the index column).

    Errors are reported per station so that a problem with one station does
    not prevent the remaining stations from being processed.
    """
    for station_name, directory in stations_mapping.items():
        try:
            merged_data = clean_excel_files(directory, station_name)
            if merged_data is not None:
                # Save the merged data to a new Excel file
                merged_data.to_excel(f'cleaned_data_{station_name}.xlsx', index=False)
        except Exception as e:
            # Name the station so the failing input can be identified.
            print(f"An error occurred for station {station_name}: {e}")


if __name__ == "__main__":
    main()


In [6]:
import pandas as pd
import os

# Merge every per-station workbook in `directory` into one table keyed on
# 'Date', fill the gaps, and write the result plus two NA reports.

# Directory containing the Excel files
directory = './'

# Workbooks written by this cell. They are saved in the working directory,
# which is also what gets scanned by default, so a second run would
# otherwise read its own previous output back in and merge it.
output_files = {
    os.path.abspath(name)
    for name in ('All_Files_Merged.xlsx', 'na_positions.xlsx', 'na_summary.xlsx')
}

# List to hold dataframes
dfs = []

# Loop through the files in sorted order so the column order of the merged
# table is the same on every run.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith((".xlsx", ".xls")):
        continue
    if filename.startswith("~$"):
        # Skip temporary files created by Excel
        continue
    file_path = os.path.join(directory, filename)
    if os.path.abspath(file_path) in output_files:
        continue
    try:
        # openpyxl only reads .xlsx; for legacy .xls let pandas choose the
        # engine (a missing reader is reported by the handler below).
        engine = 'openpyxl' if filename.endswith(".xlsx") else None
        df = pd.read_excel(file_path, engine=engine)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        continue
    # Only workbooks with a 'Date' column can take part in the merge.
    if 'Date' in df.columns:
        dfs.append(df)
    else:
        print(f"Skipping {file_path}: 'Date' column not found")

if dfs:
    # Merge dataframes on the 'Date' column, keeping dates that appear in
    # any of the inputs.
    merged_df = dfs[0]
    for df in dfs[1:]:
        merged_df = pd.merge(merged_df, df, on='Date', how='outer')

    # Sort the merged dataframe by date
    merged_df = merged_df.sort_values(by='Date')

    # Fill missing values using forward fill and then backward fill
    # (DataFrame.fillna(method=...) is deprecated in recent pandas).
    merged_df = merged_df.ffill().bfill()

    # NOTE(review): the NA report is computed after filling, so it only
    # shows gaps that ffill/bfill could not close (e.g. a column that is
    # empty throughout) -- confirm whether the pre-fill gaps were intended.
    na_positions = merged_df.isna()

    # Save the positions of the NAs (True indicates an NA position)
    na_positions.to_excel('na_positions.xlsx', index=False)

    # Save the summary of NAs (count of NAs per column)
    na_summary = merged_df.isna().sum()
    na_summary.to_excel('na_summary.xlsx', header=["Count"])

    # Save the merged dataframe with filled values to a new Excel file
    merged_df.to_excel('All_Files_Merged.xlsx', index=False)

    print("NA positions saved to 'na_positions.xlsx'")
    print("NA summary saved to 'na_summary.xlsx'")
    # Report the file name that is actually written above.
    print("Merged file with filled values saved to 'All_Files_Merged.xlsx'")
else:
    print("No valid Excel files found in the directory.")


NA positions saved to 'na_positions.xlsx'
NA summary saved to 'na_summary.xlsx'
Merged file with filled values saved to 'merged_file_filled.xlsx'
