In [6]:
import pandas as pd

# Define the path to the Excel file
file_path = r"D:\study-permits-analysis\study_permits_by_country_2015_2025.xlsx"


def flatten_multiindex(col):
    """Collapse one multi-level column label into a single string.

    Parameters
    ----------
    col : tuple or object
        A column label. Tuples (as produced by a pandas MultiIndex) are
        flattened; any other value is returned unchanged.

    Returns
    -------
    str or object
        For a tuple, the string forms of its levels joined with ' - ',
        skipping every level whose string form contains 'Unnamed' (the
        placeholder pandas generates for header cells that are empty).
        For a non-tuple, ``col`` itself.
    """
    if isinstance(col, tuple):
        return ' - '.join(str(x) for x in col if 'Unnamed' not in str(x))
    return col


# Only the read itself is guarded, so that the "while reading the file"
# message below is never printed for an unrelated processing error.
try:
    # Read the Excel file with specific parameters:
    # skiprows=2: Skip the first 2 rows. These rows likely contain introductory
    #            information that is not part of the main data structure.
    # skipfooter=7: Skip the last 7 rows (footer). These rows probably contain
    #             summary statistics or notes not needed for the analysis.
    # header=[0, 1, 2]: Use the next three rows as a three-level column header
    #                  (Year, Quarter, and Month/Total).
    df = pd.read_excel(file_path, skiprows=2, skipfooter=7, header=[0, 1, 2])
except FileNotFoundError:
    # Handle the error if the specified file path does not exist
    print(f"Error: File '{file_path}' not found. Please check the path.")
except Exception as e:
    # Handle other potential errors that might occur during file reading
    print(f"An error occurred while reading the file: {e}")
else:
    # Flatten the three header levels into simple string column names.
    # The first column's label has 'Unnamed' placeholders at levels 1 and 2,
    # so it flattens directly to 'Country of Citizenship'; no separate
    # rename step is needed.
    df.columns = df.columns.map(flatten_multiindex)

    # NOTE(review): the data cells appear to contain '--' placeholders, which
    # would leave those columns as strings rather than numbers - confirm
    # whether they should be parsed as missing (e.g. na_values='--').

    # Display the cleaned column names to verify the transformation
    print("Cleaned Column Names:")
    print(df.columns)

    # Display the first few rows of the cleaned DataFrame to inspect the data structure
    print("\nFirst few rows of Cleaned DataFrame:")
    print(df.head())

Cleaned Column Names:
Index(['Country of Citizenship', '2015 - Q1 - Jan', '2015 - Q1 - Feb',
       '2015 - Q1 - Mar', '2015 - Q1 - Q1 Total', '2015 - Q2 - Apr',
       '2015 - Q2 - May', '2015 - Q2 - Jun', '2015 - Q2 - Q2 Total',
       '2015 - Q3 - Jul',
       ...
       '2024 - Q3 - Sep', '2024 - Q3 - Q3 Total', '2024 - Q4 - Oct',
       '2024 - Q4 - Nov', '2024 - Q4 - Dec', '2024 - Q4 - Q4 Total',
       '2024 - 2024 Total', '2025 - Q1 - Jan', '2025 - Q1 - Q1 Total',
       '2025 - 2025 Total'],
      dtype='object', length=174)

First few rows of Cleaned DataFrame:
  Country of Citizenship 2015 - Q1 - Jan 2015 - Q1 - Feb 2015 - Q1 - Mar  \
0            Afghanistan              10              --              --   
1                Albania              10               5              --   
2                Algeria              60              40              55   
3                Andorra              --               0               0   
4                 Angola              15  