In [None]:
import pandas as pd

# Define the path to the Excel file
file_path = "study_permits_by_country_2015_2025.xlsx"


def flatten_multiindex(col):
    """Collapse one (possibly multi-level) column label into a single string.

    Tuple labels have their levels joined with ' - ', skipping every level
    whose text contains 'Unnamed' (the placeholder that appears for header
    cells carrying no information at that level). Any non-tuple label is
    returned unchanged.
    """
    if isinstance(col, tuple):
        return ' - '.join(str(x) for x in col if 'Unnamed' not in str(x))
    return col


try:
    # Read the Excel file with specific parameters:
    # skiprows=2: Skip the first 2 rows. These rows likely contain introductory information
    #            or headers that are not part of the main data structure.
    # skipfooter=7: Skip the last 7 rows (footer). These rows probably contain
    #             summary statistics, notes, or other information not needed for the analysis.
    # header=[0, 1, 2]: Use the next three rows as a three-level column header
    #                  (presumably Year, Quarter, and Month/Total).
    #
    # Only the read itself lives in the `try`, so the messages below are never
    # printed for a failure in the later clean-up steps.
    df = pd.read_excel(file_path, skiprows=2, skipfooter=7, header=[0, 1, 2])
except FileNotFoundError:
    # Handle the error if the specified file path does not exist
    print(f"Error: File '{file_path}' not found. Please check the path.")
except Exception as e:
    # Handle other potential errors that might occur during file reading
    print(f"An error occurred while reading the file: {e}")
else:
    # Flatten the three header levels into simple string column names
    df.columns = df.columns.map(flatten_multiindex)

    # Name the first column, which contains the country names. This is done by
    # position: after flattening, the 'Unnamed' levels are already gone, so a
    # rename keyed on a label containing 'Unnamed: 0_level_1' would never match.
    df.columns = ['Country of Citizenship', *df.columns[1:]]

    # Display the cleaned column names to verify the transformation
    print("Cleaned Column Names:")
    print(df.columns)

    # Display the first few rows of the cleaned DataFrame to inspect the data structure
    print("\nFirst few rows of Cleaned DataFrame:")
    print(df.head())

In [None]:
# Collect every column label that mentions 'Total' (the aggregate columns)
columns_to_drop = list(filter(lambda label: 'Total' in label, df.columns))

# Build a copy of the frame without those aggregate columns
df_cleaned = df.drop(columns_to_drop, axis='columns', errors='ignore')

# Show which columns are left
print("\nColumn names after removing 'Total' columns:", df_cleaned.columns, sep="\n")

# Preview the top of the reduced DataFrame
print(
    "\nFirst few rows of DataFrame after removing 'Total' columns:",
    df_cleaned.head(),
    sep="\n",
)

In [None]:
# Reshape from wide (one column per period) to long (one row per
# country/period pair), keeping the country column as the identifier.
df_long = df_cleaned.melt(
    id_vars=['Country of Citizenship'],
    var_name='Period',
    value_name='Number of Students',
)

# Preview the top of the reshaped DataFrame
long_preview = df_long.head()
print("\nFirst few rows of DataFrame in long format (df_long):")
print(long_preview)

# Report (rows, columns) of the reshaped DataFrame
print("\nShape of the long format DataFrame (df_long):", df_long.shape)

In [None]:
# Function to extract Year, Quarter, and Month from the 'Period' column
def extract_period_info(period_str):
    """Split a flattened period label into its year, quarter and month parts.

    The label is split on ' - '. Only labels with exactly three parts are
    interpreted:

    * part 1 becomes the year (as ``int``) when it consists solely of
      decimal digits, otherwise the year is ``None``;
    * part 2 becomes the quarter when it starts with 'Q', otherwise ``None``;
    * part 3 is returned as the month unchanged.

    Any other input (a different number of parts, or a non-string value such
    as ``None``/NaN) yields three missing values.

    Returns:
        A ``pd.Series`` holding ``[year, quarter, month]`` in that order.
    """
    year = None
    quarter = None
    month = None

    # Guard against non-string labels, which have no .split()
    if isinstance(period_str, str):
        parts = period_str.split(' - ')
        if len(parts) == 3:
            year_text, quarter_text, month = parts
            # isdecimal() rather than isdigit(): isdigit() also accepts
            # characters such as superscript digits that int() cannot parse.
            if year_text.isdecimal():
                year = int(year_text)
            if quarter_text.startswith('Q'):
                quarter = quarter_text

    return pd.Series([year, quarter, month])

# Apply the function to create new 'Year', 'Quarter', and 'Month' columns.
# extract_period_info returns a 3-element Series per row, so the result of
# apply() is a three-column frame that is assigned to the new columns in order.
df_long[['Year', 'Quarter', 'Month']] = df_long['Period'].apply(extract_period_info)

# Display the first few rows of df_long with the new columns
# (head() so the output matches the printed label instead of dumping the whole frame)
print("\nFirst few rows of df_long with 'Year', 'Quarter', 'Month' columns:")
print(df_long.head())

# Display the data types of the new columns
print("\nData types of 'Year', 'Quarter', 'Month' columns:")
print(df_long[['Year', 'Quarter', 'Month']].dtypes)