note: 假设数值列从'Sex'之后开始

In [None]:
import pandas as pd

def extract_feature_tables_auto(excel_path, sheet_name=0, skiprows=6, feature_col='Feature', sex_col='Sex'):
    """Read one Excel sheet and reshape it to one row per (feature, sex, year).

    Every column to the right of both ``feature_col`` and ``sex_col`` is
    treated as a value column. The first half of the value columns is read as
    job counts and the second half as incomes; the two halves are paired by
    position, and the label of the job-count column becomes ``year``.
    With an odd number of value columns the last column is never paired.
    NOTE(review): this assumes the sheet lists the jobs columns first and the
    income columns second, in the same year order -- confirm against the
    workbook layout.

    Rows in which any value column is missing or non-numeric are dropped.

    Args:
        excel_path: Path of the workbook, passed to ``pandas.read_excel``.
        sheet_name: Sheet name or index, passed to ``pandas.read_excel``.
        skiprows: Leading rows to skip, passed to ``pandas.read_excel``.
        feature_col: Header of the column holding the feature label.
        sex_col: Header of the column holding the sex label.

    Returns:
        A DataFrame with the columns ``feature``, ``year``, ``gender``,
        ``number_of_jobs`` (source value divided by 1000) and
        ``median_income``. The columns are present even when no row
        survives the filtering.

    Raises:
        ValueError: If ``feature_col`` or ``sex_col`` is not a column of the
            sheet.
    """
    output_columns = ['feature', 'year', 'gender', 'number_of_jobs', 'median_income']
    df = pd.read_excel(excel_path, sheet_name=sheet_name, skiprows=skiprows)

    cols = df.columns.tolist()
    missing = [name for name in (feature_col, sex_col) if name not in cols]
    if missing:
        # Same exception type as list.index() would raise, but actionable.
        raise ValueError(
            f"Column(s) {missing} not found in sheet {sheet_name!r}; "
            f"available columns: {cols}"
        )

    # Value columns start right after whichever label column comes last.
    first_value_idx = max(cols.index(feature_col), cols.index(sex_col)) + 1
    value_cols = cols[first_value_idx:]
    half = len(value_cols) // 2
    jobs_cols = value_cols[:half]
    if not jobs_cols:
        # Nothing to pair: return an empty table that still has its columns,
        # so callers can filter on 'feature' without a KeyError.
        return pd.DataFrame(columns=output_columns)

    result_df = df[[feature_col, sex_col] + value_cols].dropna(how='all')

    # Build the "all value columns are numeric" mask once instead of
    # re-filtering the frame for every value column.
    numeric = result_df[value_cols].apply(pd.to_numeric, errors='coerce')
    result_df = result_df[numeric.notna().all(axis=1)]

    # Column order is [feature, sex, *value_cols], so each tuple unpacks
    # directly; zip() stops after `half` pairs, matching jobs_cols.
    records = [
        {
            'feature': feature,
            'year': year,
            'gender': sex,
            'number_of_jobs': float(jobs) / 1000,
            'median_income': income,
        }
        for feature, sex, *values in result_df.itertuples(index=False, name=None)
        for year, jobs, income in zip(jobs_cols, values[:half], values[half:])
    ]

    long_df = pd.DataFrame(records, columns=output_columns)
    if records:
        # Every surviving cell already passed the numeric check above, so
        # this only normalises numeric strings to real numbers.
        long_df['median_income'] = pd.to_numeric(long_df['median_income'])
    return long_df



In [25]:
def save_csv_reader(df, feature_name, output_dir="../../data/transformed"):
    """Write ``df`` to ``<output_dir>/<feature_name>_gender.csv`` and report it.

    The CSV is written without the index column. The directory is not
    created here; ``DataFrame.to_csv`` raises if it does not exist.

    Args:
        df: DataFrame to write.
        feature_name: Prefix of the output file name; also echoed in the
            printed confirmation.
        output_dir: Directory that receives the file. Defaults to the
            location this notebook has always written to.
    """
    # Local import so this cell stays runnable without touching the
    # notebook's import cell.
    from pathlib import Path

    filename = f"{feature_name}_gender.csv"
    df.to_csv(Path(output_dir) / filename, index=False)
    print(f"{feature_name} 已保存为 {filename}")


In [26]:
# Visa group by sex, read from the top-level sheet of the Table 17 workbook.
feature_name = 'Visa group'
table17_path = '../../abs_raw_data/Table 17 - Migrants, Jobs and employment income by sex, state and territory, industry, and visa group, 2017-18 to 2021-22.xlsx'
df_long = extract_feature_tables_auto(
    table17_path,
    sheet_name='Table 17',
    skiprows=6,
    feature_col=feature_name,
    sex_col='Sex',
)
# Keep only the individual categories; rows labelled 'Total' are dropped.
not_total = df_long['feature'].ne('Total')
df_long = df_long.loc[not_total].reset_index(drop=True)
save_csv_reader(df_long, feature_name)

Visa group 已保存为 Visa group_gender.csv


In [29]:
# Job duration by sex, read from the top-level sheet of the Table 20 workbook.
feature_name = 'Job duration'
table20_path = '../../abs_raw_data/Table 20 - Migrants, Jobs and employment income by arrival group, sex and job duration, 2017-18 to 2021-22.xlsx'
df_long = extract_feature_tables_auto(
    table20_path,
    sheet_name='Table 20',
    skiprows=6,
    feature_col=feature_name,
    sex_col='Sex',
)
# Keep only the individual categories; rows labelled 'Total' are dropped.
not_total = df_long['feature'].ne('Total')
df_long = df_long.loc[not_total].reset_index(drop=True)
save_csv_reader(df_long, feature_name)


Job duration 已保存为 Job duration_gender.csv


In [30]:
# Business size by sex, read from the top-level sheet of the Table 21 workbook.
feature_name = 'Business size'
table21_path = '../../abs_raw_data/Table 21 - Migrants, Jobs and employment income by arrival group, sex and business employment size, 2017-18 to 2021-22.xlsx'
df_long = extract_feature_tables_auto(
    table21_path,
    sheet_name='Table 21',
    skiprows=6,
    feature_col=feature_name,
    sex_col='Sex',
)
# Keep only the individual categories; rows labelled 'Total' are dropped.
not_total = df_long['feature'].ne('Total')
df_long = df_long.loc[not_total].reset_index(drop=True)
save_csv_reader(df_long, feature_name)


Business size 已保存为 Business size_gender.csv


In [37]:
# Arrival group by sex: sheets 17.1-17.3 of the Table 17 workbook are read
# with the same settings and each one is relabelled with its arrival group.
feature_name = 'Visa group'
table17_path = '../../abs_raw_data/Table 17 - Migrants, Jobs and employment income by sex, state and territory, industry, and visa group, 2017-18 to 2021-22.xlsx'
arrival_len = ['0-5 years', '6-10 years', '11+ years']
arrival_sheets = ['Table 17.1', 'Table 17.2', 'Table 17.3']

# NOTE(review): 'feature' is overwritten on every row of each sheet and no
# 'Total' filter is applied in this cell, so any 'Total' visa-group rows are
# relabelled alongside the individual visa groups -- confirm this is intended.
df_long_arrival1, df_long_arrival2, df_long_arrival3 = (
    extract_feature_tables_auto(
        table17_path,
        sheet_name=sheet,
        skiprows=6,
        feature_col=feature_name,
        sex_col='Sex',
    ).assign(feature=label)
    for sheet, label in zip(arrival_sheets, arrival_len)
)

arrival_frames = [df_long_arrival1, df_long_arrival2, df_long_arrival3]
df_long = pd.concat(arrival_frames)
save_csv_reader(df_long, "Arrival Group")

Arrival Group 已保存为 Arrival Group_gender.csv
