In [2]:
import pandas as pd
import numpy as np

In [3]:
# Relative folder the yearly CSV exports are read from (prefix of every pd.read_csv path in this notebook).
RELATIVE_IN = "1. landing/2. csv"
# Relative folder the merged CSV files are written to (prefix of every to_csv path in this notebook).
RELATIVE_OUT = "2. raw"

### Functions

In [4]:
def symmetric_difference(list_1, list_2):
    """Return the items that occur in exactly one of the two iterables.

    The result is deterministic: items found only in ``list_1`` come first,
    followed by items found only in ``list_2``, each group in order of first
    appearance and without repeats. (Building the result from a set would
    make the order depend on hash ordering, which for strings can change
    between kernel restarts and makes displayed output non-reproducible.)

    Args:
        list_1: any iterable of hashable items (e.g. a list or ``df.columns``).
        list_2: any iterable of hashable items.

    Returns:
        list: the items present in one input but not the other.
    """
    members_1 = set(list_1)
    members_2 = set(list_2)

    # dict.fromkeys drops repeats while keeping first-appearance order
    only_in_1 = [item for item in dict.fromkeys(list_1) if item not in members_2]
    only_in_2 = [item for item in dict.fromkeys(list_2) if item not in members_1]

    return only_in_1 + only_in_2

In [5]:
def get_rename(df, rename_dict):
    """Return a renamed copy of ``df`` with normalised column names.

    Two passes are applied, in this order:
      1. columns listed in ``rename_dict`` are renamed to their mapped value
         (keys that match no column are ignored);
      2. every resulting column name is lower-cased and stripped of
         leading/trailing whitespace.

    The input frame is left untouched.

    Args:
        df: DataFrame whose column labels are strings.
        rename_dict: mapping of existing column name -> new column name.

    Returns:
        A new DataFrame with the renamed columns.
    """
    # pass 1: the explicit, per-sheet mapping
    explicitly_renamed = df.rename(columns=rename_dict)

    # pass 2: lower-case + strip whatever names we ended up with
    normalised_names = {
        label: label.lower().strip() for label in explicitly_renamed.columns
    }
    return explicitly_renamed.rename(columns=normalised_names)

## Offers

#### Renaming

In [16]:
# load each year's offers export; the first CSV column becomes the index
offers_2022_raw = pd.read_csv(f"{RELATIVE_IN}/offers 2022.csv", index_col=0)
offers_2023_raw = pd.read_csv(f"{RELATIVE_IN}/offers 2023.csv", index_col=0)

# report (rows, columns) for both years, one per line
print(
    f"2022 shape: {offers_2022_raw.shape}",
    f"2023 shape: {offers_2023_raw.shape}",
    sep="\n",
)

# optional check: columns that exist in only one of the two years
#symmetric_difference(offers_2022_raw.columns, offers_2023_raw.columns)

2022 shape: (443, 27)
2023 shape: (509, 28)


In [17]:
# Column-name mapping for the 2022 offers sheet, passed to get_rename.
# get_rename lower-cases and strips every name afterwards, so the casing of
# the values here (e.g. "UQ type") does not survive into the final frame.
rename_dict_2022 = {
    # GAMSAT section scores
    "GAMSAT S1 Score": "s1 score",
    "GAMSAT S2 Score": "s2 score",
    "GAMSAT S3 Score": "s3 score",

    # first set of Place Type / GPA / GAMSAT / Preference columns -> "offer uni ..."
    "Place Type": "offer uni place type",
    "GPA": "offer uni gpa",
    "GAMSAT": "offer uni gamsat",
    "Preference": "offer uni preference",

    # second set -> "interview uni ..."
    # NOTE(review): the ".1" suffix looks like pandas' de-duplication of repeated
    # CSV headers — confirm against the source file
    "Interviewing Uni": "interview uni",
    "GPA.1": "interview uni gpa",
    "GAMSAT.1": "interview uni gamsat",
    "Preference.1": "interview uni preference",

    "UQ Metro/RMP": "UQ type"
}

In [18]:
# apply the 2022 mapping, then tag every row with the year it came from
offers_2022 = get_rename(offers_2022_raw, rename_dict_2022).assign(year=2022)

In [19]:
# Column-name mapping for the 2023 offers sheet, passed to get_rename.
# get_rename lower-cases and strips every name afterwards, so e.g.
# "UQ RMP Tier" ends up as "uq rmp tier" in the final frame.
rename_dict_2023 = {
    # GAMSAT section scores
    "S1 Score": "s1 score",
    "S2 Score": "s2 score",
    # the key contains a newline character; NOTE(review): presumably the CSV header
    # carries a trailing line break — confirm against the source file
    "S3 Score\n": "s3 score",

    # first set of columns -> "offer uni ..."
    "Offering Uni": "offer uni",
    "Place Type": "offer uni place type",
    "GPA": "offer uni gpa",
    "GAMSAT": "offer uni gamsat",
    "Preference": "offer uni preference",

    # second set -> "interview uni ..."
    # NOTE(review): the ".1" suffix looks like pandas' de-duplication of repeated
    # CSV headers — confirm against the source file
    "Interviewing Uni": "interview uni",
    "GPA.1": "interview uni gpa",
    "GAMSAT.1": "interview uni gamsat",
    "Preference.1": "interview uni preference",

    "GEMSAS vs Other?": "gemsas over other?",
    "UQ MP/RMP Tier": "UQ RMP Tier",
    "Places?": "places selected",

    # "MMI ..." headers are renamed to "interview ..." names
    "MMI Opinion": "interview opinion",
    "MMI Prep (hrs)": "interview prep hours"
}

In [20]:
# apply the 2023 mapping, then tag every row with the year it came from
offers_2023 = get_rename(offers_2023_raw, rename_dict_2023).assign(year=2023)

#### Merging

In [21]:
# number of repeated column names per year (one count per line);
# renaming can map two source columns onto the same name, so check before concatenating
print(
    offers_2022.columns.duplicated().sum(),
    offers_2023.columns.duplicated().sum(),
    sep="\n",
)

0
0


In [22]:
# stack both years row-wise and renumber the index from 0
offers_raw = pd.concat(objs=[offers_2022, offers_2023], axis=0, ignore_index=True)

# quick inspection: overall shape, then per-column dtypes
print(offers_raw.shape, offers_raw.dtypes, sep="\n")

# first four rows
offers_raw.iloc[:4]

(952, 31)
timestamp                    object
rurality                     object
s1 score                      int64
s2 score                      int64
s3 score                      int64
uw gamsat                   float64
w gamsat                    float64
offer uni                    object
offer uni gpa               float64
offer uni gamsat            float64
offer uni place type         object
offer uni preference        float64
interviewed?                 object
interview uni                object
interview uni gpa           float64
interview uni gamsat        float64
places selected              object
interview uni preference    float64
deakin bonus                  int64
anu bonus                     int64
mq bonus (gpa)                int64
casper quartile              object
uq rmp tier                  object
uq type                      object
gemsas over other?           object
interview opinion            object
notes                        object
year              

Unnamed: 0,timestamp,rurality,s1 score,s2 score,s3 score,uw gamsat,w gamsat,offer uni,offer uni gpa,offer uni gamsat,...,casper quartile,uq rmp tier,uq type,gemsas over other?,interview opinion,notes,year,interview prep hours,other rejections,status
0,10/27/2022 10:11:29,Non-Rural,66,74,69,69.666667,69.5,Deakin University,5.693,69.5,...,4th,Tier 3,,Yes,Very well,\n,2022,,,
1,10/27/2022 10:11:54,Non-Rural,57,66,99,74.0,80.25,Deakin University,6.56,80.25,...,,Tier 3,,Yes,Unsure,\n,2022,,,
2,10/27/2022 10:12:51,Non-Rural,74,83,80,79.0,79.25,,,,...,,,,,Well,\n,2022,,,
3,10/27/2022 10:13:20,Non-Rural,62,68,74,68.0,69.5,,,,...,,,,,Poorly,\n,2022,,,


In [23]:
# persist the merged offers table; the row index is written as the first column
offers_raw.to_csv(path_or_buf=f"{RELATIVE_OUT}/offers.csv", index=True)

#### Checking

In [123]:
# last three rows, columns from position 15 onwards
offers_raw.iloc[-3:, 15:]

Unnamed: 0,interview uni gamsat,places selected,interview uni preference,deakin bonus,anu bonus,mq bonus (gpa),casper quartile,uq rmp tier,uq type,gemsas over other?,interview opinion,notes,year,interview prep hours,other rejections,status
949,72.0,CSP,1.0,4,0,0,2nd,Tier 1 (CQ-WB RMP),,,Very poorly,,2023,26-50,"N/A, UoW",
950,,"CSP, BMP",,0,0,0,,,,Yes,Well,,2023,101-250,,
951,80.33,"CSP, BMP, FFP",5.0,0,0,0,,,,Yes,Very well,,2023,26-50,,


In [124]:
# first four rows, columns from position 15 onwards
offers_raw.iloc[:4, 15:]

Unnamed: 0,interview uni gamsat,places selected,interview uni preference,deakin bonus,anu bonus,mq bonus (gpa),casper quartile,uq rmp tier,uq type,gemsas over other?,interview opinion,notes,year,interview prep hours,other rejections,status
0,69.5,All,1.0,2,2,0,4th,Tier 3,,Yes,Very well,\n,2022,,,
1,,,,0,0,0,,Tier 3,,Yes,Unsure,\n,2022,,,
2,79.0,All,5.0,2,2,0,,,,,Well,\n,2022,,,
3,69.5,CSP & BMP Only,1.0,2,2,0,,,,,Poorly,\n,2022,,,


## Interviews

#### Renaming

In [6]:
# load each year's interview export; the first CSV column becomes the index
interview_2022_raw = pd.read_csv(f"{RELATIVE_IN}/interview 2022.csv", index_col=0)
interview_2023_raw = pd.read_csv(f"{RELATIVE_IN}/interview 2023.csv", index_col=0)
interview_2024_raw = pd.read_csv(f"{RELATIVE_IN}/interview 2024.csv", index_col=0)

# report (rows, columns) for all three years, one per line
print(
    f"2022 shape: {interview_2022_raw.shape}",
    f"2023 shape: {interview_2023_raw.shape}",
    f"2024 shape: {interview_2024_raw.shape}",
    sep="\n",
)

# optional check: columns that exist in only one of two years, e.g.
#symmetric_difference(interview_2022_raw.columns, interview_2023_raw.columns)

2022 shape: (493, 34)
2023 shape: (511, 36)
2024 shape: (529, 41)


In [7]:
# get the uni names
def change_preference_names(dict, df):
    """Add "pref N uni" renames for the preference-university columns of ``df``.

    A column qualifies when its lower-cased name contains both "pref" and
    "uni". Qualifying columns are numbered 1, 2, 3, ... in the order they
    appear in ``df.columns`` and mapped to ``"pref <n> uni"``.

    Args:
        dict: rename mapping to extend. It is modified in place. (The
            parameter name shadows the ``dict`` builtin inside this function;
            it is kept so existing callers are unaffected.)
        df: object exposing a ``columns`` iterable of string labels.

    Returns:
        None. The result is the mutation of ``dict``.
    """
    # collect the qualifying columns, keeping their left-to-right order
    matching_columns = []
    for column in df.columns:
        lowered = column.lower()
        if "pref" in lowered and "uni" in lowered:
            matching_columns.append(column)

    # number them from 1 and register the new names on the caller's mapping
    for position, column in enumerate(matching_columns, start=1):
        dict[column] = f"pref {position} uni"

In [8]:
#interview_2022_raw.columns

In [9]:
# Column-name mapping for the 2022 interview sheet (later passed to get_rename).
rename_dict_2022_interview = {
    # GAMSAT section scores
    "GAMSAT S1 Score": "s1 score",
    "GAMSAT S2 Score": "s2 score",
    "GAMSAT S3 Score": "s3 score",

    # un-suffixed columns -> "interview uni ..."
    "Interviewing Uni": "interview uni",
    "GPA": "interview uni gpa",
    "GAMSAT": "interview uni gamsat",

    "MQ Bonus (GPA)": "MQ Bonus",

    # "GPA.1".."GPA.6" and "GAMSAT.1".."GAMSAT.6" -> "Pref <n> GPA" / "Pref <n> GAMSAT"
    **{
        f"{metric}.{rank}": f"Pref {rank} {metric}"
        for rank in range(1, 7)
        for metric in ("GPA", "GAMSAT")
    },
}

# add the "pref <n> uni" entries derived from the sheet's own column names
change_preference_names(rename_dict_2022_interview, interview_2022_raw)

In [10]:
#interview_2023_raw.columns

In [11]:
# Column-name mapping for the 2023 interview sheet (later passed to get_rename).
rename_dict_2023_interview = {
    # un-suffixed columns -> "interview uni ..."
    "Interviewing Uni": "interview uni",
    "GPA": "interview uni gpa",
    "GAMSAT": "interview uni gamsat",

    # The numbered "GPA.<n>" / "GAMSAT.<n>" headers are deliberately NOT listed
    # here: the update below maps all of them (including ".1") to "Pref <n> ...",
    # so a literal entry for them would simply be overwritten.
    # NOTE(review): unclear whether the interview sheet has a "Preference.1"
    # column at all; the entry is harmless if it does not — confirm.
    "Preference.1": "interview uni preference",

    "MQ Bonus (GPA)": "MQ Bonus",
    "UQ MP/RMP Tier": "UQ tier",
}

# "GPA.1".."GPA.6" and "GAMSAT.1".."GAMSAT.6" -> "Pref <n> GPA" / "Pref <n> GAMSAT"
rename_dict_2023_interview.update(
    {f"{metric}.{i}": f"Pref {i} {metric}" for i in range(1, 7) for metric in ["GPA", "GAMSAT"]}
)
# a header spelled "GAMSAT" + newline is also treated as the sixth-preference GAMSAT column
rename_dict_2023_interview.update(
    {"GAMSAT\n": "Pref 6 GAMSAT"}
)

# add the "pref <n> uni" entries derived from the sheet's own column names
change_preference_names(rename_dict_2023_interview, interview_2023_raw)

In [12]:
#interview_2024_raw.columns

In [13]:
# Column-name mapping for the 2024 interview sheet (later passed to get_rename).
rename_dict_2024_interview = {
    # GAMSAT section scores
    "S1": "s1 score",
    "S2": "s2 score",
    "S3": "s3 score",

    # un-suffixed columns -> "interview uni ..." (each key listed exactly once)
    "Interviewing Uni": "interview uni",
    "GPA": "interview uni gpa",
    "GAMSAT": "interview uni gamsat",

    # The numbered "GPA.<n>" / "GAMSAT.<n>" headers are deliberately NOT listed
    # here: the update below maps all of them (including ".1") to "Pref <n> ...",
    # so a literal entry for them would simply be overwritten.
    # NOTE(review): unclear whether the interview sheet has a "Preference.1"
    # column at all; the entry is harmless if it does not — confirm.
    "Preference.1": "interview uni preference",

    "ANU Bonuses": "ANU Bonus",
    "UQ Metro/RMP": "UQ type",
    "UQ MP/RMP Tier": "UQ tier",
    "Deakin RTS Tier": "Deakin tier",

    # the key contains a newline character, matching the header as it is read
    "Notes\n": "Notes"
}

# "GPA.1".."GPA.6" and "GAMSAT.1".."GAMSAT.6" -> "Pref <n> GPA" / "Pref <n> GAMSAT"
rename_dict_2024_interview.update(
    {f"{metric}.{i}": f"Pref {i} {metric}" for i in range(1, 7) for metric in ["GPA", "GAMSAT"]}
)

# add the "pref <n> uni" entries derived from the sheet's own column names
change_preference_names(rename_dict_2024_interview, interview_2024_raw)

In [14]:
# apply each year's mapping, then tag every row with the year it came from
interview_2022 = get_rename(interview_2022_raw, rename_dict_2022_interview).assign(year=2022)
interview_2023 = get_rename(interview_2023_raw, rename_dict_2023_interview).assign(year=2023)
interview_2024 = get_rename(interview_2024_raw, rename_dict_2024_interview).assign(year=2024)

In [15]:
# columns that exist in only one of the renamed 2023 / 2024 interview frames
symmetric_difference(interview_2023.columns, interview_2024.columns)

['location',
 'interview?',
 'unimelb gam',
 'unds bonuses',
 'deakin tier',
 'undf bonuses',
 'uow bonuses']

#### Merging

In [16]:
# number of repeated column names per year (one count per line);
# renaming can map two source columns onto the same name, so check before concatenating
print(
    interview_2022.columns.duplicated().sum(),
    interview_2023.columns.duplicated().sum(),
    interview_2024.columns.duplicated().sum(),
    sep="\n",
)

0
0
0


In [17]:
# stack all three years row-wise and renumber the index from 0
interview_raw = pd.concat(
    objs=[interview_2022, interview_2023, interview_2024],
    axis=0,
    ignore_index=True,
)

# quick inspection: overall shape, then per-column dtypes
print(interview_raw.shape, interview_raw.dtypes, sep="\n")

# first four rows
interview_raw.iloc[:4]

(1533, 44)
timestamp                object
rurality                 object
s1 score                  int64
s2 score                  int64
s3 score                  int64
uw gamsat               float64
w gamsat                float64
deakin bonus              int64
anu bonus                 int64
mq bonus                  int64
notes                    object
interview?               object
interview uni            object
interview uni gpa       float64
interview uni gamsat    float64
pref 1 uni               object
pref 1 gpa              float64
pref 1 gamsat           float64
pref 2 uni               object
pref 2 gpa              float64
pref 2 gamsat           float64
pref 3 uni               object
pref 3 gpa              float64
pref 3 gamsat           float64
pref 4 uni               object
pref 4 gpa              float64
pref 4 gamsat           float64
pref 5 uni               object
pref 5 gpa              float64
pref 5 gamsat           float64
pref 6 uni               obje

Unnamed: 0,timestamp,rurality,s1 score,s2 score,s3 score,uw gamsat,w gamsat,deakin bonus,anu bonus,mq bonus,...,year,uq tier,casper quartile,outlier,deakin tier,unimelb gam,undf bonuses,unds bonuses,location,uow bonuses
0,8/30/2022 18:18:49,Non-Rural,68,88,78,78.0,78.0,0,0,0,...,2022,,,,,,,,,
1,8/30/2022 18:53:15,Non-Rural,67,77,78,74.0,75.0,2,4,0,...,2022,,,,,,,,,
2,8/30/2022 18:53:52,Non-Rural,67,78,93,79.333333,82.75,0,0,0,...,2022,,,,,,,,,
3,8/30/2022 20:09:20,Non-Rural,53,76,84,71.0,74.25,0,0,0,...,2022,,,,,,,,,


In [18]:
# persist the merged interview table; the row index is written as the first column
interview_raw.to_csv(path_or_buf=f"{RELATIVE_OUT}/interview.csv", index=True)