In [330]:
import pandas as pd
import numpy as np

In [345]:
RELATIVE_IN = "2. raw"
RELATIVE_OUT = "3. curated"

## Offers

In [362]:
offers_raw = pd.read_csv(f"{RELATIVE_IN}/offers.csv", index_col=0)
print(offers_raw.shape)
#print(offers_raw.dtypes)

(952, 31)


### Fixing Values

In [363]:
offers_raw["offer uni"].value_counts()

offer uni
The University of Melbourne                        163
Deakin University                                  104
Griffith University                                 84
The University of Western Australia                 61
The University of Notre Dame Sydney                 54
The University of Notre Dame Fremantle              51
Australian National University                      32
The University of Queensland (CQ-WB RMP)            30
The University of Queensland                        29
The University of Wollongong                        27
Macquarie University                                27
The University of Queensland (Greater Brisbane)     14
The University of Queensland (DD MP)                12
Name: count, dtype: int64

In [364]:
UQ_NAME = "The University of Queensland"
RMP_ENDINGS = ["(DD MP)", "(CQ-WB RMP)"]
METRO_ENDING = "(Greater Brisbane)"

# fixing the queensland column
offers = offers_raw.copy()

# get the RMP types
offers.loc[offers["uq type"] == "RMP", "offer uni"] = UQ_NAME + " (RMP)"

# change the MD and WB to correct types
offers.loc[offers["offer uni"].isin([f"{UQ_NAME} {x}" for x in RMP_ENDINGS ]), "offer uni"] = UQ_NAME + " (RMP)"
offers.loc[offers["interview uni"].isin([f"{UQ_NAME} {x}" for x in RMP_ENDINGS ]), "interview uni"] = UQ_NAME + " (RMP)"

# change greater brisbance
metro_mask_offer = (offers["offer uni"] == UQ_NAME) | (offers["offer uni"] == f"{UQ_NAME} {METRO_ENDING}")
metro_mask_interview = (offers["interview uni"] == UQ_NAME) | (offers["interview uni"] == f"{UQ_NAME} {METRO_ENDING}")
offers.loc[metro_mask_offer, "offer uni"] = f"{UQ_NAME} (Metro)"
offers.loc[metro_mask_interview, "interview uni"] = f"{UQ_NAME} (Metro)"

# dropping the type
offers.drop(columns="uq type", inplace=True)

# fixing the interview column
offers.loc[(offers["year"] == 2023) & (offers["interview uni"].isna()), "interviewed?"] = "Yes"
offers.loc[(offers["year"] == 2023) & (offers["interview uni"].notna() & (offers["offer uni"].notna())), "interviewed?"] = "No"

# make the notes lowe case
offers.loc[:, "notes"] = offers["notes"].str.lower()

# drop un necessary columns
offers.drop(columns=["status", "timestamp"], inplace=True)

In [365]:
offers["offer uni"].value_counts()

offer uni
The University of Melbourne               163
Deakin University                         104
Griffith University                        84
The University of Queensland (RMP)         61
The University of Western Australia        61
The University of Notre Dame Sydney        54
The University of Notre Dame Fremantle     51
Australian National University             32
The University of Wollongong               27
Macquarie University                       27
The University of Queensland (Metro)       24
Name: count, dtype: int64

### Getting the marker type column

In [366]:
print(offers[(offers["offer uni place type"].isna()) & (offers["offer uni"].isna())].shape)  # offer uni and offer uni type notna
print(offers[(offers["offer uni"].isna())].shape)                                            # offer uni notna
print(offers[(offers["offer uni place type"].isna())].shape)                                 # offer uni type notna

print(offers[(offers["offer uni"].isna()) & (offers["places selected"].isna())].shape)       # offer uni na with places selected also na

(264, 28)
(264, 28)
(264, 28)
(0, 28)


In [367]:
offers["places selected"].value_counts()

places selected
CSP, BMP          327
CSP, BMP, FFP     119
All                94
CSP & BMP Only     88
CSP                47
CSP Only           29
FFP                 5
BMP                 5
CSP, FFP            5
BMP, FFP            1
Name: count, dtype: int64

In [368]:
# fixing the places selected column with dictionary
CSP = "CSP"
BMP = "BMP"
FFP = "FFP"

rename_places_selected = {
    "CSP, BMP": [CSP, BMP],
    "CSP & BMP Only": [CSP, BMP],

    "CSP, BMP, FFP": [CSP, BMP, FFP],
    "All": [CSP, BMP, FFP],

    "CSP": [CSP],
    "CSP Only": [CSP],
    "CSP, FFP": [CSP, FFP],
    
    "BMP": [BMP],
    "BMP, FFP": [BMP, FFP],

    "FFP": [FFP],
}

offers["places selected"] = offers["places selected"].apply(lambda x: rename_places_selected[x] 
                                                            if (not pd.isna(x)) and (rename_places_selected.get(x)) else x)

offers["places selected"].value_counts()

places selected
[CSP, BMP]         415
[CSP, BMP, FFP]    213
[CSP]               76
[FFP]                5
[BMP]                5
[CSP, FFP]           5
[BMP, FFP]           1
Name: count, dtype: int64

In [369]:
offers['offer uni place type'].value_counts()

offer uni place type
CSP    491
BMP    131
FFP     66
Name: count, dtype: int64

In [370]:
# checks how many times someone gets rejected and the places they selected aren't recorded
offers[offers["places selected"].isna()]["interview uni"].notna().sum()

0

In [371]:
offers["marker"] = offers["offer uni place type"]
offers.loc[offers["marker"].isna(), "marker"] = offers["places selected"]
#offers.loc[offers["marker"].isna(), "marker"] = "Unknown"
offers["marker"].isna().sum()

#offers["marker"].value_counts()

0

### Convert to String

In [372]:
STRING_COLUMNS = [
    'rurality', 'offer uni preference', 'interview uni preference', 'deakin bonus', 'anu bonus', 'mq bonus',
                  
    # already strings
    'interviewed?', 'casper quartile', 'uq rmp tier', 'interview opinion', 'interview opinion', 'interview prep hours'
]

FLOAT_TO_INT_COLUMNS = ['offer uni preference', 'interview uni preference']

for string_col in STRING_COLUMNS:
    # convert into a string and impute
    offers.loc[:, string_col] = offers[string_col].astype(str)
    offers.loc[:, string_col] = offers.replace('nan', 'None')

    # convert to float if necessary
    if (string_col in FLOAT_TO_INT_COLUMNS):
        offers.loc[:, string_col] = offers[string_col].apply(lambda x: x.split('.')[0])


 '1.0' '1.0' '3.0' '1.0' 'nan' '4.0' '3.0' '1.0' '1.0' '5.0' 'nan' 'nan'
 'nan' 'nan' 'nan' 'nan' '1.0' 'nan' 'nan' 'nan' '5.0' '2.0' 'nan' '1.0'
 '1.0' '1.0' 'nan' 'nan' '1.0' '1.0' 'nan' '1.0' '4.0' 'nan' 'nan' 'nan'
 'nan' '1.0' 'nan' 'nan' 'nan' '1.0' '1.0' 'nan' '1.0' '4.0' '1.0' '1.0'
 'nan' '1.0' '1.0' '2.0' '1.0' '1.0' 'nan' 'nan' 'nan' 'nan' '2.0' '1.0'
 'nan' '1.0' 'nan' '2.0' 'nan' 'nan' '2.0' '2.0' '1.0' 'nan' '1.0' '1.0'
 '2.0' 'nan' 'nan' 'nan' '1.0' '2.0' 'nan' '1.0' 'nan' 'nan' '1.0' '1.0'
 '1.0' 'nan' '1.0' '1.0' 'nan' '1.0' '1.0' '1.0' 'nan' '1.0' '1.0' 'nan'
 '1.0' '2.0' '2.0' '2.0' 'nan' '1.0' '1.0' '1.0' '1.0' '2.0' '1.0' '3.0'
 '1.0' '2.0' 'nan' '1.0' '1.0' '1.0' 'nan' '1.0' 'nan' '1.0' 'nan' 'nan'
 'nan' '2.0' '6.0' '2.0' '1.0' 'nan' 'nan' 'nan' '1.0' '1.0' '2.0' '2.0'
 '1.0' '2.0' '1.0' '1.0' '3.0' '1.0' '1.0' '1.0' 'nan' '2.0' '2.0' '1.0'
 'nan' '1.0' 'nan' '1.0' '1.0' '1.0' '2.0' '1.0' '2.0' '1.0' 'nan' '1.0'
 '3.0' 'nan' '1.0' '1.0' '2.0' '2.0' '3.0' '1.0' '1

In [373]:
offers['offer uni preference']

0         4
1         1
2      None
3      None
4      None
       ... 
947    None
948    None
949    None
950       1
951       1
Name: offer uni preference, Length: 952, dtype: object

### Save CSV

In [374]:
# save with the index
offers = offers.reset_index()
offers.to_csv(f"{RELATIVE_OUT}/offer.csv", index=False)

### Messing around

In [375]:
(offers["interview uni"] == UQ_NAME).sum()

0

In [376]:
def uq_mask(series):
    return series.apply(lambda x: UQ_NAME in x if not pd.isna(x) else False)

In [377]:
offers.apply(lambda x: x.count())

index                       952
rurality                    952
s1 score                    952
s2 score                    952
s3 score                    952
uw gamsat                   952
w gamsat                    952
offer uni                   688
offer uni gpa               688
offer uni gamsat            688
offer uni place type        688
offer uni preference        952
interviewed?                952
interview uni               385
interview uni gpa           385
interview uni gamsat        385
places selected             720
interview uni preference    952
deakin bonus                952
anu bonus                   952
mq bonus                    952
casper quartile             952
uq rmp tier                 952
gemsas over other?          688
interview opinion           952
notes                       567
year                        952
interview prep hours        952
other rejections            142
marker                      952
dtype: int64

## Interviews

In [267]:
interview_raw = pd.read_csv(f"{RELATIVE_IN}/interview.csv", index_col=0)
print(interview_raw.shape)
#print(interview_raw.dtypes)

(1533, 44)


### Fixing Numeric

In [268]:
interview_raw[[col for col in interview_raw.columns if "gamsat" in col or "gpa" in col]].dtypes.to_frame().T

Unnamed: 0,uw gamsat,w gamsat,interview uni gpa,interview uni gamsat,pref 1 gpa,pref 1 gamsat,pref 2 gpa,pref 2 gamsat,pref 3 gpa,pref 3 gamsat,pref 4 gpa,pref 4 gamsat,pref 5 gpa,pref 5 gamsat,pref 6 gpa,pref 6 gamsat
0,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,object


In [269]:
print(interview_raw.shape[0])
print(interview_raw["pref 6 gamsat"].apply(lambda x: isinstance(x, str)).sum())
print(interview_raw["pref 6 gamsat"].apply(lambda x: isinstance(x, int)).sum())
print(interview_raw["pref 6 gamsat"].count())

1533
568
0
568


In [270]:
interview_raw["pref 6 gamsat"][interview_raw["pref 6 gamsat"].notna()].sample(10).to_frame().T

Unnamed: 0,669,879,508,49,878,1461,557,782,924,664
pref 6 gamsat,\n,\n,\n,67.75,\n,65.25,\n,\n,\n,\n


In [271]:
interview_raw["pref 6 gamsat"] = interview_raw["pref 6 gamsat"].apply(
    lambda x: x.strip() if isinstance(x, str) else x
)
interview_raw["pref 6 gamsat"] = pd.to_numeric(interview_raw["pref 6 gamsat"])

In [272]:
print(interview_raw["pref 6 gamsat"].count())
print(interview_raw["pref 6 gamsat"].unique())

72
[        nan 64.         63.66666667 62.66666667 66.5        67.75
 63.33333333 66.         68.75       69.         63.5        68.25
 71.25       69.5        70.         65.66666667 71.         63.
 65.75       63.25       67.         65.         61.         69.75
 56.75       61.75       68.         58.25       71.75       66.67
 62.         64.67       62.33       65.33       70.33       64.25
 62.67       68.5        62.75       65.67       62.25       59.
 65.25       61.67       69.25       67.25       52.67       58.        ]


### Switching to Strings

In [273]:
STRING_COLUMNS = [
    # numeric
    'deakin bonus', 'anu bonus', 'mq bonus', 'undf bonuses', 'unds bonuses', 'uow bonuses',
    
    # other stuff
    'uq tier', 'casper quartile', 'deakin tier', 'unimelb gam'
]

for string_col in STRING_COLUMNS:
    interview_raw.loc[:, string_col] = interview_raw[string_col].apply(str)
    interview_raw.loc[:, string_col] = interview_raw.replace('nan', 'None')


  interview_raw.loc[:, string_col] = interview_raw[string_col].apply(str)
  interview_raw.loc[:, string_col] = interview_raw[string_col].apply(str)
  interview_raw.loc[:, string_col] = interview_raw[string_col].apply(str)
  interview_raw.loc[:, string_col] = interview_raw[string_col].apply(str)


In [274]:
interview_raw[STRING_COLUMNS[3]].value_counts()

undf bonuses
None      1508
WA Res      24
HDR          1
Name: count, dtype: int64

### Changing UQ

In [275]:
UNI_COLUMNS = ["interview uni"] + [f"pref {i+1} uni" for i in range(6)]
UQ_NAME = "The University of Queensland"
RMP_ENDINGS = ["(DD MP)", "(CQ-WB RMP)", "(RMP/DDMP)"]
METRO_ENDING = "(Greater Brisbane)"

# fixing the queensland column
interview = interview_raw.copy()

# change the MD and WB to correct types
for uni_column in UNI_COLUMNS:
    interview.loc[interview[uni_column].isin([f"{UQ_NAME} {x}" for x in RMP_ENDINGS ]), uni_column] = f"{UQ_NAME} (RMP)"
    interview.loc[interview[uni_column] == f"{UQ_NAME} {METRO_ENDING}", uni_column] = f"{UQ_NAME} (Metro)"

# make the notes lowe case
interview.loc[:, "notes"] = interview["notes"].str.lower()

In [276]:
#interview.groupby("interview uni")["year"].value_counts()
interview["interview uni"].value_counts()

interview uni
The University of Melbourne               300
The University of Notre Dame Sydney       198
Deakin University                         126
Griffith University                       110
The University of Western Australia       108
The University of Notre Dame Fremantle    101
The University of Wollongong               93
The University of Queensland (RMP)         83
The University of Queensland               74
Australian National University             62
The University of Queensland (Metro)       60
Macquarie University                       52
Name: count, dtype: int64

### Saving

In [277]:
# save with the index
interview = interview.reset_index()
interview.to_csv(f"{RELATIVE_OUT}/interview.csv", index=False)