In [282]:
import pandas as pd
import numpy as np
import csv

### Functions

In [348]:
def has_date(line_str):
    return (len(line_str) < 5 or line_str[:6].count("/") == 2)

In [349]:
def get_split(line_str, header_2024=False):
    line_split = line_str.split("\t")

    if (header_2024):
        # split up
        line_split = [x.split(" + ") for x in line_split]

        # flatten
        line_split = [item for sublist in line_split for item in sublist]

    return line_split

In [387]:
OUTLIER_INDEX = 13
OUTLIER_FLAG = "Yes- I"

def remove_error(line_split, year_use, offers):
    # if end line is anywhere in the middle
    if ("\n" in line_split[:-1]):
        line_split.remove("\n")

        # insert the missing attribute if not present
        if (year_use == 2023 and line_split[OUTLIER_INDEX] != OUTLIER_FLAG):
            line_split.insert(OUTLIER_INDEX, OUTLIER_FLAG)

    return line_split

In [388]:
def get_lines(path, year_use, diagnose=False, offers=False):
    # the final array
    out_lines = []
    header = []

    with open(path) as fp:
        # get all the lines
        lines = fp.readlines()

        # get the header
        header = get_split(lines[0], header_2024=year_use==2024)

        prev_line = get_split(lines[1])
        for i in range(2, len(lines)):
            # add to the current line to previous line
            if (not has_date(lines[i])):
                prev_line.extend(get_split(lines[i]))
            else:
                if (diagnose):
                    prev_line = remove_error(prev_line, year_use, offers)

                    print(prev_line)
                    print(i-1, "flag", len(prev_line))
                    if (len(prev_line) != 27):
                        print("bingo")
                    #print(lines[i])

                    out_lines.append(prev_line)
                else:
                    out_lines.append(remove_error(prev_line, year_use, offers))
                
                prev_line = get_split(lines[i])

        # add the final line end of the file
        out_lines.append(remove_error(prev_line, year_use, offers))

    return header, out_lines

### Exection

In [403]:
RELATIVE_IN = "1. landing/1. text"
RELATIVE_OUT = "1. landing/2. csv"

In [322]:
START_YEAR = 2022
END_YEAR = 2024

for year in range(START_YEAR, END_YEAR+1):
    # get the lines and header
    header, separated = get_lines(f"{RELATIVE_IN}/interview {year}.txt", year, offers=False)

    # get the dataframe
    interview_frame = pd.DataFrame(columns=header, data=separated)

    # write the frame
    interview_frame.to_csv(f"{RELATIVE_OUT}/interview {year}.csv")

In [389]:
START_YEAR = 2022
END_YEAR = 2023

for year in range(START_YEAR, END_YEAR+1):
    print(f"start {year}")

    # get the lines and header
    header, separated = get_lines(f"{RELATIVE_IN}/offers {year}.txt", year, offers=True, diagnose=False)

    # get the dataframe
    interview_frame = pd.DataFrame(columns=header, data=separated)

    # write the frame
    interview_frame.to_csv(f"{RELATIVE_OUT}/offers {year}.csv")

start 2022
start 2023


### Verifying

In [401]:
df = pd.read_csv(f"{RELATIVE_OUT}/offers 2023.csv", index_col=0)
df.columns

Index(['Timestamp', 'Rurality', 'MMI Prep (hrs)', 'MMI Opinion',
       'Offering Uni', 'Place Type', 'GPA', 'GAMSAT', 'Preference',
       'Interviewing Uni', 'GPA.1', 'GAMSAT.1', 'Preference.1',
       'Other Rejections', 'Places?', 'GEMSAS vs Other?', 'Status', 'Notes',
       'UQ MP/RMP Tier', 'CASPer Quartile', 'Deakin Bonus', 'ANU Bonus',
       'MQ Bonus (GPA)', 'UW GAMSAT', 'W GAMSAT', 'S1 Score', 'S2 Score',
       'S3 Score\n'],
      dtype='object')

In [402]:
df[(df["Notes"] != "")]

Unnamed: 0,Timestamp,Rurality,MMI Prep (hrs),MMI Opinion,Offering Uni,Place Type,GPA,GAMSAT,Preference,Interviewing Uni,...,UQ MP/RMP Tier,CASPer Quartile,Deakin Bonus,ANU Bonus,MQ Bonus (GPA),UW GAMSAT,W GAMSAT,S1 Score,S2 Score,S3 Score\n
0,30/10/2023 11:45:01,Non-Rural,0-5,Poorly,,,,,,Griffith University,...,,2nd,4,0,0,72.33,71.00,79,71,67
1,30/10/2023 11:45:09,Non-Rural,250+,Very well,,,,,,Macquarie University,...,,3rd,0,0,0,71.33,73.00,67,69,78
2,30/10/2023 11:46:59,Non-Rural,11-25,Very well,Macquarie University,FFP,6.646,69.25,3.0,,...,,,0,0,3,69.33,69.25,65,74,69
3,30/10/2023 11:47:22,Non-Rural,11-25,Poorly,Griffith University,CSP,7.000,70.00,2.0,The University of Queensland (Greater Brisbane),...,,4th,0,0,0,70.00,70.00,65,75,70
4,30/10/2023 11:47:43,Non-Rural,11-25,Unsure,,,,,,Deakin University,...,,4th,12,4,0,63.00,61.25,56,77,56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504,24/05/2024 11:14:44,Non-Rural,26-50,Well,,,,,,Macquarie University,...,,,0,0,0,67.67,68.00,63,71,69
505,27/06/2024 22:32:25,Non-Rural,51-100,Poorly,,,,,,The University of Notre Dame Fremantle,...,,4th,6,0,0,62.00,61.50,52,74,60
506,12/07/2024 14:04:45,Non-Rural,26-50,Very poorly,,,,,,The University of Wollongong,...,Tier 1 (CQ-WB RMP),2nd,4,0,0,71.00,72.00,68,70,75
507,25/07/2024 19:35:25,Non-Rural,101-250,Well,The University of Melbourne,CSP,6.875,72.67,1.0,,...,,,0,0,0,72.67,73.25,62,81,75
