In [None]:
from pypdf import PdfReader
import pandas as pd

In [None]:
# Open the cutoff PDF once; the cells below read individual pages through `reader.pages`.
reader = PdfReader("cutoff24.pdf")

In [None]:
# Peek at the text line with index 11 on each of the first 15 pages,
# leaving a blank line between the samples.
for page_index in range(15):
    current_page = reader.pages[page_index]
    page_lines = current_page.extract_text().split("\n")
    print(page_lines[11], end="\n\n")

In [None]:
# Print every text line of the page at index 1322 next to its line index,
# so the page layout can be read off by position.
sample_page_lines = reader.pages[1322].extract_text().split('\n')
for line_number, text in enumerate(sample_page_lines):
    print(line_number, text)

# Line 7 is always the college name.
# Line 7+1 is the first course name.
# Line 7+2 is the course status and university.
# Line 7+3 is the reservation type.
# Line 7+4 holds the category labels; there are n labels.
# Lines 7+4+1*n and 7+4+2*n are the ranks and percentiles (the first one also carries the stage number).

In [None]:
# Load the CSV export and keep only its first column as a plain Python list;
# each entry is one raw text line that the parsing cell below walks through.
df = pd.read_csv("Maharashtra-cut-off.csv")
lines = df.iloc[:, 0].to_list()

# Eyeball a small window of the raw lines.
for preview_line in lines[200:250]:
    print(preview_line)


In [None]:
import re
from itertools import accumulate
from rich.progress import track

# Exact header lines that announce the regional reservation type of the table
# that follows. The parser compares whole lines against this list, so every
# entry must be its own string: a missing comma between two adjacent literals
# makes Python silently concatenate them into a single (never matching) entry.
regional_reservation_types = [
    "Home University Seats Allotted to Home University Candidates",
    "Home University Seats Allotted to Other Than Home University Candidates",
    "Other Than Home University Seats Allotted to Home University Candidates",
    "Other Than Home University Seats Allotted to Other Than Home University Candidates",
    "State Level",
]

# `row` collects the fields of the record currently being assembled;
# `rows` receives a copy of it for every parsed table entry.
row = {}
rows = []

# Raw category/rank/percentile lines are also dumped to this text file.
f = open("rawranklines.txt", mode='w')

def extract_table_data(category_line, rank_line):
    """Pair every category label with the rank printed in the same column.

    Both arguments are lines of a fixed-width text table. For each
    whitespace-delimited label in ``category_line`` the character position of
    its midpoint is computed (the same midpoint rule ``parse_lines`` uses), and
    the token of ``rank_line`` covering that position is taken as its value.

    The previous implementation could not run to completion: it referenced an
    undefined name, advanced its cursor outside the scanning loop (never
    terminating on a non-numeric first character), stored a character offset
    instead of the matched word, and yielded only once after the loop.

    Args:
        category_line: Text line holding the category labels.
        rank_line: Text line holding the values aligned under those labels.

    Yields:
        ``(label, value)`` tuples in left-to-right order. ``value`` is ``None``
        when ``rank_line`` has no token at the label's midpoint column.
    """
    # Column span and text of every token on the rank line, computed once.
    rank_spans = [
        (found.start(), found.end(), found.group())
        for found in re.finditer(r'\S+', rank_line)
    ]

    for label in re.finditer(r'\S+', category_line):
        midpoint = (label.start() + label.end()) // 2
        # Tokens do not overlap, so at most one span can contain the midpoint.
        aligned = next(
            (text for begin, end, text in rank_spans if begin <= midpoint < end),
            None,
        )
        yield (label.group(), aligned)

def parse_lines(category_line, rank_line, percentile_line):
    """Match category labels with the rank and percentile in the same column.

    Every whitespace-delimited label of ``category_line`` is located, the
    column of its midpoint is computed, and ``extract_word_at`` looks up the
    word at that column in ``rank_line`` and in ``percentile_line``.

    Returns:
        A list of ``(category, rank_word, percentile_word)`` tuples in
        left-to-right order. Labels for which either lookup comes back empty
        are left out.
    """
    triples = []
    label_start = None

    # The trailing space is a sentinel that flushes a label ending at the
    # very end of the line.
    for position, char in enumerate(category_line + " "):
        if not char.isspace():
            if label_start is None:
                label_start = position
            continue
        if label_start is None:
            # Whitespace between labels: nothing pending.
            continue

        centre = (label_start + position) // 2
        label = category_line[label_start:position]
        label_start = None

        rank_at_centre = extract_word_at(rank_line, centre)
        percentile_at_centre = extract_word_at(percentile_line, centre)
        if rank_at_centre and percentile_at_centre:
            triples.append((label, rank_at_centre, percentile_at_centre))

    return triples

def extract_word_at(line, index):
    """Return the whitespace-delimited word of ``line`` that touches ``index``.

    The word is found by walking left from ``index`` until the preceding
    character is whitespace (or the line starts) and right from ``index``
    until a whitespace character (or the line end) is hit. Because the left
    walk inspects the character *before* the cursor, an index sitting on the
    whitespace directly after a word still returns that word.

    Returns:
        The word, or ``None`` when ``index`` is at or past the end of the line
        or no word is found there.
    """
    size = len(line)
    if not index < size:
        return None

    left = index
    while left > 0:
        if line[left - 1].isspace():
            break
        left -= 1

    right = index
    while right < size:
        if line[right].isspace():
            break
        right += 1

    return line[left:right].strip() or None

# State carried between iterations: the most recent category header (the
# "Stage..." line with its first five characters removed) and the rank and
# percentile lines of the table row currently being parsed.
category_line = ""
rank_line = ""
percentile_line = ""

# Line shapes recognised by the parser, compiled once instead of per line.
college_pattern = re.compile(r'^(\d{5})\s-\s(.+)')
course_pattern = re.compile(r'^(\d{10})\s-\s(.+)')
status_pattern = re.compile(r'^Status: (.+?) Home University : (.+)$')
# A rank line starts with a stage marker of one to three "I" characters. The
# marker must not be followed by a letter, so ordinary text that merely begins
# with a capital "I" is not mistaken for a rank line.
stage_pattern = re.compile(r'^(I{1,3})(?![A-Za-z])')

try:
    for i, line in enumerate(lines):
        # Non-string entries (e.g. NaN for an empty cell) are turned into text
        # so they can be recognised and skipped below.
        if type(line) is not str:
            line = str(line)

        if line == "nan":
            continue

        if (college_match := college_pattern.match(line)):
            row['college_code'] = college_match.group(1)
            row['college_name'] = college_match.group(2)

        elif (course_match := course_pattern.match(line)):
            row['course_code'] = course_match.group(1)
            row['course_name'] = course_match.group(2)

        elif (status_match := status_pattern.match(line)):
            row['course_status'] = status_match.group(1)
            row['course_university'] = status_match.group(2)

        elif line in regional_reservation_types:
            row['regional_reservation'] = line

        elif line.startswith("Stage"):
            # Keep everything after the word "Stage": the category labels.
            category_line = line[5:]

        elif (stage_match := stage_pattern.match(line)):
            # The marker length (I, II, III) doubles as the round number.
            row['last_cap_round'] = len(stage_match.group(1))
            rank_line = line[stage_match.end():]

            # The percentile line follows the rank line. It may be missing at
            # the end of the input or be a non-string (empty) entry; treat both
            # as an empty line instead of raising IndexError / TypeError.
            following = lines[i + 1] if i + 1 < len(lines) else ""
            percentile_line = following if isinstance(following, str) else ""

            f.write(category_line + "\n")
            f.write(rank_line + "\n")
            f.write(percentile_line + "\n")

            for category_reservation, last_rank, cutoff_percentile in parse_lines(
                category_line, rank_line, percentile_line
            ):
                row['category_reservation'] = category_reservation
                row['last_rank'] = last_rank
                row['cutoff_percentile'] = cutoff_percentile
                # Copy, because `row` keeps being mutated for later entries.
                rows.append(row.copy())
finally:
    # Close the raw-line dump even if parsing fails part-way through.
    f.close()

df = pd.DataFrame(rows)
df.head()

In [None]:

test = """
      GOPENS          GSCS           GNT1S          GNT2S          GOBCS          GSEBCS              TFWS              EWS
      32812          57115          42283          50806           41986          75468              31644            49031
(88.9034341)   (80.0921435)   (85.4541675)   (82.4751693)    (85.5052176)   (72.8411223)       (89.3071786)     (83.0327460)
      GOPENS          GSCS            GSTS           GVJS           GNT1S          GNT2S              GNT3S           GOBCS          GSEBCS         LOPENS           LSCS           LSTS           LVJS          LNT2S          LOBCS         LSEBCS       PWDOPENS       PWDOBCS        DEFOPENS        DEFOBCS          TFWS        PWDROBC        DEFRSEBC         ORPHAN           EWS
      4466           17298          38171          18780           10064          8337               4763             5749           6563           5539           20572          19476          14391          17802          8953           8596           6409           29879          10938          24353          4428           26196          22982          49714          7810
(98.4718166)   (94.2401197)   (86.9353148)   (93.7520868)    (96.6700602)   (97.2156659)       (98.3761174)     (98.0509496)   (97.7963272)   (98.1156916)   (93.1559091)   (93.5225248)   (95.2450348)   (94.0878795)   (97.0354332)   (97.1452421)   (97.8540959)   (89.9248303)   (96.3844907)   (91.8929842)   (98.4844069)   (91.2510363)   (92.3221052)   (82.7820291)   (97.4012008)
      GOPENS          GSCS            GSTS           GVJS           GNT1S          GNT2S              GNT3S           GOBCS          GSEBCS         LOPENS           LSCS           LSTS          LNT1S          LNT2S          LNT3S          LOBCS         LSEBCS       PWDOPENS         PWDSCS       PWDOBCS        PWDSEBCS       DEFOPENS         DEFSCS        DEFOBCS       DEFSEBCS          TFWS
      96             1765           8970           1783            1497           451                843              291            408            168            3320           6240           1166           553            491            452            273            3651           48060          5624           10377          544            9740           1112           1135           52
(99.9666110)   (99.3417263)   (97.0354332)   (99.3392345)    (99.4300008)   (99.8028497)       (99.6596408)     (99.8781231)   (99.8260796)   (99.9376662)   (98.8377026)   (97.8889628)   (99.5366194)   (99.7630430)   (99.7887086)   (99.8028497)   (99.8889163)   (98.7367021)   (83.5369475)   (98.0953593)   (96.5855308)   (99.7670042)   (96.7696160)   (99.5592791)   (99.5515511)   (99.9833770)
      GOPENS          GSCS            GSTS           GVJS           GNT2S          GNT3S             GOBCS            GSEBCS         LOPENS           LSCS           LSTS           LVJS          LNT1S          LOBCS         LSEBCS       PWDOPENS       PWDOBCS        DEFOPENS        DEFOBCS          TFWS        PWDROBC        DEFROBCS         ORPHAN           EWS
      476            4583           12972          2176            1382           2260               522              1095           605            7588           16110          2522           2832           1040           2778           11817          26257          1220           4349           533            25757          2365           20553          1531
(99.7949643)   (98.4469443)   (95.7030200)   (99.2152371)    (99.4697548)   (99.1801893)       (99.7755891)     (99.5636454)   (99.7457558)   (97.4656463)   (94.6546232)   (99.1017461)   (99.0007078)   (99.5899286)   (99.0196078)   (96.1033294)   (91.2437024)   (99.5260861)   (98.5039272)   (99.7709341)   (91.3560438)   (99.1391853)   (93.1559091)   (99.4188513)
      GOPENS          GSCS            GSTS           GVJS           GNT1S          GNT2S             GOBCS            GSEBCS         LOPENS           LSCS           LSTS           LVJS          LNT2S          LNT3S          LOBCS         LSEBCS       PWDOPENS       PWDOBCS        DEFOPENS        DEFOBCS          TFWS        PWDROBC        DEFRSEBC         ORPHAN           EWS
      1341           6653           25131          4988            2303           2713               1745             2610           2051           9023           33070          11023          4105           12405          2795           2397           6994           34161          1977           10337          1861           22809          6059           30356          3327
(99.4863759)   (97.7676159)   (91.6275203)   (98.2928759)    (99.1771579)   (99.0370747)       (99.3451981)     (99.0691489)   (99.2524046)   (97.0110164)   (88.7794539)   (96.3627546)   (98.5758959)   (95.8848080)   (99.0119265)   (99.1347494)   (97.6849731)   (88.5013511)   (99.2821369)   (96.5964131)   (99.3129866)   (92.4262209)   (97.9679189)   (89.7974248)   (98.8358603)
      GOPENS          GSCS            GSTS           GVJS           GNT1S          GNT2S              GNT3S           GOBCS          GSEBCS         LOPENS           LSCS           LSTS           LVJS          LNT1S          LNT2S          LOBCS         LSEBCS       PWDOPENS        PWDSCS        PWDOBCS        PWDSEBCS       DEFOPENS         DEFSCS        DEFOBCS       DEFSEBCS          TFWS
      635            5112           18607          3351            1993           2417               1337             865            1286           1118           7195           20205          8284           4829           2398           1540           2177           9391           49509          19828          39857          1862           18013          2730           2491           582
(99.7311587)   (98.2629654)   (93.8492553)   (98.8315642)    (99.2685866)   (99.1293249)       (99.4863759)     (99.6479578)   (99.5013298)   (99.5592791)   (97.5983511)   (93.2798858)   (97.2402198)   (98.3505694)   (99.1347494)   (99.4180407)   (99.2152371)   (96.8957779)   (82.8590175)   (93.4044269)   (86.2657912)   (99.3129866)   (94.0242686)   (99.0345870)   (99.1068447)   (99.7548103)
      GOPENS          GSCS            GSTS          GNT1S          GOBCS          GSEBCS             LOPENS             LSCS          LNT3S          LOBCS         LSEBCS       PWDOPENS       DEFOPENS          TFWS         PWDRSTS       DEFROBCS           EWS
      1521           10641          27909          4101            2622           2668               1951             10478          8516           2339           6826           44713          3038           1825           47440          15532          4028
(99.4212433)   (96.4996882)   (90.6140311)   (98.5847116)    (99.0691103)   (99.0427132)       (99.2906313)     (96.5413029)   (97.1782833)   (99.1563130)   (97.7104553)   (84.6087386)   (98.9309246)   (99.3279194)   (83.6467428)   (94.8262965)   (98.6114738)
      GOPENS          GSCS            GSTS           GVJS           GNT1S          GNT2S              GNT3S           GOBCS          GSEBCS         LOPENS           LSCS           LSTS           LVJS          LNT1S          LNT2S          LOBCS         LSEBCS       PWDOPENS        PWDSCS        PWDOBCS        PWDSEBCS       DEFOPENS         DEFSCS        DEFOBCS       DEFSEBCS          TFWS
      2109           12115          33571          6916            8063           4344               4237             3310           3552           3043           18845          60872          11643          11152          9462           5220           4914           36640          159543         53563          43997          6146           34027          13425          19372          2207
(99.2311848)   (96.0030715)   (88.6325220)   (97.6997985)    (97.3175348)   (98.5039272)       (98.5392956)     (98.8401580)   (98.7615842)   (98.9308654)   (93.7458444)   (78.9120215)   (96.1675401)   (96.3424669)   (96.8831816)   (98.2312987)   (98.3307978)   (87.5036363)   (22.4081803)   (81.4024523)   (84.8482325)   (97.9251183)   (88.5172385)   (95.5544287)   (93.5379106)   (99.1976720)
      GOPENS          GSCS            GSTS           GVJS           GNT1S          GNT2S             GOBCS            GSEBCS         LOPENS           LSCS           LSTS          LNT2S          LNT3S          LOBCS         LSEBCS       PWDOPENS       PWDOBCS        DEFOPENS        DEFOBCS          TFWS        PWDRSEBC       DEFRSEBC           EWS
      3877           17211          48924          15181           10483          8776               5662             5385           4712           23574          70365          17456          22357          7448           11307          58069          103371         8459           25023          6371           58458          24999          11142
(98.6602671)   (94.3030050)   (83.1469549)   (94.9943414)    (96.5413029)   (97.0696299)       (98.0867461)     (98.1833299)   (98.3958075)   (92.1947909)   (75.0114283)   (94.2061358)   (92.5598125)   (97.5292884)   (96.2601420)   (79.7465759)   (60.2294112)   (97.1866689)   (91.6309281)   (97.8590730)   (79.5290328)   (91.6345444)   (96.3424669)
      GOPENS          GSCS            GSTS           GVJS           GNT1S          GNT2S              GNT3S           GOBCS          GSEBCS         LOPENS           LSCS           LSTS           LVJS          LNT1S          LOBCS         LSEBCS       PWDOPENS       PWDOBCS        DEFOPENS        DEFOBCS          TFWS        PWDROBC        DEFROBCS           EWS
      6313           22428          72182          19759           11063          11049              9891             7096           8674           8293           26422          54883          21408          14301          9570           15326          60018          153215         16037          27551          7177           104864         22957          18989
(97.8806679)   (92.5368222)   (74.2767378)   (93.4175138)    (96.3500312)   (96.3500312)       (96.7466861)     (97.6386926)   (97.1295005)   (97.2230306)   (91.2367752)   (80.9741447)   (92.8871646)   (95.2691748)   (96.8268477)   (94.8964961)   (79.0646435)   (27.9969458)   (94.6753610)   (90.7979989)   (97.6037124)   (59.9791975)   (92.3354600)   (93.7092583)
      GOPENH
      174944
(6.3063439)
      GOPENO
      72526
(74.1162962)
       TFWS
      75117
(73.1122470)
      GOPENH          GSCH           GNT1H         GOBCH           GSEBCH         LOPENH               LSCH             LVJH          LNT2H         LOBCH         LSEBCH       PWDOPENH
      135092         167692         144642         136909          168300         113928             140620           163960         173666        146193         150654         91029
(41.8427081)   (14.7377685)   (34.8052940)   (40.1585977)    (13.9490359)   (54.1800379)       (37.6881791)     (18.5574725)   (7.9711444)   (33.5585964)   (30.3265615)   (65.8812284)
      GOBCH
      165794
(16.3513698)
     GOPENO           GNT2O         GOBCO          LOPENO          LOBCO
      104402         179518         112444         124930          144442
(60.0743746)   (0.1786902)    (55.1151268)   (47.9842799)    (34.8626522)
      LSEBCO
      162234
(20.1344161)
       TFWS
      133001
(42.8773305)
      GOPENH          GNT2H         GOBCH          GSEBCH             LOPENH           LSCH              LOBCH           LSEBCH
      134616         161377         158097         177777             88286          149722             143825           110316
(42.1597155)   (21.0368242)   (24.0053974)   (2.7561837)        (67.5033807)   (30.9541920)       (35.3791065)     (56.8466110)
      GOPENO         LOPENO
      142923         155597
(36.0381755)   (25.6745674)
      GOBCO          GSEBCO
      175879         168540
(5.2422847)    (13.6621028)
       TFWS
      107729
(57.4142590)
      GOPENH         LOPENH
      177137         175127
(3.7972607)    (6.2399253)
      GOPENO
      173689
(7.8773225)
       TFWS
      150983
(29.6395567)
      GOPENH
      173784
(7.8389114)
      GOPENO
      144593
(34.8052940)
       TFWS
      133721
(42.2216419)
      GOPENS          GSCS            GVJS          GNT1S           GNT2S          GNT3S             GOBCS            GSEBCS         LOPENS           LSCS           LVJS          LNT1S          LNT2S          LOBCS         LSEBCS       DEFOPENS          TFWS            EWS
      80985          156820         143452         108042          98186          120562             119628           93242          75654          148556         135647         100568         77566          100891         80250          155115         59226          161163
(70.5215521)   (24.7657909)   (35.5925864)   (57.3310702)    (62.8351694)   (50.8618056)       (51.1396363)     (65.2878345)   (72.8176141)   (31.6234334)   (41.0394031)   (61.3261708)   (72.1239122)   (60.8485656)   (70.8722936)   (26.1271353)   (79.4055287)   (21.3531102)
      GOPENS          GSCS            GSTS           GVJS           GNT1S          GNT3S             GOBCS            GSEBCS         LOPENS           LSCS           LSTS          LNT2S          LOBCS         LSEBCS          TFWS            EWS
      110906         155018         165287         157793          114339         123986             140889           169287         95850                         105385                        125447         165450         65126          140286
(56.5649954)   (26.1271353)   (16.8882979)   (24.0471793)    (54.0859766)   (48.6637821)       (37.6014199)     (13.0991478)   (63.5258359)                  (59.3445914)                  (47.6302148)   (16.7391126)   (76.8897938)   (38.4109612)
      GOPENS          GSCS            GSTS           GVJS           GNT1S          GNT3S             GOBCS            GSEBCS         LOPENS           LSCS           LSTS          LNT2S          LOBCS         LSEBCS          TFWS            EWS
                                                                                                                                                   164836                        97104
(17.7766170)                  (63.0759599)
      GOPENS          GNT2S         GSEBCS         LOPENS            LSCS           LVJS              LNT2S            LOBCS         LSEBCS          TFWS
      176563         176789         178867         140781          171740         151102             171707           148236         148098         92209
(4.3685242)    (4.0661895)    (1.1851062)    (37.6881791)    (10.3145909)   (29.6395567)       (10.4465918)     (31.6884901)   (32.0977394)   (65.5460439)
      GOPENS          GSCS            GVJS          GNT2S          GOBCS          GSEBCS             LOPENS            LOBCS         LSEBCS          TFWS            EWS
      111811         168499         131741         122226          159507         121620             104944           110323         119365         75287          133443
(55.3559436)   (13.7020033)   (43.8452614)   (49.7949643)    (22.4081803)   (50.0957604)       (59.9791975)     (56.8466110)   (51.2170385)   (73.1122470)   (42.7253460)
"""

def parse_all_blocks(multi_input: str):
    """Parse a text dump made of consecutive category/rank/percentile triples.

    Blank and whitespace-only lines are dropped first. The remaining lines are
    consumed three at a time (category line, rank line, percentile line) and
    each triple is handed to ``parse_lines``. Trailing lines that do not fill
    a complete triple are ignored.

    Returns:
        A list with one ``parse_lines`` result per complete triple.
    """
    kept = [text for text in multi_input.strip().splitlines() if text.strip()]
    return [
        parse_lines(kept[offset], kept[offset + 1], kept[offset + 2])
        for offset in range(0, len(kept) - 2, 3)
    ]

# Run the parser over the sample text in `test`; as the last expression of the cell its result is displayed.
parse_all_blocks(test)

