In [45]:
from collections import Counter
import numpy as np
import pandas as pd
import enum
from io import StringIO
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer

# Open a PDF file.
fp = open('plan5_aktualny.pdf', 'rb')

# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)

# Create a PDF document object that stores the document structure.
# Password for initialization as 2nd parameter
document = PDFDocument(parser)

# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed

# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()

# Create a PDF device object.
device = PDFDevice(rsrcmgr)

# BEGIN LAYOUT ANALYSIS
# Set parameters for analysis.
laparams = LAParams()

# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)

# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)

info = []

def parse_obj(lt_objs, index):

    # loop over the object list
    for obj in lt_objs:

        # if it's a textbox, print text and location
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            #print(obj)
            #print ("%6d, %6d, %6d, %s" % (obj.bbox[0], obj.bbox[1], obj.bbox[2], obj.get_text().replace('\n', '_')))
            info.append((obj.bbox[0], obj.bbox[1], obj.bbox[2], obj.get_text().replace('\n', '_'), index))
        # if it's a container, recurse
        elif isinstance(obj, pdfminer.layout.LTFigure):
            parse_obj(obj._objs)

# loop over all pages in the document
for i, page in enumerate(PDFPage.create_pages(document)):

    # read the page into a layout object
    interpreter.process_page(page)
    layout = device.get_result()

    # extract text from this object
    parse_obj(layout._objs, i)

In [44]:
def get_hour_from_cord_tekst(val: str) -> str:
    if len(hour_definition[hour_definition.x <= val]):
        return hour_definition[hour_definition.x <= val].iloc[-1].Przedmiot
    return -1
def get_hour_from_cord_x(val: str) -> str:
    if len(hour_definition[hour_definition.x <= val]):
        return hour_definition[hour_definition.x <= val].iloc[-1].x
    return -1
def map_hour(hour: str, diff: int):
    if len(str(hour)) < 3:
        return -1
    hour = int(hour[:2])
    if diff > 38:
        return f'{hour + 1}:00'
    if diff < 13:
        return f'{hour}:15'
    if diff < 26:
        return f'{hour}:30'
    return f'{hour}:45'
def map_hour_duration(diff: int):
    if diff < 71:
        return "1h 30min"
    if diff < 105:
        return "2h 15min"
    return "2h 30min"
def lookup_week_day(row):
    strona = row.Strona
    word = df_week_day[(df_week_day.y > row.y) & (df_week_day.Strona == strona)].Przedmiot
    if len(word) > 0:
        word = word.values[-1]
    return "" if len(word) == 0 else word
def detect_lecture_type(text):
    return 'wykład' if '[w]' in text else 'ćwiczenia'
def map_group(y: float):
    if y > 280 and y < 350:
        return 1
    if y < 250:
        return 2
    if y > 410:
        return 1
    if y > 300 and y < 390:
        return 2
    return 0
# def map_under_text(row):
#     df_with_text = df_tmp[(
#         df_tmp.Przedmiot.str.contains('\[') == False) 
#         & (df_tmp.y < row.y) 
#         & (df_tmp.y > row.y - 30) 
#         & (df_tmp.x > row.x - hour_diff//2) 
#         & (df_tmp.x < row.x + hour_diff)
#     ].append(row).sort_index()
#     return ' '.join(df_with_text.Przedmiot.values).replace('_', '')
arr = np.array(info)
df = pd.DataFrame(arr)
df.columns = ['x', 'y', 'x_end', 'Przedmiot', 'Strona']
df.x = df.x.astype('float')
df.y = df.y.astype('float')
df.x_end = df.x_end.astype('float')
df['length_x'] = df.x_end - df.x
df

hour_definition = df[(df['Strona'] == '1') & (df['Przedmiot'].str.contains('00_'))]
hour_definition.drop_duplicates(subset=['Przedmiot'], inplace=True)
hour_definition.reset_index(drop=True, inplace=True)
hour_definition
hour = hour_definition.iloc[:2,:].x
hour_diff = int(hour[1] - hour[0])
hour_diff

df_week_day = df[['y', 'Przedmiot', 'Strona']]
df_week_day = df_week_day[(df_week_day.Przedmiot.str.contains('Sobota')) | (df_week_day.Przedmiot.str.contains('Niedziela'))]

#df_tmp = df[(df['Strona'] == '1') | (df['Strona'] == '2')]
df_tmp = df
df_tmp['hour_mapped'] = list(map(get_hour_from_cord_tekst, list(df_tmp.x)))
df_tmp['x_cord_mapped'] = list(map(get_hour_from_cord_x, list(df_tmp.x)))
df_tmp['hour_diff'] = df_tmp.x - df_tmp.x_cord_mapped
df_tmp['Godzina rozpoczęcia'] = df_tmp.apply(lambda x: map_hour(x['hour_mapped'], x['hour_diff']), axis=1)
#df_tmp.Przedmiot = df_tmp.apply(lambda x: map_under_text(x), axis=1)
df_tmp['Duration'] = df_tmp.length_x.apply(lambda x: map_hour_duration(x))
df_tmp['Week_day'] = df_tmp.apply(lambda x: lookup_week_day(x), axis=1)
#df_tmp
df_tmp['Rodzaj_zajecia'] = df_tmp['Przedmiot'].apply(detect_lecture_type)
df_tmp['Grupa'] = df_tmp['y'].apply(map_group)
df_tmp['Grupa'] = df_tmp['Grupa'].astype('int')
df_tmp[['Week_day', 'Date']] = df_tmp['Week_day'].str.split(' ', expand=True)
df_tmp = df_tmp[df_tmp.Przedmiot.str.contains('\[') == True][['Przedmiot', 'Strona', 'Godzina rozpoczęcia', 'Duration', 'Week_day', 'Rodzaj_zajecia', 'Grupa']]
df_tmp.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,Przedmiot,Strona,Godzina rozpoczęcia,Duration,Week_day,Rodzaj_zajecia,Grupa
57,"ME [ćw], dr hab. M.Hajduk, _sala D1/15_",1,8:00,2h 15min,Sobota,ćwiczenia,1
58,"WWW [ćw], dr inż. B.Nowak, _sala A3/17_",1,10:30,2h 30min,Sobota,ćwiczenia,1
59,"WWW [ćw], dr inż. B.Nowak, _sala A3/17_",1,8:00,2h 30min,Sobota,ćwiczenia,2
60,"ME [ćw], dr hab. M.Hajduk, _sala D1/15_",1,10:30,2h 15min,Sobota,ćwiczenia,2
61,"ME [w], dr hab. L.Błaszkiewicz, _prof. UWM, sa...",1,13:00,2h 30min,Sobota,wykład,0


Unnamed: 0,y,Przedmiot,Strona,Godzina rozpoczęcia,Duration,Week_day,length_x,Rodzaj_zajecia,Grupa,a,b,Date
57,436.0516,"ME [ćw], dr hab. M.Hajduk, _sala D1/15_",1,8:00,2h 15min,Sobota,98.61864,ćwiczenia,1,Sobota,08.10.2022_,08.10.2022_
58,436.0516,"WWW [ćw], dr inż. B.Nowak, _sala A3/17_",1,10:30,2h 30min,Sobota,105.43656,ćwiczenia,1,Sobota,08.10.2022_,08.10.2022_
59,382.5016,"WWW [ćw], dr inż. B.Nowak, _sala A3/17_",1,8:00,2h 30min,Sobota,105.43656,ćwiczenia,2,Sobota,08.10.2022_,08.10.2022_
60,382.5016,"ME [ćw], dr hab. M.Hajduk, _sala D1/15_",1,10:30,2h 15min,Sobota,98.61864,ćwiczenia,2,Sobota,08.10.2022_,08.10.2022_
61,409.2916,"ME [w], dr hab. L.Błaszkiewicz, _prof. UWM, sa...",1,13:00,2h 30min,Sobota,112.90572,wykład,0,Sobota,08.10.2022_,08.10.2022_


In [39]:
df_tmp['Week_day'].str.split(' ', expand=True)

Unnamed: 0,0,1
57,Sobota,08.10.2022_
58,Sobota,08.10.2022_
59,Sobota,08.10.2022_
60,Sobota,08.10.2022_
61,Sobota,08.10.2022_
62,Sobota,08.10.2022_
63,Sobota,08.10.2022_
79,Niedziela,09.10.2022_
80,Niedziela,09.10.2022_
81,Niedziela,09.10.2022_


In [37]:
df_tmp.columns

Index(['y', 'Przedmiot', 'Strona', 'Godzina rozpoczęcia', 'Duration',
       'Week_day', 'length_x', 'Rodzaj_zajecia', 'Grupa'],
      dtype='object')

In [None]:
"""
54 - 1.5h
104 = 2.5 h
60 -1.5h
69 - 1.5h
88 - 2h 15min
"""