In [33]:
from collections import Counter
import numpy as np
import pandas as pd
import enum
from io import StringIO
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer

# Open a PDF file.
fp = open('plan5_2.pdf', 'rb')

# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)

# Create a PDF document object that stores the document structure.
# Password for initialization as 2nd parameter
document = PDFDocument(parser)

# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed

# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()

# Create a PDF device object.
device = PDFDevice(rsrcmgr)

# BEGIN LAYOUT ANALYSIS
# Set parameters for analysis.
laparams = LAParams()

# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)

# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)

info = []

def parse_obj(lt_objs, index):

    # loop over the object list
    for obj in lt_objs:

        # if it's a textbox, print text and location
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            #print(obj)
            print ("%6d, %6d, %6d, %s" % (obj.bbox[0], obj.bbox[1], obj.bbox[2], obj.get_text().replace('\n', '_')))
            info.append((obj.bbox[0], obj.bbox[1], obj.bbox[2], obj.get_text().replace('\n', '_'), index))
        # if it's a container, recurse
        elif isinstance(obj, pdfminer.layout.LTFigure):
            parse_obj(obj._objs)

# loop over all pages in the document
for i, page in enumerate(PDFPage.create_pages(document)):

    # read the page into a layout object
    interpreter.process_page(page)
    layout = device.get_result()

    # extract text from this object
    parse_obj(layout._objs, i)

   309,    516,    558, Plan NS sem. zimowy 2020/2021_Informatyka, NS - I st., rok III, semestr 5_
   381,    488,    485, Sobota 03.10.2020_
    85,    473,    114, 07:00_
   137,    473,    165, 08:00_
   188,    473,    216, 09:00_
   239,    473,    268, 10:00_
   291,    473,    319, 11:00_
   342,    473,    370, 12:00_
   394,    473,    422, 13:00_
   445,    473,    473, 14:00_
   496,    473,    524, 15:00_
   548,    473,    576, 16:00_
   599,    473,    627, 17:00_
   650,    473,    679, 18:00_
   702,    473,    730, 19:00_
   753,    473,    781, 20:00_
    85,    336,    114, 07:00_
   137,    336,    165, 08:00_
   188,    336,    216, 09:00_
   239,    336,    268, 10:00_
   291,    336,    319, 11:00_
   342,    336,    370, 12:00_
   394,    336,    422, 13:00_
   445,    336,    473, 14:00_
   496,    336,    524, 15:00_
   548,    336,    576, 16:00_
   599,    336,    627, 17:00_
   650,    336,    679, 18:00_
   702,    336,    730, 19:00_
   753,    336,    78

In [171]:
def get_hour_from_cord_tekst(val: str) -> str:
    if len(hour_definition[hour_definition.x <= val]):
        return hour_definition[hour_definition.x <= val].iloc[-1].Przedmiot
    return -1
def get_hour_from_cord_x(val: str) -> str:
    if len(hour_definition[hour_definition.x <= val]):
        return hour_definition[hour_definition.x <= val].iloc[-1].x
    return -1
def map_hour(hour: str, diff: int):
    if len(str(hour)) < 3:
        return -1
    hour = int(hour[:2])
    if diff > 38:
        return f'{hour + 1}:00'
    if diff < 13:
        return f'{hour}:15'
    if diff < 26:
        return f'{hour}:30'
    return f'{hour}:45'
def map_hour_duration(diff: int):
    if diff < 71:
        return "1h 30min"
    if diff < 105:
        return "2h 15min"
    return "2h 30min"
def lookup_week_day(row):
    strona = row.Strona
    word = df_week_day[(df_week_day.y > row.y) & (df_week_day.Strona == strona)].Przedmiot
    if len(word) > 0:
        word = word.values[0]
    return "" if len(word) == 0 else word
# def map_under_text(row):
#     df_with_text = df_tmp[(
#         df_tmp.Przedmiot.str.contains('\[') == False) 
#         & (df_tmp.y < row.y) 
#         & (df_tmp.y > row.y - 30) 
#         & (df_tmp.x > row.x - hour_diff//2) 
#         & (df_tmp.x < row.x + hour_diff)
#     ].append(row).sort_index()
#     return ' '.join(df_with_text.Przedmiot.values).replace('_', '')
arr = np.array(info)
df = pd.DataFrame(arr)
df.columns = ['x', 'y', 'x_end', 'Przedmiot', 'Strona']
df.x = df.x.astype('float')
df.y = df.y.astype('float')
df.x_end = df.x_end.astype('float')
df['length_x'] = df.x_end - df.x
df

hour_definition = df[(df['Strona'] == '1') & (df['Przedmiot'].str.contains('00_'))]
hour_definition.drop_duplicates(subset=['Przedmiot'], inplace=True)
hour_definition.reset_index(drop=True, inplace=True)
hour_definition
hour = hour_definition.iloc[:2,:].x
hour_diff = int(hour[1] - hour[0])
hour_diff

df_week_day = df[['y', 'Przedmiot', 'Strona']]
df_week_day = df_week_day[(df_week_day.Przedmiot.str.contains('Sobota')) | (df_week_day.Przedmiot.str.contains('Niedziela'))]

#df_tmp = df[(df['Strona'] == '1') | (df['Strona'] == '2')]
df_tmp = df
df_tmp['hour_mapped'] = list(map(get_hour_from_cord_tekst, list(df_tmp.x)))
df_tmp['x_cord_mapped'] = list(map(get_hour_from_cord_x, list(df_tmp.x)))
df_tmp['hour_diff'] = df_tmp.x - df_tmp.x_cord_mapped
df_tmp['Godzina rozpoczęcia'] = df_tmp.apply(lambda x: map_hour(x['hour_mapped'], x['hour_diff']), axis=1)
#df_tmp.Przedmiot = df_tmp.apply(lambda x: map_under_text(x), axis=1)
df_tmp['Duration'] = df_tmp.length_x.apply(lambda x: map_hour_duration(x))
df_tmp['Week_day'] = df_tmp.apply(lambda x: lookup_week_day(x), axis=1)
#df_tmp
df_tmp = df_tmp[df_tmp.Przedmiot.str.contains('\[') == True][['y', 'Przedmiot', 'Strona', 'Godzina rozpoczęcia', 'Duration', 'Week_day', 'length_x']]
#df_tmp[['y', 'Przedmiot', 'Strona', 'Godzina rozpoczęcia', 'Duration', 'Week_day']]
df_tmp

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,y,Przedmiot,Strona,Godzina rozpoczęcia,Duration,Week_day,length_x
54,404.1016,"ME [ćw], dr _M.Wojciechowski, _sala C1_",1,11:00,1h 30min,Sobota 10.10.2020_,66.36216
55,404.1016,"ME [ćw], dr _M.Wojciechowski, _sala C1_",1,12:45,1h 30min,Sobota 10.10.2020_,66.36216
72,271.9616,"WdGM [w], dr hab. _A.Denisiuk_",1,8:00,1h 30min,Sobota 10.10.2020_,69.37716
73,271.9616,"WdGM [w], dr hab. _A.Denisiuk_",1,9:45,1h 30min,Sobota 10.10.2020_,69.37716
74,271.9616,"ASK [w], dr _J.Szubiakowski_",1,11:30,1h 30min,Sobota 10.10.2020_,54.88908
...,...,...,...,...,...,...,...
786,409.2616,"WdGM [ćw], dr inż. _M.Bodzioch_",17,9:45,1h 30min,Sobota 30.01.2021_,70.33392
787,409.2616,"ERI [w], dr inż. _B.Nowak_",17,11:45,1h 30min,Sobota 30.01.2021_,54.50316
788,409.2616,"ME [w], dr hab. _L.Błaszkiewicz_",17,13:15,1h 30min,Sobota 30.01.2021_,56.18352
789,409.2616,"ME [w], dr hab. _L.Błaszkiewicz_",17,15:00,1h 30min,Sobota 30.01.2021_,56.18352


In [165]:
df_tmp.to_csv('out2.csv')

In [143]:
df_tmp

Unnamed: 0,y,Przedmiot,Strona,Godzina rozpoczęcia,Duration,Week_day
54,404.1016,"ME [ćw], dr _M.Wojciechowski, _sala C1_",1,11:00,1h 30min,"38 Sobota 10.10.2020_ Name: Przedmiot, dtyp..."
55,404.1016,"ME [ćw], dr _M.Wojciechowski, _sala C1_",1,12:45,1h 30min,"38 Sobota 10.10.2020_ Name: Przedmiot, dtyp..."
72,271.9616,"WdGM [w], dr hab. _A.Denisiuk_",1,8:00,1h 30min,38 Sobota 10.10.2020_ 70 Niedziela 11...
73,271.9616,"WdGM [w], dr hab. _A.Denisiuk_",1,9:45,1h 30min,38 Sobota 10.10.2020_ 70 Niedziela 11...
74,271.9616,"ASK [w], dr _J.Szubiakowski_",1,11:30,1h 30min,38 Sobota 10.10.2020_ 70 Niedziela 11...
75,271.9616,"ASK [ćw], dr _J.Szubiakowski_",1,13:15,1h 30min,38 Sobota 10.10.2020_ 70 Niedziela 11...
76,277.1216,"WWW [ćw], mgr H.Pikus_",1,15:15,2h 15min,38 Sobota 10.10.2020_ 70 Niedziela 11...
99,414.4516,"IO [w], dr M.Kolev_",2,12:15,1h 30min,"82 Sobota 17.10.2020_ Name: Przedmiot, dtyp..."
100,409.2616,"IO [ćw], mgr inż. _M.Żużel_",2,14:00,1h 30min,"82 Sobota 17.10.2020_ Name: Przedmiot, dtyp..."
101,409.2616,"IO [ćw], mgr inż. _M.Żużel_",2,15:45,1h 30min,"82 Sobota 17.10.2020_ Name: Przedmiot, dtyp..."


In [150]:
df_week_day

Unnamed: 0,y,Przedmiot,Strona
1,488.5,Sobota 03.10.2020_,0
30,351.19,Niedziela 04.10.2020_,0
38,488.5,Sobota 10.10.2020_,1
70,351.19,Niedziela 11.10.2020_,1
82,488.5,Sobota 17.10.2020_,2
117,351.19,Niedziela 18.10.2020_,2
128,488.5,Sobota 24.10.2020_,3
165,351.19,Niedziela 25.10.2020_,3
180,488.5,Sobota 31.10.2020_,4
209,351.19,Niedziela 01.11.2020_,4


In [104]:
lookup_week_day(df_tmp.loc[121])

82        Sobota 17.10.2020_
117    Niedziela 18.10.2020_
Name: Przedmiot, dtype: object

In [110]:
df_week_day[df_week_day.Strona=="2"].iloc[0]

y                         488.5
Przedmiot    Sobota 17.10.2020_
Strona                        2
Name: 82, dtype: object

In [103]:
min(df_week_day[df_week_day.Strona=="2"].y)

351.19

In [None]:
df_week_day[df_week_day.Strona=="2"]

In [None]:
df_tmp.apply(lambda x: x)

In [84]:
df_week_day

Unnamed: 0,y,Przedmiot,Strona
1,488.5,Sobota 03.10.2020_,0
30,351.19,Niedziela 04.10.2020_,0
38,488.5,Sobota 10.10.2020_,1
70,351.19,Niedziela 11.10.2020_,1
82,488.5,Sobota 17.10.2020_,2
117,351.19,Niedziela 18.10.2020_,2
128,488.5,Sobota 24.10.2020_,3
165,351.19,Niedziela 25.10.2020_,3
180,488.5,Sobota 31.10.2020_,4
209,351.19,Niedziela 01.11.2020_,4


Unnamed: 0,y,Przedmiot,Strona
1,488.5,Sobota 03.10.2020_,0
30,351.19,Niedziela 04.10.2020_,0
38,488.5,Sobota 10.10.2020_,1
70,351.19,Niedziela 11.10.2020_,1
82,488.5,Sobota 17.10.2020_,2
117,351.19,Niedziela 18.10.2020_,2
128,488.5,Sobota 24.10.2020_,3
165,351.19,Niedziela 25.10.2020_,3
180,488.5,Sobota 31.10.2020_,4
209,351.19,Niedziela 01.11.2020_,4


In [None]:
df_tmp.to_csv('output1.csv', index=False)

In [None]:
"""
54 - 1.5h
104 = 2.5 h
60 -1.5h
69 - 1.5h
88 - 2h 15min
"""