In [55]:
import re
import pdfplumber
import pandas as pd

In [56]:
pdf_file = "ICOEN.pdf"

In [57]:
data = []

In [58]:
# Open the PDF
with pdfplumber.open(pdf_file) as pdf:
    for page in pdf.pages:
        # Extract tables from the page
        tables = page.extract_tables()
        for table in tables:
            for row in table:
                # Filter out empty rows and ensure there are 2 columns
                if len(row) >= 2 and row[0] and row[1]:
                    data.append([row[0].strip(), row[1].strip()])

# Convert to DataFrame with proper column names
df = pd.DataFrame(data, columns=["Nama", "NIK/NIS"])

df.head()

Unnamed: 0,Nama,NIK/NIS
0,Nama,NIK / NIS
1,Frances Tuesday Whyte (Student S1 IBM),0106012110100
2,"Yuli Kartika Dewi, S.E., M.M., CIMA. (IBM Lect...",20170009
3,Andrew Wibisono (Student S1 IBM),0106012210106
4,Carissa Belluci (Student S1 IBM),0106012210247


In [59]:
df = df.iloc[1:].reset_index(drop=True)
df.head()

Unnamed: 0,Nama,NIK/NIS
0,Frances Tuesday Whyte (Student S1 IBM),106012110100
1,"Yuli Kartika Dewi, S.E., M.M., CIMA. (IBM Lect...",20170009
2,Andrew Wibisono (Student S1 IBM),106012210106
3,Carissa Belluci (Student S1 IBM),106012210247
4,Beatrice Michelle Valerie W. (Student S1),870230174


In [60]:
df = df[~df["Nama"].str.contains("Lecturer", case=False, na=False)].reset_index(drop=True)
df.head()

Unnamed: 0,Nama,NIK/NIS
0,Frances Tuesday Whyte (Student S1 IBM),106012110100
1,Andrew Wibisono (Student S1 IBM),106012210106
2,Carissa Belluci (Student S1 IBM),106012210247
3,Beatrice Michelle Valerie W. (Student S1),870230174
4,Attalia Salwa Sabita Yulianto (Student S1 PSY),306012310031


In [61]:
desired_length = len("0106012210247")
df = df[df["NIK/NIS"].apply(lambda x: len(x) == desired_length if isinstance(x, str) else False)]
df.head()

Unnamed: 0,Nama,NIK/NIS
0,Frances Tuesday Whyte (Student S1 IBM),106012110100
1,Andrew Wibisono (Student S1 IBM),106012210106
2,Carissa Belluci (Student S1 IBM),106012210247
4,Attalia Salwa Sabita Yulianto (Student S1 PSY),306012310031
5,Aurel Sukma Ratih (Student S1 PSY),306012310049


In [62]:
df["Nama"] = df["Nama"].apply(lambda x: re.sub(r"\(.*?\)", "", x).strip())
df.head()

Unnamed: 0,Nama,NIK/NIS
0,Frances Tuesday Whyte,106012110100
1,Andrew Wibisono,106012210106
2,Carissa Belluci,106012210247
4,Attalia Salwa Sabita Yulianto,306012310031
5,Aurel Sukma Ratih,306012310049


In [63]:
df = df[["NIK/NIS", "Nama"]]
df.head()

Unnamed: 0,NIK/NIS,Nama
0,106012110100,Frances Tuesday Whyte
1,106012210106,Andrew Wibisono
2,106012210247,Carissa Belluci
4,306012310031,Attalia Salwa Sabita Yulianto
5,306012310049,Aurel Sukma Ratih


In [64]:
df["CP (kosongkan jika default sesuai master CP)"] = 20
df.head()

Unnamed: 0,NIK/NIS,Nama,CP (kosongkan jika default sesuai master CP)
0,106012110100,Frances Tuesday Whyte,20
1,106012210106,Andrew Wibisono,20
2,106012210247,Carissa Belluci,20
4,306012310031,Attalia Salwa Sabita Yulianto,20
5,306012310049,Aurel Sukma Ratih,20


In [65]:
output_file = "ICOEN.xlsx"
df.to_excel(output_file, index=False)