In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import re
from io import StringIO

#Libraries for feature extraction and topic modeling
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

### Starting the PDF table extraction

In [None]:
!pip3 install camelot-py[cv] tabula-py

In [None]:
import camelot 
import tabula

In [None]:
!pip install pikepdf

In [None]:
import pikepdf
import pandas as pd
import numpy as np
#pdf = pikepdf.open(file,password='ABDPM5030G')
#pdf.save('book_without_pass.pdf')
import warnings
# Silences every warning category for the rest of the session. Note that this
# also hides pandas deprecation / chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')

In [None]:
def clean_df(my_table, header_rows=2, drop_cols=(0, 1, 3, 8, 12)):
    """Strip leading rows and unwanted columns from an extracted table.

    Parameters
    ----------
    my_table : pandas.DataFrame
        Raw table whose columns carry the integer labels listed in
        ``drop_cols``.
    header_rows : int, default 2
        Number of leading rows (by position) to discard.
        NOTE(review): presumably the repeated header rows of the contract
        note -- confirm against the PDF layout.
    drop_cols : iterable, default (0, 1, 3, 8, 12)
        Column labels to remove.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``my_table`` itself is left untouched.

    Raises
    ------
    KeyError
        If any label in ``drop_cols`` is not a column of ``my_table``.
    """
    # Slice and drop in one non-mutating chain instead of an in-place drop on
    # a slice, which is the classic chained-assignment pattern.
    return my_table.iloc[header_rows:].drop(columns=list(drop_cols))

In [None]:
def select_tables(ca_table, expected_cols=13):
    """Stack every extracted table with the expected width and clean the result.

    Parameters
    ----------
    ca_table : iterable
        Objects exposing a ``.df`` DataFrame attribute (the result of
        ``camelot.read_pdf`` in this notebook).
    expected_cols : int, default 13
        Only tables whose ``.df`` has exactly this many columns are kept.

    Returns
    -------
    pandas.DataFrame
        The matching tables concatenated in order and passed through
        ``clean_df``. Row labels of the individual tables are kept, so the
        index may contain duplicates.

    Raises
    ------
    ValueError
        If no table has ``expected_cols`` columns.
    """
    matching = [table.df for table in ca_table if table.df.shape[1] == expected_cols]
    if not matching:
        # Fail with an explicit message instead of the KeyError that cleaning
        # an empty frame would otherwise produce.
        raise ValueError(
            'no extracted table has %d columns; nothing to clean' % expected_cols
        )
    # A single concat avoids re-copying the accumulated frame for every table.
    return clean_df(pd.concat(matching))

In [None]:
def correct_txn(dataframe):
    """Spread multi-line ``transaction`` cells over the rows that follow them.

    A cell such as ``'B\\nS'`` at row ``k`` is split on newlines; the first
    piece is written back to row ``k`` and each further piece to rows
    ``k+1``, ``k+2``, ... of the ``transaction`` column.

    The running position from ``enumerate`` is used as a ``.loc`` row label,
    so the frame is expected to carry a default 0..n-1 index (as produced by
    ``reset_index``).
    NOTE(review): a piece addressed past the last row is written through
    ``.loc``, which appends a new row instead of failing -- confirm that is
    acceptable for the source data.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``transaction`` column; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object.
    """
    for index, trans in enumerate(dataframe.transaction):
        # Missing cells (NaN/None) are not strings; skip them instead of
        # letting the membership test raise TypeError.
        if not isinstance(trans, str) or '\n' not in trans:
            continue
        # Split once and reuse the pieces for every target row.
        for offset, piece in enumerate(trans.split('\n')):
            dataframe.loc[index + offset, 'transaction'] = piece
    return dataframe

In [None]:
def correct_qty(dataframe):
    """Spread multi-line ``qty`` cells over the rows that follow them.

    A cell such as ``'10\\n20'`` at row ``k`` is split on newlines; the first
    piece is written back to row ``k`` and each further piece to rows
    ``k+1``, ``k+2``, ... of the ``qty`` column.

    The running position from ``enumerate`` is used as a ``.loc`` row label,
    so the frame is expected to carry a default 0..n-1 index (as produced by
    ``reset_index``).
    NOTE(review): a piece addressed past the last row is written through
    ``.loc``, which appends a new row instead of failing -- confirm that is
    acceptable for the source data.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``qty`` column; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object.
    """
    for index, qty in enumerate(dataframe.qty):
        # Missing or already-numeric cells are not strings; skip them instead
        # of letting the membership test raise TypeError.
        if not isinstance(qty, str) or '\n' not in qty:
            continue
        # Split once and reuse the pieces for every target row.
        for offset, piece in enumerate(qty.split('\n')):
            dataframe.loc[index + offset, 'qty'] = piece
    return dataframe

In [None]:
# Takes the raw, password-protected contract-note PDF and returns the cleaned dataframe.
def get_data(file, password=None):
    """Decrypt a contract-note PDF, extract its tables and tidy them.

    Parameters
    ----------
    file : str or path-like
        Location of the password-protected PDF.
    password : str, optional
        Password for the PDF. When omitted, the ``CONTRACT_NOTE_PASSWORD``
        environment variable is used, falling back to the legacy built-in
        value so existing ``get_data(file)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``order_no, script, transaction, qty, price, price_d,
        price_close, PnL``; rows with an empty ``order_no`` removed; index
        reset to 0..n-1; ``qty`` and ``price`` cast to float. The remaining
        columns are left as extracted.

    Side effects
    ------------
    Writes the decrypted copy to ``book_without_pass.pdf`` in the current
    working directory, overwriting any previous copy.
    """
    if password is None:
        # NOTE(review): the literal fallback is a credential committed to the
        # notebook. Set CONTRACT_NOTE_PASSWORD (or a Kaggle secret) and remove
        # the literal once callers have migrated.
        password = os.environ.get('CONTRACT_NOTE_PASSWORD', 'ABDPM5030G')

    # The context manager closes the source PDF as soon as the copy is saved.
    with pikepdf.open(file, password=password) as pdf:
        pdf.save('book_without_pass.pdf')

    ca_tables = camelot.read_pdf('./book_without_pass.pdf', pages='all')
    my_data = select_tables(ca_tables)
    my_data.columns = ['order_no', 'script', 'transaction',
                       'qty', 'price', 'price_d', 'price_close', 'PnL']

    # Drop rows without an order number and renumber the rows; correct_qty
    # addresses rows by their 0..n-1 position.
    my_data = my_data[my_data.order_no != ''].reset_index(drop=True)
    my_data = correct_qty(my_data)
    my_data['qty'] = my_data['qty'].astype('float')
    my_data['price'] = my_data['price'].astype('float')
    # Try to keep the raw data as much as possible, to avoid losing control.
    return my_data

In [None]:
# Parse the 2022-03-08 contract note into a cleaned DataFrame and display it.
my_table = get_data('../input/contract-notes/2022-03-08-contract-notes_DM3955.pdf')
my_table

In [None]:
# Important: check the distinct transaction labels before relying on them --
# the extracted column can contain unexpected values.
my_table.transaction.unique()

In [None]:
def todays_forward(carry_forward, day_df):
    """Net the day's trades per script and join them with the carried-forward rows.

    Parameters
    ----------
    carry_forward : pandas.DataFrame
        Carried-forward rows; must provide ``script``, ``qty`` and the columns
        ``order_no, price, price_d, price_close, PnL`` (dropped from the result).
    day_df : pandas.DataFrame
        The day's trades with ``script``, ``transaction`` and numeric ``qty``.

    Returns
    -------
    pandas.DataFrame
        One row per (traded script, matching carry-forward row) with columns
        ``script, txn_type, txn_qty`` followed by the remaining carry-forward
        columns and ``remaining_qty = |txn_qty - qty|``. Scripts without a
        carry-forward row keep NaN in the carry-forward columns and in
        ``remaining_qty`` (left join).

    Notes
    -----
    The summed quantities are compared by position within each script: the
    first group against the last, in the sorted order of the ``transaction``
    labels produced by ``groupby``. The first is labelled ``'B'``, the last
    ``'S'``.
    NOTE(review): a script traded in one direction only therefore yields
    ``('S', 0)`` because first and last are the same group -- confirm whether
    the full quantity should be carried instead.
    """
    trade_count = day_df.groupby(['script', 'transaction'])['qty'].sum().reset_index()

    # Cancel out the trades of the day, script by script.
    traded_scripts = trade_count['script'].unique()
    transaction = []
    txn_qty = []
    for script in traded_scripts:
        legs = trade_count.loc[trade_count['script'] == script, 'qty'].values
        first_qty, last_qty = legs[0], legs[-1]
        if first_qty > last_qty:
            transaction.append('B')
            txn_qty.append(first_qty - last_qty)
        else:
            transaction.append('S')
            txn_qty.append(last_qty - first_qty)

    # Pending trades of the day.
    traded_df = pd.DataFrame({'script': traded_scripts,
                              'txn_type': transaction,
                              'txn_qty': txn_qty})

    # Merge against the carry_forward argument. The earlier code read the
    # notebook-level `carry_f` global here, ignoring whatever the caller passed.
    final_df = traded_df.merge(right=carry_forward, on='script', how='left')
    final_df['remaining_qty'] = (final_df['txn_qty'] - final_df['qty']).abs()
    return final_df.drop(columns=['order_no', 'price', 'price_d', 'price_close', 'PnL'])

In [None]:
# Pick the trailing rows of one script / transaction type.
def locate_script(day_df, script, qty, transaction):
    """Return the last ``qty`` rows of ``day_df`` for a script and transaction type.

    Parameters
    ----------
    day_df : pandas.DataFrame
        Trades with ``script`` and ``transaction`` columns.
    script, transaction :
        Values that both columns must equal for a row to be considered.
    qty : int or float
        Number of trailing matching rows to return. Floats are truncated to
        an integer because positional slicing rejects float bounds.

    Returns
    -------
    pandas.DataFrame
        At most ``qty`` rows from the end of the matching rows; an empty
        frame with the same columns when ``qty`` is NaN, zero or negative.
    """
    locating = day_df[(day_df.script == script) & (day_df.transaction == transaction)]
    if pd.isna(qty):
        return locating.iloc[0:0]
    row_count = int(qty)
    # `iloc[-0:]` would select every row, so non-positive counts are handled
    # explicitly.
    if row_count <= 0:
        return locating.iloc[0:0]
    return locating.iloc[-row_count:, :]

In [None]:
# Separate the carried-forward rows: those whose order_no is the string '0'.
carry_f = my_table[my_table.order_no == '0']
carry_f

In [None]:
# Every other row (order_no different from '0') is treated as a trade of the day.
day_df = my_table[my_table.order_no != '0']
day_df

In [None]:
# Net the day's trades per script and join them with the carried-forward rows.
today_forward = todays_forward(carry_forward=carry_f,day_df=day_df)
today_forward

### Now we know which scripts are finally pending at the day's end

Next, locate the final trades of the day for each of those scripts.

In [None]:
# Collect the trailing trades of every script that still has a pending quantity.
# The pieces are gathered in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop re-copies it each time.
located_trades = []
for script in today_forward.loc[today_forward.remaining_qty != 0,'script']:
    script_rows = today_forward.loc[today_forward.script == script]
    qty_tofun = script_rows['remaining_qty'].values[0]
    txn_tofun = script_rows['transaction'].values[0]
    temp = locate_script(day_df=day_df,script=script,qty=qty_tofun,transaction=txn_tofun)
    located_trades.append(temp)
# With nothing pending, keep day_df's columns so the next cell can still
# address price / price_close on the empty frame.
script_dataframe = pd.concat(located_trades) if located_trades else day_df.iloc[0:0].copy()

In [None]:
# Cast both price columns to float, then derive PnL as traded price minus closing price.
for numeric_col in ('price', 'price_close'):
    script_dataframe[numeric_col] = script_dataframe[numeric_col].astype('float')
script_dataframe['PnL'] = script_dataframe['price'] - script_dataframe['price_close']
script_dataframe

In [None]:
def closing_scripts(file):
    """Run the whole pipeline for one contract-note PDF.

    Steps: parse the PDF (``get_data``), split multi-line transaction cells
    (``correct_txn``), separate carried-forward rows (``order_no == '0'``)
    from the day's trades, net the day per script (``todays_forward``), pick
    the trailing trades of every script whose ``remaining_qty`` is non-zero
    (``locate_script``) and compute ``PnL = price - price_close`` for them.

    Parameters
    ----------
    file : str or path-like
        Location of the contract-note PDF.

    Returns
    -------
    pandas.DataFrame
        The located trades with ``price`` and ``price_close`` as float and a
        ``PnL`` column; an empty frame with the day's columns when no script
        is pending.
    """
    my_table = get_data(file)

    my_table = correct_txn(my_table)  # correct the transaction errors
    print('corrected transactions')

    carry_f = my_table[my_table.order_no == '0']  # carried-forward rows
    day_df = my_table[my_table.order_no != '0']   # the day's trades

    today_forward = todays_forward(carry_forward=carry_f, day_df=day_df)
    print('got today forwarded table')

    # Locate the final trades of the day for every script still pending.
    # Pieces are gathered in a list and concatenated once; DataFrame.append
    # was removed in pandas 2.0.
    located_trades = []
    for script in today_forward.loc[today_forward.remaining_qty != 0, 'script']:
        script_rows = today_forward.loc[today_forward.script == script]
        qty_tofun = script_rows['remaining_qty'].values[0]
        txn_tofun = script_rows['transaction'].values[0]
        located_trades.append(
            locate_script(day_df=day_df, script=script, qty=qty_tofun, transaction=txn_tofun)
        )
    if located_trades:
        script_dataframe = pd.concat(located_trades)
    else:
        # Nothing pending: keep the columns so the casts below still work.
        script_dataframe = day_df.iloc[0:0].copy()
    print('Got the script data frame for final processing')

    # Final leg of the calculation for the trades being carried forward.
    script_dataframe['price'] = script_dataframe['price'].astype('float')
    script_dataframe['price_close'] = script_dataframe['price_close'].astype('float')
    script_dataframe['PnL'] = script_dataframe['price'] - script_dataframe['price_close']
    print('returning the solution')
    return script_dataframe

In [None]:
# Re-bind my_table to the parsed 2022-03-11 contract note; this replaces the
# table loaded from the 2022-03-08 note in an earlier cell.
my_table = get_data('../input/contract-notes/2022-03-11-contract-notes_DM3955.pdf')
my_table

In [None]:
# Decrypt the 2022-03-14 contract note and extract its tables step by step.
# NOTE(review): the password is a credential hard-coded in the notebook; move it
# to an environment variable or Kaggle secret.
with pikepdf.open('../input/contract-notes/2022-03-14-contract-notes_DM3955.pdf',
                  password='ABDPM5030G') as pdf:
    # The context manager closes the source PDF once the decrypted copy is saved.
    pdf.save('book_without_pass.pdf')
ca_tables = camelot.read_pdf('./book_without_pass.pdf',pages='all')
my_data = select_tables(ca_tables)

In [None]:
import os
# Print the path of every PDF in the contract-notes input folder.
directory = '../input/contract-notes'
for filename in os.listdir(directory):
    if not filename.endswith(".pdf"):
        continue
    print(os.path.join(directory, filename))

In [None]:
# Display my_data, the cleaned table assigned from select_tables(...) in an earlier cell.
my_data