In [7]:
import os
import re
import ast
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [33]:
# Locations of the train / validation / test CSV splits, relative to the
# notebook's working directory.
# The base directory is assembled with os.path.join instead of a literal
# Windows-style '..\data\final' string: on POSIX systems a backslash is an
# ordinary filename character, so the literal would not resolve to the
# data/final folder. On Windows the joined value is identical to the old one.
BASE_PATH = os.path.join('..', 'data', 'final')
TRAIN_PATH = os.path.join(BASE_PATH, "train.csv")
VALID_PATH = os.path.join(BASE_PATH, "valid.csv")
TEST_PATH = os.path.join(BASE_PATH, "test.csv")

In [34]:
# Read the three CSV splits (train, then valid, then test) into DataFrames.
train_data, valid_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_PATH, VALID_PATH, TEST_PATH)
)

In [35]:
# Preview the first five rows of the training split.
train_data.head(n=5)

Unnamed: 0,id,tokens,ner_tags,num_tokens
0,0,"['ITEM', '1', 'Financial', 'Statements', 'Lenn...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['ITEM', '[NUM]', 'Financial', 'Statements', '..."
1,1,"['See', 'accompanying', 'notes', 'to', 'conden...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['See', 'accompanying', 'notes', 'to', 'conden..."
2,2,"['The', 'condensed', 'consolidated', 'financia...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['The', 'condensed', 'consolidated', 'financia..."
3,3,"['These', 'condensed', 'consolidated', 'financ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['These', 'condensed', 'consolidated', 'financ..."
4,4,"['The', 'condensed', 'consolidated', 'statemen...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['The', 'condensed', 'consolidated', 'statemen..."


In [36]:
# (rows, columns) of each split, in train / valid / test order.
tuple(frame.shape for frame in (train_data, valid_data, test_data))

((900384, 4), (112494, 4), (108378, 4))

In [37]:
# Print the column dtypes, non-null counts and memory usage of the training split.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900384 entries, 0 to 900383
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          900384 non-null  int64 
 1   tokens      900384 non-null  object
 2   ner_tags    900384 non-null  object
 3   num_tokens  900384 non-null  object
dtypes: int64(1), object(3)
memory usage: 27.5+ MB


In [38]:
# Check the Python type stored in 'num_tokens'.
# Label-based slicing with .loc is inclusive on both ends, so labels 0..5
# (six rows) are inspected here.
for sent in train_data.loc[0:5, 'num_tokens']:
    print(type(sent))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [13]:
# Lowercase the 'num_tokens' column of the training split. Each cell is a
# string holding a serialized token list, so the whole string is lowercased.
# NOTE(review): this also turns the '[NUM]' placeholder into '[num]' (compare
# the head() output before and after this step) — confirm that is intended.
train_data['num_tokens'] = train_data['num_tokens'].str.lower()

In [14]:
# Lowercase the 'num_tokens' column of the validation split.
# NOTE(review): assumes the same string-serialized token lists as the training
# split; any uppercase placeholder such as '[NUM]' would be lowercased too —
# confirm that is intended.
valid_data['num_tokens'] = valid_data['num_tokens'].str.lower()

In [15]:
# Lowercase the 'num_tokens' column of the test split.
# NOTE(review): assumes the same string-serialized token lists as the training
# split; any uppercase placeholder such as '[NUM]' would be lowercased too —
# confirm that is intended.
test_data['num_tokens'] = test_data['num_tokens'].str.lower()

In [16]:
# Re-inspect the first five training rows after 'num_tokens' was lowercased.
train_data.head(n=5)

Unnamed: 0,id,tokens,ner_tags,num_tokens
0,0,"['ITEM', '1', 'Financial', 'Statements', 'Lenn...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['item', '[num]', 'financial', 'statements', '..."
1,1,"['See', 'accompanying', 'notes', 'to', 'conden...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['see', 'accompanying', 'notes', 'to', 'conden..."
2,2,"['The', 'condensed', 'consolidated', 'financia...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['the', 'condensed', 'consolidated', 'financia..."
3,3,"['These', 'condensed', 'consolidated', 'financ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['these', 'condensed', 'consolidated', 'financ..."
4,4,"['The', 'condensed', 'consolidated', 'statemen...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['the', 'condensed', 'consolidated', 'statemen..."


In [24]:
# Print len() of the first five 'tokens' entries.
# NOTE(review): the column has dtype object and presumably holds the list as a
# string (like 'num_tokens'), in which case these are character counts of the
# serialized text, not token counts — confirm.
for sent in train_data['tokens'].head(5):
    print(len(sent))

2934
1706
406
311
278


In [25]:
# Print len() of the first five 'num_tokens' entries. The entries are strings,
# so each value is the character length of the serialized list rather than
# the number of tokens it contains.
for sent in train_data['num_tokens'].head(5):
    print(len(sent))

2974
1729
409
315
282


In [28]:
# Check the Python type stored in 'ner_tags' for the first five rows.
for sent in train_data['ner_tags'].head(5):
    print(type(sent))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
