In [138]:
import os
import re
import ast
import numpy as np
import pandas as pd
import tensorflow

In [None]:
# Location of the raw dataset splits. The path is assembled with os.path.join
# so the separator matches the host OS; a literal r'..\data\raw' only resolves
# on Windows.
BASE_PATH = os.path.join("..", "data", "raw")
TRAIN_PATH = os.path.join(BASE_PATH, "train.csv")
VALID_PATH = os.path.join(BASE_PATH, "validation.csv")
TEST_PATH = os.path.join(BASE_PATH, "test.csv")

In [140]:
# Read the train / validation / test CSV files (in that order) into DataFrames.
train_data, valid_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_PATH, VALID_PATH, TEST_PATH)
)

In [141]:
# (rows, columns) of the train, validation and test frames, in that order.
tuple(frame.shape for frame in (train_data, valid_data, test_data))

((900384, 3), (112494, 3), (108378, 3))

In [None]:
# The 'ner_tags' column comes out of read_csv as text; turn each cell back
# into a Python list. Cells that are already lists are passed through, so
# re-running this cell is a no-op instead of a ValueError from
# ast.literal_eval (which only accepts strings / AST nodes). Any other
# non-string value (e.g. NaN) still raises, exactly as before.
for frame in (train_data, valid_data, test_data):
    frame['ner_tags'] = frame['ner_tags'].apply(
        lambda cell: cell if isinstance(cell, list) else ast.literal_eval(cell)
    )

In [None]:
# The 'tokens' column comes out of read_csv as text; turn each cell back
# into a Python list. Cells that are already lists are passed through, so
# re-running this cell is a no-op instead of a ValueError from
# ast.literal_eval (which only accepts strings / AST nodes). Any other
# non-string value (e.g. NaN) still raises, exactly as before.
for frame in (train_data, valid_data, test_data):
    frame['tokens'] = frame['tokens'].apply(
        lambda cell: cell if isinstance(cell, list) else ast.literal_eval(cell)
    )

In [None]:
# Compare the number of tokens with the number of tags in the row labelled 0.
first_row = train_data.loc[0]
len(first_row['tokens']), len(first_row['ner_tags'])

In [102]:
# Token list of the training row labelled 0.
tokens = train_data.loc[0, 'tokens']

In [130]:
# Compiled once instead of rebuilding the pattern list for every token.
# No ^/$ anchors are needed because is_number uses fullmatch.
_NUMBER_PATTERNS = (
    re.compile(r"\d+(\.\d{1,9})?"),           # 1, 1.1, 1.12, ... up to 9 decimal places
    re.compile(r"\d{1,3}(,\d{3})*(\.\d+)?"),  # 1,123, 1,123.12, 12,123.12, 123,123,123.123, ...
)


def is_number(token):
    """Tell whether ``token`` is written as a plain or comma-grouped number.

    Accepted forms:
      * digits with an optional fraction of 1-9 digits ("1", "810", "1.12");
      * 1-3 leading digits followed by comma-separated groups of three digits
        and an optional fraction of any length ("1,123", "123,123,123.123").

    The whole token has to match. ``fullmatch`` is used rather than
    ``match`` + ``$`` because ``$`` also matches just before a trailing
    newline, which would let a token such as "123\\n" through.

    Args:
        token: The string to test.

    Returns:
        True if the token matches one of the number formats, otherwise False.
    """
    return any(pattern.fullmatch(token) for pattern in _NUMBER_PATTERNS)

def replace_with_num(tokens):
    """Return a new list mirroring ``tokens`` with numeric tokens masked.

    Every token for which ``is_number`` is true becomes the placeholder
    "[NUM]"; all other tokens are carried over unchanged and in order.
    """
    masked = []
    for token in tokens:
        if is_number(token):
            masked.append("[NUM]")
        else:
            masked.append(token)
    return masked

In [131]:
# Add a 'num_tokens' column to the training split: each row's token list
# passed through replace_with_num.
train_data['num_tokens'] = train_data['tokens'].apply(replace_with_num)

In [132]:
# Add a 'num_tokens' column to the validation split: each row's token list
# passed through replace_with_num.
valid_data['num_tokens'] = valid_data['tokens'].apply(replace_with_num)

In [133]:
# Add a 'num_tokens' column to the test split: each row's token list
# passed through replace_with_num.
test_data['num_tokens'] = test_data['tokens'].apply(replace_with_num)

In [134]:
# Preview the first rows of the test split, including the 'num_tokens' column.
test_data.head()

Unnamed: 0,id,tokens,ner_tags,num_tokens
0,1012878,"[The, changes, in, the, fair, value, of, the, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[The, changes, in, the, fair, value, of, the, ..."
1,1012879,"[Fair, Values, Financial, Assets, and, Financi...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Fair, Values, Financial, Assets, and, Financi..."
2,1012880,"[23, Table, of, Contents, AMERICAN, EXPRESS, C...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[NUM], Table, of, Contents, AMERICAN, EXPRESS..."
3,1012881,"[The, fair, values, of, these, financial, inst...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[The, fair, values, of, these, financial, inst..."
4,1012882,"[(, b, ), Level, 1, amounts, reflect, interest...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(, b, ), Level, [NUM], amounts, reflect, inte..."


In [135]:
# Output folder for the processed splits. Built with os.path.join so the
# separator matches the host OS; a literal r"..\data\final\train.csv" is
# treated as a single file name on non-Windows systems.
final_dir = os.path.join("..", "data", "final")
path1 = os.path.join(final_dir, "train.csv")
path2 = os.path.join(final_dir, "test.csv")
path3 = os.path.join(final_dir, "valid.csv")

# DataFrame.to_csv does not create missing parent folders, so make sure the
# destination exists before writing.
os.makedirs(final_dir, exist_ok=True)

# Write each split without the index column. List-valued columns are stored
# as text in CSV, so they need ast.literal_eval again when reloaded.
train_data.to_csv(path1, index=False)
test_data.to_csv(path2, index=False)
valid_data.to_csv(path3, index=False)


In [136]:
# Preview the first rows of the test split again, after the CSV export.
test_data.head()

Unnamed: 0,id,tokens,ner_tags,num_tokens
0,1012878,"[The, changes, in, the, fair, value, of, the, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[The, changes, in, the, fair, value, of, the, ..."
1,1012879,"[Fair, Values, Financial, Assets, and, Financi...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Fair, Values, Financial, Assets, and, Financi..."
2,1012880,"[23, Table, of, Contents, AMERICAN, EXPRESS, C...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[NUM], Table, of, Contents, AMERICAN, EXPRESS..."
3,1012881,"[The, fair, values, of, these, financial, inst...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[The, fair, values, of, these, financial, inst..."
4,1012882,"[(, b, ), Level, 1, amounts, reflect, interest...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(, b, ), Level, [NUM], amounts, reflect, inte..."


In [137]:
# Display the full token list of the first training row.
train_data['tokens'].iloc[0]

['ITEM',
 '1',
 'Financial',
 'Statements',
 'Lennar',
 'Corporation',
 'and',
 'Subsidiaries',
 'Condensed',
 'Consolidated',
 'Balance',
 'Sheets',
 '(',
 'Dollars',
 'in',
 'thousands',
 ',',
 'except',
 'shares',
 'and',
 'per',
 'share',
 'amounts',
 ')',
 '(',
 'unaudited',
 ')',
 '(',
 '1',
 ')',
 'Under',
 'certain',
 'provisions',
 'of',
 'Accounting',
 'Standards',
 'Codification',
 '(',
 '“',
 'ASC',
 '”',
 ')',
 'Topic',
 '810',
 ',',
 'Consolidations',
 ',',
 '(',
 '“',
 'ASC',
 '810',
 '”',
 ')',
 'the',
 'Company',
 'is',
 'required',
 'to',
 'separately',
 'disclose',
 'on',
 'its',
 'condensed',
 'consolidated',
 'balance',
 'sheets',
 'the',
 'assets',
 'owned',
 'by',
 'consolidated',
 'variable',
 'interest',
 'entities',
 '(',
 '“',
 'VIEs',
 '”',
 ')',
 'and',
 'liabilities',
 'of',
 'consolidated',
 'VIEs',
 'as',
 'to',
 'which',
 'neither',
 'Lennar',
 'Corporation',
 ',',
 'or',
 'any',
 'of',
 'its',
 'subsidiaries',
 ',',
 'has',
 'any',
 'obligations',
 '.'

In [118]:
# from transformers import BertTokenizer, TFBertForTokenClassification
# import numpy as np

# model_name = "bert-base-uncased"
# model = TFBertForTokenClassification.from_pretrained(model_name)
# tokenizer = BertTokenizer.from_pretrained(model_name)

In [119]:
# train_data.iloc[0]['tokens']

In [120]:
# ''.join(train_data.iloc[0]['tokens'])

In [121]:
# len(train_data.iloc[0]['tokens']), len(train_data.iloc[0]['ner_tags'])

In [122]:
# s = train_data.iloc[0]['tokens']

In [123]:
# lst = ast.literal_eval(s)

In [124]:
# ' '.join(lst)

In [125]:
# len(lst)

In [126]:
# len(lst1)