In [29]:
import numpy as np
import pandas as pd
import string
import re

In [30]:
with open('businessCard.txt',mode='r',encoding='utf8',errors='ignore') as f:
    text = f.read()

In [31]:
data = list(map(lambda x:x.split('\t'),text.split('\n')))

In [32]:
df = pd.DataFrame(data[1:],columns=data[0])

In [33]:
df.head(10)

Unnamed: 0,id,text,tag
0,001.jpeg,CERTIFICATE,B-TYP
1,001.jpeg,INTERNSHIP,I-TYP
2,001.jpeg,We,B-DES
3,001.jpeg,present,I-DES
4,001.jpeg,this,I-DES
5,001.jpeg,certificate,I-DES
6,001.jpeg,to,I-DES
7,001.jpeg,Harumi,B-STU
8,001.jpeg,Kobayashi,I-STU
9,001.jpeg,in,B-DES


### Cleaning Text
- Remove white space
- Remove Unwanted special characters

In [34]:
whitespace = string.whitespace
punctuation = "!#$%&\'()*+:;<=>?[\\]^`{|}~"
tableWhitespace = str.maketrans('','',whitespace)
tablePunctuation = str.maketrans('','',punctuation)
def cleanText(txt):
    text = str(txt)
    text = text.lower()
    removewhitespace = text.translate(tableWhitespace)
    removepunctuation = removewhitespace.translate(tablePunctuation)
    
    return str(removepunctuation)


In [35]:
df['text'] = df['text'].apply(cleanText)

In [36]:
dataClean = df.query("text != '' ")
dataClean.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataClean.dropna(inplace=True)


In [37]:
dataClean.head(10)

Unnamed: 0,id,text,tag
0,001.jpeg,certificate,B-TYP
1,001.jpeg,internship,I-TYP
2,001.jpeg,we,B-DES
3,001.jpeg,present,I-DES
4,001.jpeg,this,I-DES
5,001.jpeg,certificate,I-DES
6,001.jpeg,to,I-DES
7,001.jpeg,harumi,B-STU
8,001.jpeg,kobayashi,I-STU
9,001.jpeg,in,B-DES


### Convert Data into Spacy Format

In [38]:
group = dataClean.groupby(by='id')

In [39]:
cards = group.groups.keys()

In [40]:
allCardsData = []
for card in cards:
    cardData = []
    grouparray = group.get_group(card)[['text','tag']].values
    content = ''
    annotations = {'entities':[]}
    start = 0
    end = 0
    for text, label in grouparray:
        text = str(text)
        stringLength = len(text) + 1

        start = end
        end = start + stringLength

        if label != 'O':
            annot = (start,end-1,label)
            annotations['entities'].append(annot)

        content = content + text + ' '
        
        
    cardData = (content,annotations)
    allCardsData.append(cardData)

In [41]:
allCardsData

[('certificate internship we present this certificate to harumi kobayashi in appreciation for your successful work as an intern at studio shodwe. the internship was conducted between il "july," 2023 and 11 "september," 2023 20 "september," 2023 director ',
  {'entities': [(0, 11, 'B-TYP'),
    (12, 22, 'I-TYP'),
    (23, 25, 'B-DES'),
    (26, 33, 'I-DES'),
    (34, 38, 'I-DES'),
    (39, 50, 'I-DES'),
    (51, 53, 'I-DES'),
    (54, 60, 'B-STU'),
    (61, 70, 'I-STU'),
    (71, 73, 'B-DES'),
    (74, 86, 'I-DES'),
    (87, 90, 'I-DES'),
    (91, 95, 'I-DES'),
    (96, 106, 'I-DES'),
    (107, 111, 'I-DES'),
    (112, 114, 'I-DES'),
    (115, 117, 'I-DES'),
    (118, 124, 'I-DES'),
    (125, 127, 'I-DES'),
    (128, 134, 'B-COI'),
    (135, 142, 'I-COI'),
    (143, 146, 'B-DES'),
    (147, 157, 'I-DES'),
    (158, 161, 'I-DES'),
    (162, 171, 'I-DES'),
    (172, 179, 'I-DES'),
    (180, 182, 'B-DAT'),
    (183, 190, 'I-DAT'),
    (191, 195, 'I-DAT'),
    (196, 199, 'I-DAT'),
    (200,

In [42]:
card_data_df = pd.DataFrame(allCardsData,columns=['text','labels'])
card_data_df['isNull'] = card_data_df['labels'].apply(lambda x: 'Null' if len(x['entities']) ==0 
                                                      else 'Clean')

#### Null entries need to drop

In [43]:
card_data_df.query('isNull == "Null"')

Unnamed: 0,text,labels,isNull
159,ielts test report form academic note to underg...,{'entities': []},Null


#### Consilder only clean data

In [44]:
card_data_df.dropna(inplace=True)
clean_data = card_data_df.query('isNull == "Clean"')[['text','labels']]

In [45]:
allCardsData = list(map(lambda x: tuple(x), clean_data.values.tolist()))

## Split the Data into Training and Testing Set

In [46]:
import random

In [47]:
random.shuffle(allCardsData)

In [48]:
len(allCardsData)

216

In [54]:
TrainData = allCardsData[:180]
TestData = allCardsData[180:]

In [55]:
import pickle

In [56]:
pickle.dump(TrainData,open('data/TrainData.pickle',mode='wb'))
pickle.dump(TestData,open('data/TestData.pickle',mode='wb'))