# Process Raw datas into a single text file data

In [1]:
import os
os.chdir('../../')

In [2]:
import re
import hazm
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
all_names = []

In [4]:
class Normalizer:
    def __init__(self) -> None:
        self.hazm_normalizer = hazm.Normalizer()

    def normalize(self, text):
        text = re.sub(r'\([^)]*\)', '', text) # remove whatever inside parenthesis
        text = self.hazm_normalizer.normalize(text)
        text = text.replace('\u200c', ' ')
        text = text.replace('"', '')
        text = text.replace("'", '')
        text = re.sub('[0-9a-zA-Z]+', '', text)
        
        if self.is_valid(text):
            return text
        else:
            return None

    def is_valid(self, text):
        if len(text) <= 2:
            return False
        
        if len(text) > 10:
            return False
        
        return True

In [5]:
normalizer = Normalizer()

In [6]:
file_name = './Data/Raw/01.txt'
name_list = []


with open(file_name, 'r') as f:
    lines = f.readlines()
    for line in lines:
        name = line.split(":")[0]
        if normalizer.is_valid(name):
            normalized_name = normalizer.normalize(name)
            if normalized_name is not None:
                name_list.append(normalized_name)

all_names += name_list
print(len(name_list))

714


In [7]:
file_name = './Data/Raw/02.txt'
name_list = []


with open(file_name, 'r') as f:
    lines = f.readlines()
    for line in lines:
        name = line.split(":")[0]
        if normalizer.is_valid(name):
            normalized_name = normalizer.normalize(name)
            if normalized_name is not None:
                name_list.append(normalized_name)

all_names += name_list
print(len(name_list))

977


In [8]:
file_name = './Data/Raw/03.html'
name_list = []

with open(file_name, 'r') as f:
    soup = BeautifulSoup(f, 'html.parser')
    tr_list = soup.findAll('tr')
    
    for tr in tr_list:
        td_list = tr.findAll('td')
        if not td_list[0].find('h3'):
            name = td_list[0].get_text()
            if normalizer.is_valid(name):
                normalized_name = normalizer.normalize(name)
                if normalized_name is not None:
                    name_list.append(normalized_name)

all_names += name_list
print(len(name_list))

712


In [9]:
file_name = './Data/Raw/04.html'
name_list = []

with open(file_name, 'r') as f:
    soup = BeautifulSoup(f, 'html.parser')
    h4_list = soup.findAll("h4", class_='content_h4')
    for h4 in h4_list:
        name = h4.get_text()
        if normalizer.is_valid(name):
            normalized_name = normalizer.normalize(name)
            if normalized_name is not None:
                name_list.append(normalized_name)

all_names += name_list
print(len(name_list))

2801


In [10]:
file_name = './Data/Raw/05.html'
name_list = []

with open(file_name, 'r') as f:
    soup = BeautifulSoup(f, 'html.parser')
    h4_list = soup.findAll("cite", class_='fn')
    for h4 in h4_list:
        name = h4.get_text()
        if normalizer.is_valid(name):
            normalized_name = normalizer.normalize(name)
            if normalized_name is not None:
                name_list.append(normalized_name)

all_names += name_list
print(len(name_list))

29


In [11]:
file_name = './Data/Raw/06.html'
name_list = []

with open(file_name, 'r') as f:
    soup = BeautifulSoup(f, 'html.parser')
    tr_list = soup.findAll('tr')
    for tr in tr_list:
        td_list = tr.findAll('td')
        name = td_list[0].get_text()
        normalized_name = normalizer.normalize(name)
        if normalized_name is not None:
            name_list.append(normalized_name)
            
all_names += name_list
print(len(name_list))

476


In [12]:
file_name = './Data/Raw/07.csv'
name_list = []

data = pd.read_csv(file_name)
for naem in data['first_name'].to_list():
    normalized_name = normalizer.normalize(name)
    if normalized_name is not None:
        name_list.append(normalized_name)

all_names += name_list
print(len(name_list))

4055


In [13]:
file_name = './Data/Raw/08.txt'
name_list = []


with open(file_name, 'r') as f:
    lines = f.readlines()
    for line in lines:
        name = line.split(',')[1]
        if normalizer.is_valid(name):
            normalized_name = normalizer.normalize(name)
            if normalized_name is not None:
                name_list.append(normalized_name)

all_names += name_list
print(len(name_list))

5634


In [14]:
all_names = list(set(all_names))
all_names.sort()
len(all_names)

6621

In [15]:
file_name = './Data/Processed/names.txt'

with open(file_name, "w") as f:
    for name in all_names:    
        f.write(name)
        f.write('\n')