In [35]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time

## Extracting from a website

In [15]:
BASE_URL = 'http://tamilcube.com/tamil-baby-names/'

In [33]:
def get_names_from_page(driver):
    names = []
    try:
        table = driver.find_element(By.XPATH, "/html/body/div[3]/table/tbody/tr/td[2]/form/table[2]/tbody/tr[2]/td/table")
        rows = table.find_elements(By.XPATH, ".//tbody/tr")[1:]  # Skip the header row

        for row in rows:
            try:
                span = row.find_element(By.XPATH, ".//span[@id]")
                name = span.text.strip()
                if name:  # Ensure the name is not empty
                    print(name)
                    names.append(name)
            except Exception as e:
                print(f"Error extracting name from row {row}: {e}")
    except Exception as e:
        print(f"Table not found or error: {e}")

    return names

def scrape_names(driver, alphabet, gender):
    all_names = set()
    
    # Construct the URL with the correct alphabet and gender
    url = f"{BASE_URL}?baby={gender}&term={alphabet}"
    driver.get(url)
    
    last_page_names = []

    while True:
        # Wait until the table is present
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "/html/body/div[3]/table/tbody/tr/td[2]/form/table[2]/tbody/tr[2]/td/table"))
            )
        except TimeoutException:
            print("Table not found on this page. Moving to the next page.")
            break  # Exit loop if the table is not found within the timeout

        # Get names from the current page
        current_page_names = get_names_from_page(driver)
        
        if not current_page_names:
            print(f"No names found for '{alphabet}' starting with '{gender}'")
            break
        
        # Add names to the set
        all_names.update(current_page_names)

        # Check if the current page names are the same as the last page names
        if set(current_page_names) == set(last_page_names):
            print("No new content detected. Stopping.")
            break

        # Update last_page_names for the next iteration
        last_page_names = current_page_names

        # Try to find the "Next" button and click it, if present
        try:
            next_buttons = driver.find_elements(By.CLASS_NAME, "btn-primary")
            next_button = None
            for button in next_buttons:
                if button.text.strip() == ">":
                    next_button = button
                    break
            
            if next_button:
                next_button.click()
                time.sleep(2)  # Wait for the page to load
            else:
                print("Next button not found or no more pages.")
                break
        except Exception as e:
            print(f"Error finding next button or no more pages: {e}")
            break

    return all_names

In [24]:
# Initialize WebDriver (ensure chromedriver is in your PATH)
CHROME_DRIVER_PATH = '/snap/bin/chromium.chromedriver'
options = webdriver.ChromeOptions()
service = ChromeService(executable_path=CHROME_DRIVER_PATH)
driver = webdriver.Chrome(service=service, options=options)

alphabets = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
genders = {"b": "boy", "g": "girl"}
all_names = []

try:
    for alphabet in alphabets:
        for gender, gender_name in genders.items():
            print(
                f"Scraping names starting with '{alphabet}' for {gender_name}s..."
            )
            names = scrape_names(driver, alphabet, gender)
            all_names.extend(names)
finally:
    driver.quit()

Scraping names starting with 'A' for boys...
Aabha
Aabharana
AadalAlagi
AadalArasi
AadalChelvi
AadalNangai
Aadarshini
Aadhya
Aadita
Aadrika
AaduMayil
Aahlaadita
Aahna
Aakanksha
Aakriti
Aaloka
Aamaal
Aamani
Aamrapali
Aanandamayee
Aanandi
Aanandita
Aanchal
Aaniya
Aapti
Aaranam
Aaratrika
Aarika
Aarini
Aarna
Aarohi
Aarti
Aarushi
Aasha
Aashalata
Aashi
Aashiyana
Aashna
Aashni
Aashritha
Aasia
Aastha
Aathmika
Aatmaja
AatralArasi
Aavani
Aavirai
Aayushi
Abha
Abha
Abhaya
Abhaya
Abhijaata
Abhijita
Abhilasha
Abhilasha
Abhimaaninee
Abhipsa
Abhira
Abhirami
Abhiri
Abhiruchi
Abhirupa
Abhisarikaa
Abhitha
Abirami
Abishta
Ablaa
Aboli
Achala
Achala
Achira
Achit
Achla
Adhira
Adhira
Adhishree
Adi-Shakti
Aditi
Aditi
Aditri
Aditri
Adrika
Adrisa
Advika
Adwita
Adwiteya
Adwitiya
Adya
Aesha
Agalvili
Agamya
Aganagai
Aghanashini
Agnishikha
Agrata
Agrima
Agriya
Ahalya
Ahila
Ahimsa
Ahladita
Ahladita
Aishani
Aishi
Aishwarya
Aja
Ajaa
Ajagandha
Ajala
Ajala
Ajastha
Ajay
Ajita
Akane
Akanksha
Akanksha
Akashleena
Akhila
Akir

TimeoutException: Message: 
Stacktrace:
#0 0x5dc544852e5a <unknown>
#1 0x5dc54453a800 <unknown>
#2 0x5dc5445890ab <unknown>
#3 0x5dc544589391 <unknown>
#4 0x5dc5445cc9f4 <unknown>
#5 0x5dc5445abbcd <unknown>
#6 0x5dc5445c9de3 <unknown>
#7 0x5dc5445ab943 <unknown>
#8 0x5dc54457c2a3 <unknown>
#9 0x5dc54457ccde <unknown>
#10 0x5dc54481babf <unknown>
#11 0x5dc54481fb96 <unknown>
#12 0x5dc544809b87 <unknown>
#13 0x5dc544820321 <unknown>
#14 0x5dc5447f15ae <unknown>
#15 0x5dc5448416e8 <unknown>
#16 0x5dc5448418f6 <unknown>
#17 0x5dc544851a61 <unknown>
#18 0x71b60bdf8ac3 <unknown>


In [37]:
len(all_names)

7182

In [38]:
# Save the scraped data to a text file
with open("Files/baby_names.txt", "w") as file:
    for name in all_names:
        file.write(name + "\n")

print("Scraping complete. Data saved to 'baby_names.txt'.")

Scraping complete. Data saved to 'baby_names.txt'.


## Extracting from the pdfs

In [None]:
# importing required classes 
from pypdf import PdfReader 
  
# creating a pdf reader object 
reader = PdfReader('Files/indianNames2.pdf') 
  
# creating a page object 
page = reader.pages[60] 
  
# extracting text from page
pdfNames = list() 
for i in range(0, len(reader.pages)): 
    page = reader.pages[i]
    s = page.extract_text()[28:].splitlines()
    for i in s:
        if '=' in i:
            pdfNames.append(i.split('=')[0])
    
pdfNames = list(set(pdfNames))
pdfNames
# all_names.extend(pdfNames)
# s = page.extract_text()[28:].splitlines()
# for i in s:
#     if '=' in i:
#         print(i.split('=')[0])

In [57]:
len(pdfNames)

6449

In [58]:
all_names.extend(pdfNames)
all_names = list(set(all_names))
len(all_names)

9901

### Adding names from a second file

In [68]:
# creating a pdf reader object 
reader = PdfReader('Files/indianNames1.pdf') 
  
# creating a page object 
page = reader.pages[0] 

pdfNames = list() 
for i in range(0, len(reader.pages)): 
    page = reader.pages[i]
    s = page.extract_text().splitlines()[1:]
    for i in s:
        if " " in i and i[0].isalpha():
            pdfNames.append(i.split()[0])
    
pdfNames = list(set(pdfNames))
pdfNames

['Aelie',
 'Veda',
 'Darshi',
 'Kaviya',
 'Lila',
 'Milan',
 'Maisel',
 'Rhemingtyn',
 'Chloe-Lynn',
 'Aeriane',
 'Karessa',
 'Karina',
 'Ashriti',
 'Hasanaat',
 'Sheza',
 'Gurrehmat',
 'Eveliina',
 'Japleen',
 'Mayssa',
 'Greeshma',
 'Kareena',
 'Lawa',
 'Anayah',
 'Lorraine',
 'Nathalie',
 'Esti',
 'Leimashri',
 'Maleeha',
 'Aaria',
 'Tavia',
 'Brileigh',
 'Davi',
 'Carina',
 'Lynnette',
 'Manya',
 'Arena',
 'Asees',
 'Charlotte-Jane',
 'Aryana',
 'Daphne-Jayne',
 'Rayaan',
 'Kacey',
 'Parleen',
 'Valora',
 'Elaira',
 'Maadhvi',
 'Albi',
 'Traeh',
 'Demaris',
 'Lucija',
 'Inomasa',
 'Karsyn',
 'Tanishq',
 'Aalis',
 'Shana',
 'Ashmeen',
 'Talei',
 'Maleyah',
 'Bérénice',
 'Aderyn',
 'Clare',
 'Tenley',
 'Emery',
 'Mavis',
 'Mansa',
 'Raeth',
 'Fenet',
 'Raunak',
 'Safia',
 'Myles',
 'Katherine',
 'Amarah',
 'Elloise',
 'Serena',
 'Abygale',
 'Raelene',
 'Sarah-Louise',
 'Naomi',
 'Chaeyun',
 'Easton',
 'Riyanshi',
 'Teegan',
 'Hazel',
 'Ailynn',
 'Iman',
 'Roselive',
 'Brionna',
 'Ari

In [69]:
len(pdfNames)

7311

In [70]:
all_names.extend(pdfNames)
all_names = list(set(all_names))
len(all_names)

16943

### Adding other files to the list

In [75]:
reader = PdfReader('Files/boyNames1.pdf') 
  
# creating a page object 
page = reader.pages[5] 

pdfNames = list() 
for i in range(0, len(reader.pages)): 
    page = reader.pages[i]
    s = page.extract_text().splitlines()[2:]
    s[0] = s[0][1:]
    for i in s:
        if " " in i and i[0].isalpha():
            pdfNames.append(i.split()[0])
    
pdfNames = list(set(pdfNames))
pdfNames

['Vincent-Emmanuel',
 'Robsan',
 'Arj',
 'Leovanni',
 'Johhan',
 'Elroy',
 'Milan',
 'Messer',
 'Jairuz',
 'Teja',
 'Bond',
 'Virasat',
 'Jhian',
 'Mickey',
 'Stephan',
 'Maliek',
 'Nahyan',
 'Barlow',
 'Aian',
 'Boh',
 'Obediah',
 'Anvith',
 'Ashwin',
 'Khari',
 'Harvir',
 'Evgeni',
 'Welson',
 'Etienne',
 'Iesus',
 'Cornelis',
 'Cedrin',
 'Anto',
 'Montryze',
 'Davi',
 'Xaiden',
 'Conner',
 'Mehdi',
 'Kassien',
 'Rayaan',
 'Kacey',
 'Kayven',
 'Renver',
 'Gaston',
 'Noah-James',
 'Kalvin',
 'Albi',
 'Shiwon',
 'Musawer',
 'Neel',
 'Karsyn',
 'Marwin',
 'Sorin',
 'Khobe',
 'Nixen',
 'Japraj',
 'Ryett',
 'Pahulveer',
 'Casen',
 'Emery',
 'Olana',
 'Bensyn',
 'Kodah',
 'Shelton',
 'Ezias',
 'Azayne',
 'Derri-Lee',
 'Townes',
 'Maveryk',
 'Myles',
 'Locke',
 'Erizander',
 'Soma',
 'Arkane',
 'Naetochukwu',
 'Krisshmeet',
 'Easton',
 'Darsnoor',
 'Anav',
 'Cru',
 'Maleek',
 'Hazel',
 'Iman',
 'Shayaan',
 'Kerby',
 'Roderick',
 'Raveesh',
 'Jean-Marc',
 'Dalvin',
 'Yousef',
 'Dimitrian',
 

In [76]:
len(pdfNames)

5850

In [77]:
all_names.extend(pdfNames)
all_names = list(set(all_names))
len(all_names)

22224

In [83]:
reader = PdfReader('Files/girlNames1.pdf') 
  
# creating a page object 
page = reader.pages[30] 

pdfNames = list() 
for i in range(0, len(reader.pages)): 
    page = reader.pages[i]
    s = page.extract_text().splitlines()[3:]
    for i in s:
        if " " in i and len(i.split())>1:
            pdfNames.append(i.split()[1])
    
pdfNames = list(set(pdfNames))
pdfNames

['Veda',
 'Kaviya',
 'Aubreah',
 'Barbra',
 'Lila',
 'Haniyah',
 'Milan',
 'Jhazzryn',
 'Tayiah',
 'Helina',
 'Khady',
 'Brookelynn',
 'Chloe-Lynn',
 'Tristyn',
 'Sasona-Anandi',
 'Layla-Lynn',
 'Karina',
 'Tshandapiwa',
 'Amyda',
 'Rileigh',
 'Sheza',
 'Jaeda',
 'Learnie',
 'Radhika',
 'Japleen',
 'Tienna',
 'Harvir',
 'Kyah',
 'Alissha',
 'Maraya',
 'Anayah',
 'Lorraine',
 'Nathalie',
 'Woodley',
 'Maleeha',
 'Keihaunna',
 'Tavia',
 'Kahlin',
 'Sreshta',
 'Asijha',
 'Whitlie',
 'Carina',
 'Manya',
 'Bezawit',
 'Conner',
 'Jazlynne',
 'Aryana',
 'Yazmina',
 'Daenesse',
 'Rayaan',
 'Kacey',
 'Albi',
 'Ryelle',
 'Widad',
 'Golnaz',
 'Armelle',
 'Karsyn',
 'LiLy',
 'Shana',
 'Tealya',
 'Zeren',
 'Cindyleah',
 'Ashmeen',
 'Maleyah',
 'Fathia',
 'Clare',
 'Tenley',
 'Seerit',
 'Emery',
 'Mavis',
 'Aminna',
 'Grace-Anok',
 'Hajrah',
 'Chaya',
 'Safia',
 'Katherine',
 'Locke',
 'Amarah',
 'Sonnet',
 'Nev',
 'Shaniell',
 'Lesly',
 'Serena',
 'Joy-Katrina',
 'Sharanya',
 'Janazarea',
 'Naomi',

In [84]:
len(pdfNames)

7011

In [85]:
all_names.extend(pdfNames)
all_names = list(set(all_names))
len(all_names)

26378

In [89]:
reader = PdfReader('Files/girlNames2.pdf') 
  
# creating a page object 
page = reader.pages[12] 

pdfNames = list() 
for i in range(0, len(reader.pages)): 
    page = reader.pages[i]
    s = page.extract_text().splitlines()[3:]
    for i in s:
        if " " in i and len(i.split())>1:
            word = i.split()[1]
            if 'Frequency' in word:
                word.replace('Frequency', '')
            pdfNames.append(word)
    
pdfNames = list(set(pdfNames))
pdfNames

# print(page.extract_text())

['Ala',
 'Veda',
 'Darrellee',
 'Lahilah',
 'Barbra',
 'Lila',
 'Lasemi',
 'Molly-Jean',
 'Haniyah',
 'Milan',
 'Vruddhi',
 'Gracey',
 'Aarayna',
 'Khady',
 'Thorunn',
 'Tristyn',
 'Klohie',
 'Jilliane',
 'Taijah',
 'Karina',
 'Kavnoor',
 'Rileigh',
 'Riddhi',
 'Jaeda',
 'Rosalinda',
 'Faramade',
 'Amarynn',
 'Nihad',
 'Japleen',
 'Sova',
 'Jadé',
 'Agoot',
 'Jerah',
 'Zaiya',
 'Gilana',
 'Jacelle',
 'Arasay',
 'Kyah',
 'Alissha',
 'Caitlove',
 'Hara',
 'Anayah',
 'Maraya',
 'Lorraine',
 'Nathalie',
 'Catherine-Yosef',
 'Maleeha',
 'Cyri',
 'Aaria',
 'Prestyn',
 'Tavia',
 'Sreshta',
 'Devynn',
 'Leighann',
 'Carina',
 'Manya',
 'Shay-Lee',
 'Bezawit',
 'Asees',
 'Anneska',
 'Aryana',
 'Kacey',
 'Valora',
 'Kory',
 "Sy'Rai",
 'Karsyn',
 'Nixen',
 'Mical',
 'Maleyah',
 'Clare',
 'Tenley',
 'Seerit',
 'Emery',
 'Mavis',
 'Griffiella',
 'Hajrah',
 'Chaya',
 'Safia',
 'Maykaella',
 'Katherine',
 'Amarah',
 'Kiret',
 'Katherina',
 'Karielle',
 'Ricki',
 'Aval',
 'Raelene',
 'Thanusska',
 'Na

In [90]:
len(pdfNames)

7510

In [91]:
all_names.extend(pdfNames)
all_names = list(set(all_names))
len(all_names)

30008

## Adding all baby names to a text file

In [92]:
with open("Files/baby_names_final2.txt", "w") as file:
    for name in all_names:
        file.write(name + "\n")