# Web Scraping

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import socket


In [None]:
def get_synonyms(ingredient, *, max_retries=3, timeout=30, retry_delay=10):
    """Fetch the "Other names" entry for an ingredient from the NIST WebBook.

    The ingredient is looked up by name at ``webbook.nist.gov``. The returned
    HTML is scanned for ``<li>`` elements whose text contains the label
    ``Other names:``; the text of the last such element, with the label
    removed and surrounding whitespace stripped, is returned.

    Args:
        ingredient: Name to search for. It is converted to ``str`` and
            URL-encoded before being placed in the query string.
        max_retries: Total number of request attempts before giving up.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The synonyms text as a single string, or ``None`` when every attempt
        fails or the page has no ``Other names:`` list item.
    """
    # Local import so the function does not depend on another cell's imports.
    from urllib.parse import quote_plus

    # Encode the name: characters such as '&', '#', '+' or '=' would otherwise
    # terminate or corrupt the ``Name`` query parameter.
    sample_web_page = (
        'https://webbook.nist.gov/cgi/cbook.cgi'
        f'?Name={quote_plus(str(ingredient))}&Units=SI'
    )

    for attempt in range(1, max_retries + 1):
        try:
            page = requests.get(sample_web_page, timeout=timeout)
            page.raise_for_status()
            break  # Success: stop retrying
        except (requests.RequestException, socket.gaierror) as e:
            print(f"Error retrieving data for {ingredient}: {e}")
            print(f"URL: {sample_web_page}")
            if attempt < max_retries:
                print(f"Retrying... (Attempt {attempt + 1})")
                time.sleep(retry_delay)
    else:
        # Reached only when no attempt ended in ``break`` (including the
        # degenerate case of max_retries <= 0), so ``page`` is never used
        # unbound below.
        print("Max retries exceeded. Skipping.")
        return None

    soup = BeautifulSoup(page.content, "html.parser")
    text = 'Other names:'

    synonyms = None
    # No early exit: when several <li> elements match, the last one wins.
    for child in soup.find_all('li'):
        if text in child.text:
            synonyms = child.text.replace(text, '').strip()

    return synonyms



In [None]:
# Read the existing DataFrame from a CSV file.
# Both paths are relative, so they resolve against the current working
# directory of the running kernel/script.
input_file = 'Synonyms_Full_No_Duplicate.csv'
# The results are written to a separate file; the input file is not modified.
output_file = 'SynonymsFull_Web_Scaping.csv'

# Later cells read the 'Name' and 'Synonym' columns of this frame.
new_df2 = pd.read_csv(input_file)



In [None]:
# Fill in synonyms only for rows that do not have one yet. The mask is
# computed once up front; each assignment below touches only its own row.
missing_synonym = new_df2['Synonym'].isna()

for index in new_df2.index[missing_synonym]:
    ingredient = new_df2.at[index, 'Name']
    if pd.isna(ingredient):
        # Nothing to search for: a missing name would otherwise be sent to
        # the site as the literal text "nan" and cost a request plus a delay.
        continue

    synonyms = get_synonyms(ingredient)
    new_df2.at[index, 'Synonym'] = synonyms

    # Add a delay between requests to avoid being blocked
    time.sleep(2)  # Adjust the delay time as needed

In [None]:
# Save the updated DataFrame to a new CSV file.
# index=False leaves the DataFrame's row index out of the written file.
new_df2.to_csv(output_file, index=False)