In [1]:
import warnings
from io import StringIO
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
warnings.filterwarnings('ignore')

In [8]:
def generate_year_urls(start_year=1975, end_year=2025):
  """Build one year-index URL per year, from start_year through end_year inclusive.

  Returns an empty list when start_year is greater than end_year.
  """
  prefix = 'https://ncics.org/ibtracs/index.php?name=YearBasin-'
  year_urls = []
  for year in range(start_year, end_year + 1):
    year_urls.append(f'{prefix}{year}')
  return year_urls

In [9]:
def get_western_pacific_links(year_url, timeout=30):
  """Collect the links found in the Western Pacific column of a year-index page.

  The page is fetched, its second ``table`` with class ``ishade`` is located,
  and the header row is searched for the first cell whose text contains both
  "WESTERN" and "PACIFIC". Every remaining row contributes the first link
  found in that column.

  Args:
    year_url: URL of the year-index page to scan.
    timeout: Seconds to wait for the server before giving up on the request.

  Returns:
    A list of absolute URLs. The list is empty when the request fails, the
    server does not answer with status 200, fewer than two ``ishade`` tables
    are present, or no matching header cell is found.
  """
  headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

  # Without a timeout a single stalled connection blocks the whole crawl.
  try:
    response = requests.get(year_url, headers=headers, timeout=timeout)
  except requests.RequestException as exc:
    print(f'Error: {exc}')
    return []

  # Do not try to parse an error page as if it were the year index.
  if response.status_code != 200:
    print(f'Error: {response.status_code}')
    return []

  soup = BeautifulSoup(response.content, 'html.parser')
  tables = soup.find_all('table', class_='ishade')
  if len(tables) < 2:
    return []

  rows = tables[1].find_all('tr')
  if not rows:
    return []

  # Locate the column by its header text rather than by a fixed position.
  western_pacific_col_index = None
  for idx, cell in enumerate(rows[0].find_all(['td', 'th'])):
    cell_text = cell.get_text(strip=True).upper()
    if 'WESTERN' in cell_text and 'PACIFIC' in cell_text:
      western_pacific_col_index = idx
      break

  if western_pacific_col_index is None:
    return []

  western_pacific_links = []
  for row in rows[1:]:
    cells = row.find_all('td')
    # Short rows simply have no entry for this column.
    if len(cells) <= western_pacific_col_index:
      continue

    link = cells[western_pacific_col_index].find('a', href=True)
    if link:
      # urljoin leaves absolute hrefs untouched and resolves relative,
      # root-relative and protocol-relative ones against the page URL.
      western_pacific_links.append(urljoin(response.url, link['href']))

  return western_pacific_links

In [10]:
def get_link_from_web(year_urls):
  """Gather the Western Pacific links of every year page into one flat list.

  Links keep the order of year_urls, and within a year the order returned by
  get_western_pacific_links.
  """
  return [
    link
    for year_url in year_urls
    for link in get_western_pacific_links(year_url)
  ]

In [11]:
def get_data_from_web(urls, timeout=30):
  """Scrape the data table of every page in urls and stack them into one frame.

  For each URL the page is downloaded, the cell following the ' Storm ID'
  label is read, and the first ``table`` whose markup contains 'SEASON' is
  parsed with pandas. A 'STORM ID' column is prepended ('Unknown' when no id
  was found) and rows whose SEASON value is not numeric are dropped.

  Pages that cannot be fetched, do not answer with status 200, or contain no
  matching table are reported on stdout and skipped, so one bad page does not
  discard the frames already collected.

  Args:
    urls: Iterable of page URLs to scrape.
    timeout: Seconds to wait for the server before giving up on a request.

  Returns:
    The concatenation of all scraped tables with a fresh integer index, or an
    empty DataFrame when nothing was collected.
  """
  headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
  all_df = []

  for url in urls:
    print(f'Crawling: {url}')
    # Without a timeout a single stalled connection blocks the whole crawl.
    try:
      r = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
      print(f'Error: {exc}')
      continue
    if r.status_code != 200:
      print(f'Error: {r.status_code}')
      continue

    soup = BeautifulSoup(r.content, 'html.parser')

    # The label is matched exactly, including its leading space.
    storm_id = None
    storm_id_element = soup.find('td', string=' Storm ID')
    if storm_id_element is not None:
      value_cell = storm_id_element.find_next_sibling('td')
      if value_cell is not None:
        storm_id = value_cell.get_text(strip=True)

    df = None
    for table in soup.find_all('table'):
      table_html = str(table)
      if 'SEASON' in table_html:
        # pandas deprecated passing literal HTML; it must be wrapped in a
        # file-like object.
        df = pd.read_html(StringIO(table_html))[0]
        break

    if df is None:
      print('No IBTrACS data table found')
      continue

    # A missing or empty id falls back to the 'Unknown' placeholder.
    df.insert(0, 'STORM ID', storm_id if storm_id else 'Unknown')

    if 'SEASON' in df.columns:
      # Keep only rows whose SEASON parses as a number.
      df = df[pd.to_numeric(df['SEASON'], errors='coerce').notna()]
      df = df.reset_index(drop=True)

    all_df.append(df)
    print(f'Crawling completed: {df.shape[0]} rows')

  if not all_df:
    print('No data to return')
    return pd.DataFrame()

  final_df = pd.concat(all_df, ignore_index=True)
  print(f'Total rows after merging: {final_df.shape[0]}')
  return final_df

In [None]:
# Batch 1: build the year-index URLs for 2000 through 2025, collect the
# Western Pacific links listed on them, and scrape each linked page.
year_urls = generate_year_urls(2000, 2025)
urls = get_link_from_web(year_urls)
df = get_data_from_web(urls)
# Saved without the index column so the batch files can be concatenated later.
# NOTE(review): assumes ../data already exists relative to the working
# directory — confirm before starting a long crawl.
df.to_csv('../data/ibtracs_1.csv', index=False)

Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-2000125N06136
Crawling completed: 105 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-2000138N16119
Crawling completed: 57 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-2000141N18116
Crawling completed: 19 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-2000150N11117
Crawling completed: 31 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-2000169N17114
Crawling completed: 17 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-2000184N13133
Crawling completed: 67 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-2000185N15117
Crawling completed: 77 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-2000193N10131
Crawling completed: 75 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-2000196N16122
Crawling completed: 11 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-2000199N20145
Crawling completed: 51 rows

In [None]:
# Batch 2: same pipeline as the previous crawl cell, for 1975 through 1999.
# This reuses the names year_urls, urls and df, replacing their earlier values.
year_urls = generate_year_urls(1975, 1999)
urls = get_link_from_web(year_urls)
df = get_data_from_web(urls)
# Saved without the index column so the batch files can be concatenated later.
# NOTE(review): assumes ../data already exists relative to the working
# directory — confirm before starting a long crawl.
df.to_csv('../data/ibtracs_2.csv', index=False)

Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-1975019N09140
Crawling completed: 77 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-1975111N05132
Crawling completed: 57 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-1975166N16116
Crawling completed: 39 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-1975184N18145
Crawling completed: 21 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-1975205N18143
Crawling completed: 53 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-1975211N20137
Crawling completed: 73 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-1975216N18132
Crawling completed: 27 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-1975217N20126
Crawling completed: 21 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-1975221N18116
Crawling completed: 133 rows
Crawling: https://ncics.org/ibtracs/index.php?name=v04r01-1975221N18127
Crawling completed: 69 rows

In [2]:
# Merge the two saved crawl batches into a single CSV, with the 1975-1999
# batch file first and the 2000-2025 batch file second.
files = ['../data/ibtracs_2.csv', '../data/ibtracs_1.csv']

dfs = list(map(pd.read_csv, files))
# ignore_index=True already produces a fresh 0..n-1 index for the merged frame.
df = pd.concat(dfs, ignore_index=True)
df.to_csv('../data/ibtracs.csv', index=False)