In [1]:
# Import dependencies
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import randint

In [2]:
# Scrape every USAF loss record from our data source, aviationarchaeology.com.
# Each record is served on its own page, addressed by a sequential RecID (1 through 1606).
FIRST_RECORD_ID = 1
LAST_RECORD_ID = 1606
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout, requests can wait forever on a stalled connection

records = []
failed_record_ids = []  # RecIDs whose page could not be fetched, kept so they can be retried later

# A single Session reuses the underlying connection across all of the requests.
with requests.Session() as session:
    for page in range(FIRST_RECORD_ID, LAST_RECORD_ID + 1):
        aair_url = f"https://www.aviationarchaeology.com/rptVietnam.asp?RecID={page}"
        try:
            response = session.get(aair_url, timeout=REQUEST_TIMEOUT_SECONDS)
            # Reject 4xx/5xx responses instead of parsing an error page as if it were a loss record.
            response.raise_for_status()
        except requests.RequestException as error:
            # Keep going: one bad page should not throw away a long-running scrape.
            failed_record_ids.append(page)
            print(f"RecID {page} skipped: {error}")
        else:
            html = response.content
            soup = BeautifulSoup(html, "lxml")
            # Store the flattened text of every table found on the record page.
            for tb in soup.find_all('table'):
                records.append(tb.get_text(strip=True))
        # Random 2-5 second pause between requests so we do not hammer the server.
        sleep(randint(2, 5))

if failed_record_ids:
    print(f"{len(failed_record_ids)} page(s) failed; retry RecIDs: {failed_record_ids}")


In [3]:
# Sanity check: confirm the scraped records are held in a list.
print(records.__class__)

<class 'list'>


In [4]:
# Load the raw scraped strings into a single-column dataframe and preview the first rows.
usaf_df = pd.DataFrame(data=records, columns=["All_Info"])
usaf_df.head(n=5)

Unnamed: 0,All_Info
0,Crash Date:620202Crash Time:_Aircraft Type:C-1...
1,Crash Date:620211Crash Time:_Aircraft Type:SC-...
2,Crash Date:620613Crash Time:_Aircraft Type:T-2...
3,Crash Date:620828Crash Time:_Aircraft Type:T-2...
4,Crash Date:621015Crash Time:_Aircraft Type:U-1...


In [5]:
# Export the raw data to CSV so the large scrape does not have to be re-run
# each time we open and update our ETL / data-cleaning file.
usaf_df.to_csv(path_or_buf='Vietnam_USAF_Losses_Raw_Data.csv', index=False)

### Scrape of the second website, which contains additional information we can use as context.

In [12]:
# Scrape the summary page that lists every fixed-wing loss, in case we need
# extra context later (e.g. full base names).
fixedwing_url = 'http://vietnamairlosses.com/name_search_list.php'

# The timeout stops a stalled connection from hanging the notebook indefinitely.
response = requests.get(fixedwing_url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page as data.
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, "lxml")

# Each <li> element on the page is stored as one flattened text entry.
fixedwing_losses = [li.get_text(strip=True) for li in soup.find_all('li')]

In [13]:
# Sanity check: confirm the scraped listings are held in a list.
print(fixedwing_losses.__class__)

<class 'list'>


In [14]:
# Load the raw scraped strings into a single-column dataframe and preview the first rows.
fixedwing_df = pd.DataFrame(data=fixedwing_losses, columns=["All_Info"])
fixedwing_df.head(n=5)

Unnamed: 0,All_Info
0,23 March 1961: C-47B Skytrain; Ser # 44-76330...
1,2 February 1962: C-123B Provider; Ser # 56-43...
2,11 February 1962: SC-47A Skytrain; Ser # 43-1...
3,20 April 1962: C-123B Provider; Ser # 56-4368...
4,15 July 1962: C-123B Provider; Ser # 56-4366;...


In [15]:
# Export the raw data to CSV so the scrape does not have to be re-run
# each time we open and update our ETL / data-cleaning file.
fixedwing_df.to_csv(path_or_buf='Vietnam_All_Fixedwing_Losses_Raw_Data.csv', index=False)