In [3]:
# STEP ONE: EXTRACTING A LIST OF BS OBJECTS TO SCRAPE
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# CONSTRUCTING LINKS THAT LEAD TO THE TARGET COMMANDERS
"""A quick look at a few links shows that they range from 'https://uboat.net/men/commanders/1.html'
to 'https://uboat.net/men/commanders/1411.html'.
We can create these links with the code below."""

base_url = 'https://uboat.net'
# Commander pages are numbered 1 through 1411 inclusive, hence the exclusive
# upper bound of 1412.
commander_links = [
    f"{base_url}/men/commanders/{page_number}.html"
    for page_number in range(1, 1412)
]
# GET LIST OF BEAUTIFULSOUP OBJECTS
def getpage(url_link):
    """Download every URL in ``url_link`` and parse each page with BeautifulSoup.

    Each URL is requested up to 10 times. Connection errors and timeouts back
    off exponentially (2, 4, 8, ... seconds, capped at 60); any other request
    error (including 4xx/5xx responses) waits one second before the next
    attempt. A 2-second pause follows every successful download so the server
    is not hammered.

    Args:
        url_link: Iterable of page URLs to fetch.

    Returns:
        A list with one entry per input URL, in input order: the parsed
        BeautifulSoup object, or None when every attempt for that URL failed.

    Side effects:
        URLs that could not be fetched are appended to the module-level list
        ``failed_urls``, which is created on demand if it does not exist yet.
        Progress and failure messages are printed to stdout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
    }
    max_attempts = 10

    # `failed_urls` was previously appended to without ever being defined, so
    # the first permanently failing URL raised NameError and threw away every
    # page scraped so far. Reuse the global list if the notebook already has
    # one; otherwise create it.
    failed = globals().setdefault("failed_urls", [])

    all_bs = []  # One entry per URL: BeautifulSoup object or None
    for url in url_link:
        attempts = 0
        success = False  # Set once the page has been downloaded and parsed

        while attempts < max_attempts:
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
                bs = BeautifulSoup(response.text, 'html.parser')
                all_bs.append(bs)
                success = True
                time.sleep(2) # Ethical scraping, no exhausting the server
                break
            except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
                # Transient network trouble: exponential back-off, capped at a minute.
                attempts += 1
                wait_time = min(60, 2 ** attempts)
                print(f"Connection error occurred: {e}. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            except requests.exceptions.RequestException as e:
                # Any other requests failure (e.g. HTTP error status): short fixed wait.
                attempts += 1
                print(f"Request failed: {e}. Retrying in 1 second...")
                time.sleep(1)

        if not success:
            print(f"Failed to scrape {url} after {attempts} attempts.")
            failed.append(url)
            # Keep the result list index-aligned with the input URLs.
            all_bs.append(None)

    return all_bs
bs_list = getpage(commander_links)

# Report the scrape outcome: one warning line per page that could not be
# downloaded, or a single confirmation line when every page was fetched.
missing_positions = [position for position, page in enumerate(bs_list) if page is None]
None_found = bool(missing_positions)
for position in missing_positions:
    print("None value is detected")
if not None_found:
    print("bs_list printed successfully and no value is None")
    


Connection error occurred: HTTPSConnectionPool(host='uboat.net', port=443): Read timed out. (read timeout=10). Retrying in 2 seconds...
Connection error occurred: HTTPSConnectionPool(host='uboat.net', port=443): Read timed out. (read timeout=10). Retrying in 2 seconds...
Connection error occurred: HTTPSConnectionPool(host='uboat.net', port=443): Read timed out. (read timeout=10). Retrying in 2 seconds...
Connection error occurred: HTTPSConnectionPool(host='uboat.net', port=443): Read timed out. (read timeout=10). Retrying in 2 seconds...
Connection error occurred: HTTPSConnectionPool(host='uboat.net', port=443): Read timed out. (read timeout=10). Retrying in 2 seconds...
bs_list printed successfully and no value is None


In [31]:
#STEP 2: EXTRACTING THE DESIRED CONTENTS
# One accumulator list per output field. getcontent() appends one entry per
# scraped page to each of them, so all five stay index-aligned.
Name, Born, Died, Ranks, Decorations = [], [], [], [], []
bs_list # the list of bs objects produced by the code above

def getcontent():
    """Extract name, birth/death dates, ranks and decorations from each page in ``bs_list``.

    Reads the module-level ``bs_list`` and appends exactly one entry per page
    to each of the module-level lists ``Name``, ``Born``, ``Died``, ``Ranks``
    and ``Decorations``, so the five lists stay index-aligned with ``bs_list``.

    Per-page results:
        * ``Name``: text of the first ``<h1>`` without a class, else "Not Given".
        * ``Born`` / ``Died``: text of the node following the first ``<td>``
          whose string starts with "Born" / "Died", else "Not Given".
        * ``Ranks`` / ``Decorations``: list of ``(date, text)`` tuples found in
          the ``<td class="width400">`` cell before / after the word
          "Decorations", else ``["No Rank"]`` / ``["No Decoration"]``.

    A page that is ``None`` (a download that failed) gets the placeholder
    value in every list instead of raising AttributeError.

    Returns:
        None. Results are delivered through the module-level lists.
    """
    # "<day> <month> <year>" followed by the free text up to the next digit.
    dated_entry = re.compile(r'(\d{1,2} \w+ \d{4})\s+([^\d]+)')

    def value_after_label(page, label_regex):
        """Return the text of the node after the first matching label cell, or "Not Given"."""
        label_cells = page.find_all('td', string=re.compile(label_regex))
        # Guard both the missing label and a label with nothing after it; the
        # Born lookup used to skip the second check and could crash on None.
        if label_cells and label_cells[0].next_sibling:
            return label_cells[0].next_sibling.get_text(strip=True)
        return "Not Given"

    for bs in bs_list:
        if bs is None:
            # Failed download: keep every list aligned with bs_list.
            Name.append("Not Given")
            Born.append("Not Given")
            Died.append("Not Given")
            Ranks.append(["No Rank"])
            Decorations.append(["No Decoration"])
            continue

        # Extract Name
        headings = bs.find_all('h1', class_='')
        Name.append(headings[0].get_text(strip=True) if headings else "Not Given")

        # Extract Date of Birth and Date of Death
        Born.append(value_after_label(bs, r'^Born'))
        Died.append(value_after_label(bs, r'^Died'))

        # Extract Ranks and Decorations, which share one table cell
        ranks_table = bs.find('td', {'class': 'width400'})
        if not ranks_table:
            Ranks.append(["No Rank"])
            Decorations.append(["No Decoration"])
            continue

        final_text = ranks_table.get_text(separator=" ", strip=True).replace('\xa0', '').strip()
        # Text before the first "Decorations" holds the ranks; the piece right
        # after it holds the decorations (empty when the word is absent).
        sections = final_text.split("Decorations")
        rank_text = sections[0]
        decoration_text = sections[1] if len(sections) > 1 else ""

        Ranks.append(dated_entry.findall(rank_text) or ["No Rank"])
        Decorations.append(dated_entry.findall(decoration_text) or ["No Decoration"])

getcontent() # Run the extraction: appends to Name, Born, Died, Ranks and Decorations for every page in bs_list


In [40]:
#STEP THREE:  FORMAT/CLEAN THE DATA FOR BETTER PRINTING
# Turns the index-aligned lists Name, Born, Died, Ranks and Decorations into a
# table with one row per commander and numbered Rank_n / Decoration_n columns,
# prints it, and saves it to an Excel workbook.

# Destination workbook, kept in one place so it is easy to change.
OUTPUT_XLSX = r"E:\PYTHON- DATA SCIENCE\Data hub\U-BOAT COMMANDERS\Sraped U-boat Commanders.xlsx"

# Fail with an actionable message instead of "max() arg is an empty sequence".
if not Ranks:
    raise ValueError("No scraped records to format: run the extraction step (getcontent) first.")

# Find the maximum length to pad the ranks and decorations lists
max_len = max(max(len(ranks), len(decorations)) for ranks, decorations in zip(Ranks, Decorations))

# Pad copies of the lists with None. A multi-column explode needs both columns
# to have the same number of elements in every row. The raw Ranks/Decorations
# lists are left untouched so they can still be inspected after this cell runs.
ranks_padded = [list(ranks) + [None] * (max_len - len(ranks)) for ranks in Ranks]
decorations_padded = [
    list(decorations) + [None] * (max_len - len(decorations)) for decorations in Decorations
]

# Build the dataframe with padded lists, including Name, Born, and Died
df_padded = pd.DataFrame({
    'Name of Commander': Name,
    'Born': Born,
    'Died': Died,
    'Ranks': ranks_padded,
    'Decorations': decorations_padded
})

# Explode the dataframe so that each row contains individual rank and decoration entries
df_exploded = (
    df_padded
    .explode(['Ranks', 'Decorations'])
    .reset_index(drop=True)
    .set_index("Name of Commander")
)
# Drop rows with two or more missing values, i.e. pure padding rows where
# both Ranks and Decorations are None.
df_exploded = df_exploded.dropna(thresh=len(df_exploded.columns) - 1)


# Group by name and aggregate the ranks and decorations into lists
df_grouped = df_exploded.groupby(['Name of Commander', 'Born', 'Died']).agg({
    'Ranks': lambda x: list(x.dropna().unique()),  # Aggregate unique ranks into lists
    'Decorations': lambda x: list(x.dropna().unique())  # Aggregate unique decorations into lists
}).reset_index()

# Now, create subcolumns for Ranks and Decorations
max_rank_len = df_grouped['Ranks'].apply(len).max()  # Find maximum length of ranks
max_decoration_len = df_grouped['Decorations'].apply(len).max()  # Find maximum length of decorations

# Create new DataFrame with subcolumns
df_final = pd.DataFrame({
    'Name of Commander': df_grouped['Name of Commander'],
    'Born': df_grouped['Born'],
    'Died': df_grouped['Died']
})

# Add subcolumns for ranks (None where a commander has fewer entries)
for i in range(max_rank_len):
    df_final[f'Rank_{i+1}'] = df_grouped['Ranks'].apply(lambda x: x[i] if i < len(x) else None)

# Add subcolumns for decorations (None where a commander has fewer entries)
for i in range(max_decoration_len):
    df_final[f'Decoration_{i+1}'] = df_grouped['Decorations'].apply(lambda x: x[i] if i < len(x) else None)

# Print the final DataFrame with subcolumns
print("\nFinal DataFrame with Subcolumns:")
print(df_final)

# Optional: Save to Excel
df_final.to_excel(OUTPUT_XLSX, index=False)


Final DataFrame with Subcolumns:
                             Name of Commander         Born         Died  \
0                            Adalbert Schmandt  26 Dec 1909         1958   
1                              Adalbert Schnee  31 Dec 1913   4 Nov 1982   
2                      Adolf Cornelius Piening  16 Sep 1910  15 May 1984   
3                                Adolf Dumrese  13 Nov 1909  24 Mar 1942   
4                             Adolf Friedrichs   4 Mar 1914  25 Sep 1942   
...                                        ...          ...          ...   
1406                         Wolfgang Strenger   9 Feb 1919    Not Given   
1407                          Wolfgang Sträter  21 May 1916  29 Jul 1943   
1408                           Wolfgang Wenzel  29 Mar 1910  14 Apr 1992   
1409                    Wolfgang von Eickstedt   1 Dec 1915   5 Apr 1988   
1410  Wolfgang-Friedrich Freiherr von Forstner   3 Oct 1916  24 Sep 1999   

                                               Rank_1