In [38]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

# 20. Wahlperiode

In [39]:
# Fetch the Wikipedia page
path = 'https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(20._Wahlperiode)'
def fetch_qikipedia_page(path, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    path : str
        URL of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would hang the notebook.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in 'html.parser'.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page
        is never parsed as if it were the requested article.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    response = requests.get(path, timeout=timeout)
    # Fail loudly here instead of later with a confusing "element not found".
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

In [40]:
# Download and parse the page at `path`; later cells read this notebook-level `soup`.
soup = fetch_qikipedia_page(path)

# Fraktionsvorstände

In [41]:
def find_element(id, page=None):
    """Return the first element whose HTML ``id`` attribute equals ``id``.

    Parameters
    ----------
    id : str
        Value of the ``id`` attribute to look for.
    page : object, optional
        Parsed document to search (anything offering ``find(id=...)``).
        When omitted, the notebook-level ``soup`` is searched, which keeps
        existing one-argument calls working.

    Returns
    -------
    The result of ``find(id=id)`` on the searched document.
    """
    # Resolve the document lazily so the global is only touched when needed.
    document = soup if page is None else page
    return document.find(id=id)

In [42]:
# Look up the element with id 'Fraktionsvorstände' in the currently loaded `soup`.
element_with_id = find_element('Fraktionsvorstände')

In [43]:
def get_table(element_with_id):
    """Return the first 'wikitable' table that follows the given element.

    Parameters
    ----------
    element_with_id : object or None
        Anchor element (e.g. the result of ``find_element``). ``None`` means
        the anchor was not found on the page.

    Returns
    -------
    The table found by ``find_next('table', {'class': 'wikitable'})``, or
    ``None`` when the anchor is missing or no such table follows it. In both
    failure cases a short diagnostic is printed.
    """
    if element_with_id is None:
        # The function does not know which id was searched for, so the
        # message must not name a specific one.
        print("Element not found; cannot search for a following table.")
        return None

    table = element_with_id.find_next('table', {'class': 'wikitable'})
    if table is None:
        print("No table found after the element with the given id.")
    return table

In [44]:
# Fetch the first 'wikitable' after the anchor; this is None if nothing was found.
table = get_table(element_with_id)

In [45]:
# Collect the column titles of a table
def get_headers(table):
    """Return the stripped text of every ``th`` cell found in ``table``.

    The cells are returned in the order ``table.findAll('th')`` yields them.
    """
    header_texts = []
    for header_cell in table.findAll('th'):
        header_texts.append(header_cell.get_text(strip=True))
    return header_texts

In [46]:
# Column titles of the selected table, used as DataFrame column names below.
headers = get_headers(table)

In [47]:
# Turn the rows of a table into a DataFrame, skipping rows that carry dates
def scrape_table_1(table, headers):
    """Build a DataFrame from the data rows of ``table``.

    The first ``tr`` is skipped as the header row. From every ``td`` text,
    footnote markers such as ``[12]`` and numeric parentheticals such as
    ``(3)`` are removed. A row is dropped entirely when any of its cells
    contains the word 'bis' or a written-out date like '7. Dezember 2021'.

    Parameters
    ----------
    table : object
        Parsed table offering ``findAll('tr')``; rows offer ``findAll('td')``
        and cells offer ``get_text(strip=True)``.
    headers : list of str
        Column names for the resulting DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per kept table row, with ``headers`` as columns.
    """
    # Matches 'bis' or a date. Umlauts and ß are part of the month-name class
    # so that 'März' is recognized like every other month.
    unwanted_pattern = re.compile(r'bis|\d+\.\s*[A-Za-zÄÖÜäöüß]+\s*\d{4}')
    # Both kinds of noise are stripped in one pass. Previously the second
    # substitution started again from the raw text and discarded the first.
    noise_pattern = re.compile(r'\[\d+\]|\(\d+\)')

    table_rows = []
    for row in table.findAll('tr')[1:]:
        row_data = [noise_pattern.sub('', td.get_text(strip=True)) for td in row.findAll('td')]
        if not any(unwanted_pattern.search(text) for text in row_data):
            table_rows.append(row_data)

    return pd.DataFrame(table_rows, columns=headers)


In [48]:
# Raw table of parliamentary group boards, one row per kept table row.
df = scrape_table_1(table, headers)

In [49]:
# Clean up the board-member columns:
# drop everything inside parentheses and separate concatenated names
def clean_df(df):
    """Return a cleaned copy of the board table.

    For the columns 'Vorsitzende', 'Stellvertretende Vorsitzende' and
    'Parlamentarische Geschäftsführer', this removes every parenthesized
    part, strips surrounding whitespace, and inserts ', ' wherever a
    lowercase letter is directly followed by an uppercase letter (two names
    glued together), so the names can be split on commas later.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left unmodified.

    Raises
    ------
    KeyError
        If one of the three columns is missing.
    """
    cols_to_modify = ['Vorsitzende', 'Stellvertretende Vorsitzende', 'Parlamentarische Geschäftsführer']
    # German names may end in ß or an umlaut and start with Ä/Ö/Ü, so those
    # letters belong in the character classes.
    name_boundary = re.compile(r'([a-zäöüß])([A-ZÄÖÜ])')

    def separate_names(value):
        # Missing cells arrive as None/NaN; NaN is truthy, so test the type
        # rather than the truth value before handing the cell to the regex.
        if isinstance(value, str):
            return name_boundary.sub(r'\1, \2', value)
        return value

    # Work on a copy so re-running the cell never alters the caller's frame.
    cleaned = df.copy()
    for col in cols_to_modify:
        without_parens = cleaned[col].str.replace(r'\(.*?\)', '', regex=True).str.strip()
        cleaned[col] = without_parens.apply(separate_names)

    return cleaned

In [50]:
# Strip parenthesized notes and comma-separate the names in the three role columns.
df = clean_df(df)

In [51]:
def melt_df(df):
    """Reshape the board table to one row per (Fraktion, Position, Name).

    The three role columns are unpivoted into 'Position'/'Name', each
    comma-separated 'Name' cell is split into separate rows, the index is
    renumbered from 0 and the names are stripped of surrounding whitespace.
    """
    role_columns = ['Vorsitzende', 'Stellvertretende Vorsitzende', 'Parlamentarische Geschäftsführer']

    long_form = df.melt(
        id_vars=['Fraktion'],
        value_vars=role_columns,
        var_name='Position',
        value_name='Name',
    )

    # One list of names per cell, then one row per list entry.
    long_form['Name'] = long_form['Name'].str.split(',')
    per_person = long_form.explode('Name').reset_index(drop=True)

    per_person['Name'] = per_person['Name'].str.strip()
    return per_person

In [52]:
# Long format: one row per person and role.
df_melted = melt_df(df)

In [53]:
# Display the long-format board table.
# NOTE(review): the output recorded for this cell shows only the column headers (no rows);
# check whether the row filter in scrape_table_1 is dropping every row.
df_melted

Unnamed: 0,Fraktion,Position,Name


# Getting the Abgeordneten

In [54]:
def scrape_table_2(table, headers):
    """Convert every data row of ``table`` into a DataFrame row.

    The first ``tr`` is treated as the header row and skipped. Footnote
    markers of the form ``[number]`` are removed from each ``td`` text.
    ``headers`` supplies the column names of the returned DataFrame.
    """
    footnote_marker = re.compile(r'\[\d+\]')

    def cell_texts(tr):
        # Stripped text of each data cell, without footnote markers.
        return [footnote_marker.sub('', cell.get_text(strip=True)) for cell in tr.findAll('td')]

    data_rows = table.findAll('tr')[1:]
    records = [cell_texts(tr) for tr in data_rows]

    return pd.DataFrame(records, columns=headers)

In [55]:
def clean_table_2(df):
    """Keep only the name and Fraktion columns of a member table.

    A column is kept when its label contains 'name' or 'fraktion'
    (case-insensitive). Every kept column whose label contains 'fraktion'
    is renamed to exactly 'Fraktion'.
    """
    def mentions(label, keyword):
        return keyword in label.lower()

    wanted = [label for label in df.columns if mentions(label, 'name') or mentions(label, 'fraktion')]
    df = df[wanted]

    # Collect all renames first, then apply them in one call.
    renames = {label: 'Fraktion' for label in df.columns if mentions(label, 'fraktion')}
    if renames:
        df = df.rename(columns=renames)

    return df

In [56]:
def clean_before_19(df):
    """Keep only the member and party columns of a member table.

    A column is kept when its label contains 'mitglied' or 'partei'
    (case-insensitive). Kept columns containing 'partei' are renamed to
    'Fraktion'; the remaining kept columns (containing 'mitglied') are
    renamed to 'Name'.
    """
    keywords = ('mitglied', 'partei')
    selected = [label for label in df.columns if any(key in label.lower() for key in keywords)]
    df = df[selected]

    renames = {}
    for label in df.columns:
        lowered = label.lower()
        # 'partei' takes precedence when a label contains both keywords.
        if 'partei' in lowered:
            renames[label] = 'Fraktion'
        elif 'mitglied' in lowered:
            renames[label] = 'Name'
    if renames:
        df = df.rename(columns=renames)

    return df

In [57]:
def add_position(df):
    """Return a copy of ``df`` with a constant 'Position' column.

    Every row gets the value 'Abgeordnete*r'. An existing 'Position' column
    is overwritten in the result.

    The input frame is not modified. Assigning into the argument would alter
    the caller's data and can trigger pandas' SettingWithCopyWarning when the
    argument is a column slice of another frame.
    """
    return df.assign(Position='Abgeordnete*r')

In [58]:
# Locate the table after the element with id 'Abgeordnete' and read its rows.
element_with_id = find_element('Abgeordnete')
table = get_table(element_with_id)
headers = get_headers(table)
df_2 = scrape_table_2(table, headers)
# Keep the name/Fraktion columns and label every row as a regular member.
df_2 = clean_table_2(df_2)
df_2 = add_position(df_2)
# Use the same column order as df_melted so the two frames line up when concatenated.
df_2 = df_2[['Fraktion', 'Position', 'Name']]

# Combining Vorsitzende with Abgeordnete

In [59]:
def combine_dataframes(df1, df2):
    """Stack ``df1`` on top of ``df2`` and renumber the rows from 0."""
    return pd.concat(objs=[df1, df2], axis=0, ignore_index=True)

In [60]:
# Board members first, then regular members, with a fresh index.
final_df_20 = combine_dataframes(df_melted, df_2)

In [61]:
# Tag every row with the number of the legislative period.
final_df_20['Wahlperiode'] = 20

# Let's do this for the other Wahlperioden

In [62]:
# Wahlperiode 19: URL of the member list for the 19th legislative period.
path = 'https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(19._Wahlperiode)'

In [63]:
# Replace the notebook-level `soup` with the page at the new `path`; find_element reads it.
soup = fetch_qikipedia_page(path)

In [64]:
# Same board-table steps as for period 20, applied to the currently loaded page.
element_with_id = find_element('Fraktionsvorstände')
table = get_table(element_with_id)
headers = get_headers(table)
df = scrape_table_1(table, headers)
df = clean_df(df)
# Long format: one row per person and role.
df_melted = melt_df(df)

In [65]:
# Same member-table steps as for period 20, applied to the currently loaded page.
element_with_id = find_element('Abgeordnete')
table = get_table(element_with_id)
headers = get_headers(table)
df_2 = scrape_table_2(table, headers)
df_2 = clean_table_2(df_2)
df_2 = add_position(df_2)
# Use the same column order as df_melted so the two frames line up when concatenated.
df_2 = df_2[['Fraktion', 'Position', 'Name']]

In [66]:
# Board members first, then regular members, with a fresh index.
final_df_19 = combine_dataframes(df_melted, df_2)

In [67]:
# Tag every row with the number of the legislative period.
final_df_19['Wahlperiode'] = 19

# Wahlperiode 18

In [68]:
# Wahlperiode 18: load the member list page for the 18th legislative period.
path = 'https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(18._Wahlperiode)'
soup = fetch_qikipedia_page(path)
# The Fraktionsvorstände steps are disabled for this period; only the member table is scraped.
# element_with_id = find_element('Fraktionsvorstände')
# table = get_table(element_with_id)
# headers = get_headers(table)
# df = scrape_table_1(table, headers)
# df = clean_df(df)
# df_melted = melt_df(df)
# Locate the table after the element with id 'Abgeordnete' and read its rows.
element_with_id = find_element('Abgeordnete')
table = get_table(element_with_id)
headers = get_headers(table)
df_2 = scrape_table_2(table, headers)
# This period uses clean_before_19, which selects columns by 'mitglied'/'partei' instead of 'name'/'fraktion'.
df_2 = clean_before_19(df_2)
df_2 = add_position(df_2)
df_2 = df_2[['Fraktion', 'Position', 'Name']]
# Copy so the per-period result is independent of df_2, which later cells reassign.
final_df_18 = df_2.copy()
#final_df_18 = combine_dataframes(df_melted, df_2)
final_df_18['Wahlperiode'] = 18

# Wahlperiode 17

In [69]:
# Wahlperiode 17: load the member list page for the 17th legislative period.
path = 'https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(17._Wahlperiode)'
soup = fetch_qikipedia_page(path)
# The Fraktionsvorstände steps are disabled for this period; only the member table is scraped.
# element_with_id = find_element('Fraktionsvorstände')
# table = get_table(element_with_id)
# headers = get_headers(table)
# df = scrape_table_1(table, headers)
# df = clean_df(df)
# df_melted = melt_df(df)
# Locate the table after the element with id 'Abgeordnete' and read its rows.
element_with_id = find_element('Abgeordnete')
table = get_table(element_with_id)
headers = get_headers(table)
df_2 = scrape_table_2(table, headers)
# This period uses clean_before_19, which selects columns by 'mitglied'/'partei' instead of 'name'/'fraktion'.
df_2 = clean_before_19(df_2)
df_2 = add_position(df_2)
df_2 = df_2[['Fraktion', 'Position', 'Name']]
# Copy so the per-period result is independent of df_2, which later cells reassign.
final_df_17 = df_2.copy()
#final_df_17 = combine_dataframes(df_melted, df_2)
final_df_17['Wahlperiode'] = 17

# Wahlperiode 16

In [70]:
# Wahlperiode 16: load the member list page for the 16th legislative period.
path = 'https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(16._Wahlperiode)'
soup = fetch_qikipedia_page(path)
# The Fraktionsvorstände steps are disabled for this period; only the member table is scraped.
# element_with_id = find_element('Fraktionsvorstände')
# table = get_table(element_with_id)
# headers = get_headers(table)
# df = scrape_table_1(table, headers)
# df = clean_df(df)
# df_melted = melt_df(df)
# Locate the table after the element with id 'Abgeordnete' and read its rows.
element_with_id = find_element('Abgeordnete')
table = get_table(element_with_id)
headers = get_headers(table)
df_2 = scrape_table_2(table, headers)
# This period uses clean_before_19, which selects columns by 'mitglied'/'partei' instead of 'name'/'fraktion'.
df_2 = clean_before_19(df_2)
df_2 = add_position(df_2)
df_2 = df_2[['Fraktion', 'Position', 'Name']]
# Copy so the per-period result is independent of df_2, which later cells reassign.
final_df_16 = df_2.copy()
#final_df_16 = combine_dataframes(df_melted, df_2)
final_df_16['Wahlperiode'] = 16

# Wahlperiode 15

In [71]:
# Wahlperiode 15: load the member list page for the 15th legislative period.
path = 'https://de.wikipedia.org/wiki/Liste_der_Mitglieder_des_Deutschen_Bundestages_(15._Wahlperiode)'
soup = fetch_qikipedia_page(path)
# The Fraktionsvorstände steps are disabled for this period; only the member table is scraped.
# element_with_id = find_element('Fraktionsvorstände')
# table = get_table(element_with_id)
# headers = get_headers(table)
# df = scrape_table_1(table, headers)
# df = clean_df(df)
# df_melted = melt_df(df)
# Locate the table after the element with id 'Abgeordnete' and read its rows.
element_with_id = find_element('Abgeordnete')
table = get_table(element_with_id)
headers = get_headers(table)
df_2 = scrape_table_2(table, headers)
# This period uses clean_before_19, which selects columns by 'mitglied'/'partei' instead of 'name'/'fraktion'.
df_2 = clean_before_19(df_2)
df_2 = add_position(df_2)
df_2 = df_2[['Fraktion', 'Position', 'Name']]
# Copy so the per-period result is independent of df_2.
final_df_15 = df_2.copy()
#final_df_15 = combine_dataframes(df_melted, df_2)
final_df_15['Wahlperiode'] = 15

# Final Dataframe of all parties

In [72]:
# Stack the per-period frames (20 down to 15) into one table with a fresh index.
df_parties = pd.concat([final_df_20, final_df_19, final_df_18, final_df_17, final_df_16, final_df_15], ignore_index=True)
# Display the combined frame.
df_parties

Unnamed: 0,Fraktion,Position,Name,Wahlperiode
0,SPD,Abgeordnete*r,Sanae Abdi,20
1,FDP,Abgeordnete*r,Valentin Abel,20
2,CDU/CSU (CDU),Abgeordnete*r,Knut Abraham,20
3,FDP,Abgeordnete*r,Katja Adler,20
4,Grüne,Abgeordnete*r,Stephanie Aeffner,20
...,...,...,...,...
4051,CSU,Abgeordnete*r,Wolfgang Zeitlmann,15
4052,CSU,Abgeordnete*r,Wolfgang Zöller,15
4053,SPD,Abgeordnete*r,Manfred Zöllmer,15
4054,SPD,Abgeordnete*r,Christoph Zöpel,15


In [73]:
# Write the combined table to CSV without the index column.
# NOTE(review): this is an absolute, machine-specific path; consider a path relative to a
# configurable data directory so the notebook runs on other machines.
df_parties.to_csv('/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Final_Project/Final_Data/parties.csv', index=False)