# Imports

In [4]:
import pandas as pd
import re

In [5]:
# Locations of the two input tables: the speech/speaker table and the party-membership table.
speaker_csv = '/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/Final_DF/final_text_speaker_df.csv'
parties_csv = '/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/Parties_members/parties.csv'

# The speaker file is read explicitly as UTF-8; the parties file uses the pandas default.
df_speaker = pd.read_csv(speaker_csv, encoding='utf-8')
df_parties = pd.read_csv(parties_csv)

## Unique speakers per Election_Period DataFrame

In [13]:
# Give the parties table the same period column name that df_speaker uses.
df_parties = df_parties.rename(columns={'Wahlperiode': 'Election_Period'})

In [6]:
# (rows, columns) of the raw speech table
df_speaker.shape

(4864724, 8)

In [14]:
# One row per distinct ('Election_Period', 'Speaker') combination.
# .copy() makes the result an independent frame, so the 'Name' / 'Fraktion'
# columns added to it later are not written to a slice of df_speaker
# (which can trigger pandas' SettingWithCopyWarning).
unique_speakers_per_wahlperiode = (
    df_speaker[['Election_Period', 'Speaker']]
    .drop_duplicates()
    .copy()
)

In [15]:
# Display the distinct (Election_Period, Speaker) pairs
unique_speakers_per_wahlperiode

Unnamed: 0,Election_Period,Speaker
0,15,Alterspräsident Otto Schily
38,15,Franz Müntefering
82,15,Wolfgang Thierse
86,15,Präsident Wolfgang Thierse
143,15,Eugen Gerstenmaier
...,...,...
4856707,20,Ersten Gesetzes
4856709,20,Mehr Mitbestimmung der Patientinnen
4860897,20,Lobby RG) – Geldflüsse offenlegen und
4861077,20,gesetz militärisches Personal – Mil Pers Glei


In [16]:
# Distinct speaker strings across all election periods (first occurrence of each is kept).
unique_speakers = df_speaker.loc[:, 'Speaker'].drop_duplicates(keep='first')

In [17]:
# Display the distinct speaker strings
unique_speakers

0                            Alterspräsident Otto Schily
38                                     Franz Müntefering
82                                      Wolfgang Thierse
86                            Präsident Wolfgang Thierse
143                                   Eugen Gerstenmaier
                               ...                      
4856009      Keine Unterstützung von Schlepperei, Schleu
4856709              Mehr Mitbestimmung der Patientinnen
4860897            Lobby RG) – Geldflüsse offenlegen und
4861077    gesetz militärisches Personal – Mil Pers Glei
4864229            Bezahlbaren Strom sichern – Industrie
Name: Speaker, Length: 6617, dtype: object

# Check for overlapping names between speakers and party members

In [18]:
# Placeholder columns that the matching passes below fill in for each speaker row.
for new_column in ('Name', 'Fraktion'):
    unique_speakers_per_wahlperiode[new_column] = None

In [19]:
# First matching pass: for every (Election_Period, Speaker) row, take the first
# party member of the same election period whose name occurs, case-insensitively,
# as a substring of the speaker string, and copy that member's Name and Fraktion.
#
# The member list of each period is built and lower-cased once up front instead of
# re-filtering df_parties and re-lowering every name for each speaker row.
# Non-string names (e.g. NaN) are skipped rather than raising AttributeError.
members_by_period = {
    period: [
        (member_name.lower(), member_name, fraktion)
        for member_name, fraktion in zip(group['Name'], group['Fraktion'])
        if isinstance(member_name, str)
    ]
    for period, group in df_parties.groupby('Election_Period', sort=False)
}

for idx, period, speaker in zip(
    unique_speakers_per_wahlperiode.index,
    unique_speakers_per_wahlperiode['Election_Period'],
    unique_speakers_per_wahlperiode['Speaker'],
):
    if not isinstance(speaker, str):
        continue  # nothing to compare against
    speaker_lower = speaker.lower()

    # Candidates keep df_parties row order, so the first hit is the same one
    # a row-by-row scan of the filtered frame would find.
    for name_lower, member_name, fraktion in members_by_period.get(period, ()):
        if name_lower in speaker_lower:
            unique_speakers_per_wahlperiode.at[idx, 'Name'] = member_name
            unique_speakers_per_wahlperiode.at[idx, 'Fraktion'] = fraktion
            break  # only the first match is used

In [20]:
# Second matching pass: same-period substring match first; if the speaker's own
# election period yields nothing, fall back to the members of all periods.
#
# Member names are lower-cased once and grouped by period up front, instead of
# re-filtering df_parties and re-lowering every name for each speaker row.
# Non-string names (e.g. NaN) are skipped rather than raising AttributeError.
def _first_member_match(speaker_lower, candidates):
    """Return (Name, Fraktion) of the first candidate whose lower-cased name is a
    substring of ``speaker_lower``, or None if no candidate matches."""
    for name_lower, member_name, fraktion in candidates:
        if name_lower in speaker_lower:
            return member_name, fraktion
    return None


# All members in df_parties row order (used for the cross-period fallback).
all_members = [
    (member_name.lower(), member_name, fraktion)
    for member_name, fraktion in zip(df_parties['Name'], df_parties['Fraktion'])
    if isinstance(member_name, str)
]

# Members per election period, each list in df_parties row order.
members_by_period = {
    period: [
        (member_name.lower(), member_name, fraktion)
        for member_name, fraktion in zip(group['Name'], group['Fraktion'])
        if isinstance(member_name, str)
    ]
    for period, group in df_parties.groupby('Election_Period', sort=False)
}

for idx, period, speaker in zip(
    unique_speakers_per_wahlperiode.index,
    unique_speakers_per_wahlperiode['Election_Period'],
    unique_speakers_per_wahlperiode['Speaker'],
):
    if not isinstance(speaker, str):
        continue  # nothing to compare against
    speaker_lower = speaker.lower()

    # Prefer a member of the same election period ...
    match = _first_member_match(speaker_lower, members_by_period.get(period, ()))
    # ... and only search every period when that fails.
    if match is None:
        match = _first_member_match(speaker_lower, all_members)

    if match is not None:
        unique_speakers_per_wahlperiode.at[idx, 'Name'] = match[0]
        unique_speakers_per_wahlperiode.at[idx, 'Fraktion'] = match[1]

In [21]:
# Missing values per column: Name/Fraktion counts are the speaker rows not matched so far
unique_speakers_per_wahlperiode.isna().sum()

Election_Period       0
Speaker               0
Name               5035
Fraktion           5035
dtype: int64

In [22]:
# (rows, columns) of the per-period speaker table, now including Name and Fraktion
unique_speakers_per_wahlperiode.shape

(9484, 4)

In [23]:
# Spot-check ten random rows (no random_state is passed, so the sample changes on every run)
unique_speakers_per_wahlperiode.sample(10)

Unnamed: 0,Election_Period,Speaker,Name,Fraktion
558058,16,Franz Josef Holzenkamp,,
1705004,17,Neuen „Krippengipfel“ einberufen – Ausbau,,
4311761,20,Elisabeth Kaiser,Elisabeth Kaiser,SPD
1027210,16,Hermes Bürgschaft für das Ilisu Staudamm,,
3772933,19,Schutz vor Masern und zur Stärkung der,,
2526208,18,Wolfgang Bosbach,Wolfgang Bosbach,CDU
4345002,20,Helmut Kleebank,Helmut Kleebank,SPD
10139,15,Kofi Annans,,
4556681,20,Der Mittelstand ist systemrelevant – Regie,,
3720021,19,Drohenden Kollaps verhindern – Deut,,


In [24]:
# Alternative matching direction, to compare against the substring passes:
# look for party members whose name contains the full speaker string, where every
# space in the speaker string may be replaced by zero or more non-word characters
# (so "Klaus Peter" can also hit "Klaus-Peter").
for idx, speaker_row in unique_speakers_per_wahlperiode.iterrows():
    speaker_pattern = re.escape(speaker_row['Speaker']).replace(r'\ ', r'\W*')
    name_hits = df_parties['Name'].str.contains(speaker_pattern, na=False, case=False, regex=True)
    candidates = df_parties[name_hits]

    # Several members match: keep only those from the speaker's own election period.
    if len(candidates) > 1:
        candidates = candidates[candidates['Election_Period'] == speaker_row['Election_Period']]

    # Zero candidates, or still more than one, leaves the row as it is.
    if len(candidates) != 1:
        continue

    # Exactly one candidate remains: copy its Name and Fraktion onto the speaker row.
    member = candidates.iloc[0]
    unique_speakers_per_wahlperiode.at[idx, 'Name'] = member['Name']
    unique_speakers_per_wahlperiode.at[idx, 'Fraktion'] = member['Fraktion']

In [25]:
# Missing values per column after the regex pass, for comparison with the earlier count
unique_speakers_per_wahlperiode.isna().sum()

Election_Period       0
Speaker               0
Name               4641
Fraktion           4641
dtype: int64

In [26]:
# (rows, columns) after the regex pass
unique_speakers_per_wahlperiode.shape

(9484, 4)

In [27]:
# Spot-check ten random rows after the regex pass (unseeded, so not reproducible)
unique_speakers_per_wahlperiode.sample(10)

Unnamed: 0,Election_Period,Speaker,Name,Fraktion
514165,16,Jörg van Essen,Jörg van Essen,FDP
911133,16,sen – Kosten für Schulbedarfe abdecken,,
1859507,17,Ergebnis,,
4353807,20,Mario Czaja,Mario Czaja,CDU/CSU (CDU)
684129,16,Impfen statt Töten – Praxisreife Markerimpf,,
693320,16,umfassender Sicherheitsan,,
3283792,19,Mariana Iris Harder Kühnel,,
2383551,18,Klaus Peter Flosbach,Klaus-Peter Flosbach,CDU
4314764,20,Wolfgang Hellmich,Wolfgang Hellmich,SPD
3275092,19,Brigitte Freihold,Brigitte Freihold,Linke


In [28]:
# Partial-name pass: for speakers that are still unmatched, look for party members
# of the same election period whose name contains the speaker's first name token.
#
# Only rows without a Name are processed, so results of the stricter full-name
# passes are not overwritten by this looser first-token heuristic.
still_unmatched = unique_speakers_per_wahlperiode[unique_speakers_per_wahlperiode['Name'].isna()]

for idx, speaker_row in still_unmatched.iterrows():
    # Remove a 'Dr.' title and take the first whitespace-separated token;
    # if nothing is left, fall back to the complete speaker string.
    name_parts = speaker_row['Speaker'].replace('Dr.', '').strip().split()
    unique_name_part = name_parts[0] if name_parts else speaker_row['Speaker']

    # Literal pattern for the token. No trailing r'\W*' is appended: it can match
    # the empty string, so it would not change which names are found.
    # NOTE(review): this is a substring test, so a token also matches inside
    # longer words of a member name; consider word boundaries if that is unwanted.
    name_regex = re.escape(unique_name_part)

    # Members of the same election period whose name contains the token (case-insensitive).
    filtered_df = df_parties[
        (df_parties['Name'].str.contains(name_regex, na=False, case=False, regex=True)) &
        (df_parties['Election_Period'] == speaker_row['Election_Period'])
    ]

    # Accept only an unambiguous hit; zero or several candidates leave the row unmatched.
    if len(filtered_df) == 1:
        matched_member = filtered_df.iloc[0]
        unique_speakers_per_wahlperiode.at[idx, 'Name'] = matched_member['Name']
        unique_speakers_per_wahlperiode.at[idx, 'Fraktion'] = matched_member['Fraktion']

In [29]:
# Missing values per column after the partial-name pass
unique_speakers_per_wahlperiode.isna().sum()

Election_Period       0
Speaker               0
Name               4421
Fraktion           4421
dtype: int64

In [30]:
# (rows, columns) after the partial-name pass
unique_speakers_per_wahlperiode.shape

(9484, 4)

In [31]:
# Work on an independent deep copy so the per-period table stays intact.
unique_new = unique_speakers_per_wahlperiode.copy(deep=True)

In [32]:
# (rows, columns) of the copy at this point
unique_new.shape

(9484, 4)

In [33]:
# Collapse to one row per speaker string. The first row of each speaker wins, so its
# Election_Period, Name and Fraktion are the values carried forward for that speaker.
unique_new = unique_new.drop_duplicates(subset='Speaker', keep='first')

In [34]:
# Attach the matched Name / Fraktion to every speech row with a left join on 'Speaker'
# (all rows of df_speaker are kept; unmatched speakers get missing Name / Fraktion).
# Both frames carry 'Election_Period', so pandas suffixes them: Election_Period_x
# comes from df_speaker, Election_Period_y from unique_new.
# validate='many_to_one' makes pandas raise a MergeError if 'Speaker' is not unique in
# unique_new, which would otherwise silently duplicate speech rows.
combined_df = df_speaker.merge(unique_new, on='Speaker', how='left', validate='many_to_one')

In [35]:
# Preview the merged frame, including the suffixed Election_Period_x / Election_Period_y columns
combined_df.head()

Unnamed: 0,Election_Period_x,Session,Date,Start,End_Time,Speaker,Text_Spoken,Reactions,Election_Period_y,Name,Fraktion
0,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Meine sehr verehrten Damen und sehr geehrten H...,,15,Otto Schily,SPD
1,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Von Paul Löbeüber Konrad Adenauerbis hin zu al...,,15,Otto Schily,SPD
2,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,"Nur Willy Brandtwar 1983 acht Monate jünger, a...",,15,Otto Schily,SPD
3,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Das Amt des Alterspräsiden ten blieb Willy Bra...,,15,Otto Schily,SPD
4,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,"Den Hinweis daraufsollten Sie, was meine Leben...",Heiterkeit bei der SPD und dem BÜND NIS 90/DIE...,15,Otto Schily,SPD


In [36]:
# (rows, columns) after the merge; compare the row count with df_speaker.shape
combined_df.shape

(4864724, 11)

In [37]:
# Independent deep copy of the merged frame for the following steps.
combined_new = combined_df.copy(deep=True)

In [38]:
# Missing values per column; the Name / Fraktion counts are speech rows without a matched member
combined_new.isna().sum()

Election_Period_x          0
Session                    0
Date                       0
Start                      0
End_Time                   0
Speaker                    0
Text_Spoken                0
Reactions            4105647
Election_Period_y          0
Name                  334112
Fraktion              334112
dtype: int64

In [39]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Build a token-classification ("ner") pipeline from a single pretrained checkpoint.
# In the visible notebook, nlp_person is only referenced by the commented-out cell below.
ner_checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(ner_checkpoint)

nlp_person = pipeline("ner", model=model, tokenizer=tokenizer)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [94]:
# missing_names = combined_df[combined_df['Name'].isna()]
# 
# # Initialize a counter for B-PER entities
# b_per_count = 0
# 
# for text in missing_names['Speaker']:
#     ner_results = nlp_person(text)
#     person_entities = [entity for entity in ner_results if entity['entity'] == 'B-PER']
#     b_per_count += len(person_entities)
# 
# print(f"Number of B-PER entities found: {b_per_count}")

In [95]:
# TODO: combine the dataframes and set the name to None where no member was matched.
# TODO: for rows without a name, take the last identified name/speaker name for that line of text.

In [40]:
# Ensure the column contains string type for the search to work correctly
# (astype(str) also converts missing values into plain strings such as 'nan')
df_parties['Name'] = df_parties['Name'].astype(str)

# Search for 'Tobias' in the 'Name' column to see which members share that first name
filtered_df = df_parties[df_parties['Name'].str.contains('Tobias', na=False)]
filtered_df

Unnamed: 0,Fraktion,Position,Name,Election_Period
23,Grüne,Abgeordnete*r,Tobias B. Bacherle,20
381,Grüne,Abgeordnete*r,Tobias Lindner,20
493,AfD,Abgeordnete*r,Tobias Matthias Peterka,20
717,CDU/CSU (CSU),Abgeordnete*r,Tobias Winkler,20
1126,Grüne,Abgeordnete*r,Tobias Lindner,19
1242,AfD,Abgeordnete*r,Tobias Matthias Peterka,19
1246,Linke,Abgeordnete*r,Tobias Pflüger,19
1817,GRÜNE,Abgeordnete*r,Tobias Lindner,18
2123,CSU,Abgeordnete*r,Tobias Zech,18
2488,GRÜNE,Abgeordnete*r,Tobias Lindner,17


In [41]:
# Keep the first speech row for each distinct matched Name (used for inspection below).
clean_combined = combined_new.drop_duplicates(subset='Name', keep='first')

In [42]:
# One row per member name; the first row in df_parties order supplies that member's
# Fraktion, Position and Election_Period.
parties_no_duplicates = df_parties.drop_duplicates(subset='Name', keep='first')

In [43]:
# Display one example speech row per distinct matched Name
clean_combined

Unnamed: 0,Election_Period_x,Session,Date,Start,End_Time,Speaker,Text_Spoken,Reactions,Election_Period_y,Name,Fraktion
0,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Meine sehr verehrten Damen und sehr geehrten H...,,15,Otto Schily,SPD
38,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Franz Müntefering,Herr Präsident!,,15,Franz Müntefering,SPD
82,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Wolfgang Thierse,"Herr Alterspräsident, ich nehme die Wahl an.",Beifall im ganzen Hause Abgeordnete aller Frak...,15,Wolfgang Thierse,SPD
143,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Eugen Gerstenmaier,dern möglichst gute Gesetze machen.,,15,,
183,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Dr. Gesine Lötzsch,Herr Präsident!,,15,Gesine Lötzsch,PDS
...,...,...,...,...,...,...,...,...,...,...,...
4654662,20,87,2023-03-01,0 days 13:00:00,0 days 18:56:00,Emily Vontz,"Vielen Dank, Frau Präsidentin.",,20,Emily Vontz,SPD
4675568,20,91,2023-03-16,0 days 09:00:00,0 days 23:03:00,Alexander Föhr,Vielen Dank für die freundliche Begrüßung in d...,,20,Alexander Föhr,CDU/CSU (CDU)
4688137,20,94,2023-03-30,0 days 09:00:00,0 days 21:27:00,Dirk Ulrich Mende,Sehr geehrte Frau Präsidentin!,,20,Dirk-Ulrich Mende,SPD
4746219,20,107,2023-05-26,0 days 09:00:00,0 days 16:06:00,Ana Maria Trăsnea,Sehr geehrte Frau Präsidentin!,,20,Ana-Maria Trăsnea,SPD


In [44]:
# Look up each matched Name in the de-duplicated parties table; the left join keeps
# every speech row. Both frames have a 'Fraktion' column, so the result contains
# Fraktion_x (from combined_new) and Fraktion_y (from parties_no_duplicates).
# validate='many_to_one' makes pandas raise a MergeError if a Name occurs more than
# once in parties_no_duplicates, guarding against accidental row multiplication.
final_df = combined_new.merge(parties_no_duplicates, on='Name', how='left', validate='many_to_one')

In [45]:
# (rows, columns) after joining the parties table
final_df.shape

(4864724, 14)

In [46]:
# Missing values per column after joining the parties table
final_df.isna().sum()

Election_Period_x          0
Session                    0
Date                       0
Start                      0
End_Time                   0
Speaker                    0
Text_Spoken                0
Reactions            4105647
Election_Period_y          0
Name                  334112
Fraktion_x            334112
Fraktion_y            334112
Position              334112
Election_Period       334112
dtype: int64

# Normalize party names so they are consistent

In [47]:
# Mapping from the party / parliamentary-group spellings found in the data to one
# canonical label per party.
# NOTE(review): 'fraktionslos(SSW)' has no space before the parenthesis, unlike the
# other 'fraktionslos (...)' keys; confirm that this is the spelling in the data.
replacement_dict = {
    'CDU/CSU (CDU)': 'CDU',
    'CDU/CSU (CSU)': 'CSU',
    'fraktionslos (AfD)': 'AfD',
    'AfD (parteilos)': 'AfD',
    'Grüne': 'Die Grünen',
    'GRÜNE': 'Die Grünen',
    'Bündnis 90/Die Grünen': 'Die Grünen',
    'Die Linke': 'Die Linke',
    'DIE LINKE': 'Die Linke',
    'Linke': 'Die Linke',
    'Linke (parteilos)': 'Die Linke',
    'fraktionslos (LKR)': 'LKR',
    'fraktionslos(SSW)': 'SSW'
}

# Replace the values in both party columns produced by the merge.
# Fraktion_y is the column that is later renamed to 'Party' and written to the output
# file, while Fraktion_x is dropped, so normalizing only Fraktion_x would leave the
# exported party names inconsistent.
for fraktion_column in ('Fraktion_x', 'Fraktion_y'):
    final_df[fraktion_column] = final_df[fraktion_column].replace(replacement_dict)

In [48]:
# (rows, columns) after replacing party names
final_df.shape

(4864724, 14)

In [49]:
# Missing values per column after replacing party names
final_df.isna().sum()

Election_Period_x          0
Session                    0
Date                       0
Start                      0
End_Time                   0
Speaker                    0
Text_Spoken                0
Reactions            4105647
Election_Period_y          0
Name                  334112
Fraktion_x            334112
Fraktion_y            334112
Position              334112
Election_Period       334112
dtype: int64

In [50]:
# Discard speech rows for which no member name was matched.
final_df = final_df.dropna(subset=['Name'])

In [51]:
# (rows, columns) remaining after dropping rows without a matched Name
final_df.shape

(4530612, 14)

# Change column names

In [56]:
# Preview of the final frame.
# NOTE: this cell (In [56]) was executed after the drop/rename cells that follow it
# (In [53], In [55]); on a top-to-bottom run it would show the columns before renaming.
final_df.head()

Unnamed: 0,Election_Period,Session,Date,Start,End_Time,Speaker,Text_Spoken,Reactions,Name,Party,Position
0,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Meine sehr verehrten Damen und sehr geehrten H...,,Otto Schily,SPD,Abgeordnete*r
1,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Von Paul Löbeüber Konrad Adenauerbis hin zu al...,,Otto Schily,SPD,Abgeordnete*r
2,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,"Nur Willy Brandtwar 1983 acht Monate jünger, a...",,Otto Schily,SPD,Abgeordnete*r
3,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Das Amt des Alterspräsiden ten blieb Willy Bra...,,Otto Schily,SPD,Abgeordnete*r
4,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,"Den Hinweis daraufsollten Sie, was meine Leben...",Heiterkeit bei der SPD und dem BÜND NIS 90/DIE...,Otto Schily,SPD,Abgeordnete*r


In [53]:
# Remove the duplicated period columns and Fraktion_x (the party taken from the
# speaker-matching step); Fraktion_y from the parties lookup is the one kept.
final_df = final_df.drop(columns=['Election_Period', 'Election_Period_y', 'Fraktion_x'])

In [55]:
# Give the remaining suffixed columns their final names.
final_df = final_df.rename(columns={'Fraktion_y': 'Party', 'Election_Period_x': 'Election_Period'})

In [58]:
# Write the final frame to disk without the index column.
combined_csv = '/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/Final_DF/combined_df.csv'
final_df.to_csv(combined_csv, index=False)