# Imports and pickle

In [1]:
import pandas as pd
import pickle
import re
import os
import PyPDF2

In [2]:
# Define the folder where your .pkl files are located
folder_path = "/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Final_Project/speeches_bundestag/pickel_22_copy"

# Initialize an empty dictionary to store loaded DataFrames
loaded_dfs = {}

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".pkl"):
        file_path = os.path.join(folder_path, filename)
        
        # Load the pickled object
        with open(file_path, 'rb') as f:
            loaded_object = pickle.load(f)
        
        # Check if the loaded object is a DataFrame
        if isinstance(loaded_object, pd.DataFrame):
            loaded_dfs[filename] = loaded_object
        elif isinstance(loaded_object, list):
            # Check if the list contains DataFrames
            if all(isinstance(item, pd.DataFrame) for item in loaded_object):
                # Convert list of DataFrames to a single DataFrame
                concatenated_df = pd.concat(loaded_object)
                loaded_dfs[filename] = concatenated_df
            else:
                print(f"Skipping {filename}, list does not contain DataFrames.")
        else:
            print(f"Skipping {filename}, not a DataFrame or list of DataFrames.")

# Concatenate individual DataFrames into one big DataFrame
if loaded_dfs:
    df = pd.concat(loaded_dfs.values(), keys=loaded_dfs.keys())
else:
    print("No DataFrames were loaded.")

# Missing values

In [3]:
df.shape

(5120245, 8)

In [4]:
df.isna().sum()

Wahlperiode          0
Sitzung              0
Date                 0
Start           865122
Schluss         921065
Speaker              0
Text_Spoken          0
Reactions      4324247
dtype: int64

In [5]:
df.head(40)

Unnamed: 0,Unnamed: 1,Wahlperiode,Sitzung,Date,Start,Schluss,Speaker,Text_Spoken,Reactions
dfs_batch_58.pkl,1,20,4,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,Ich wünsche Ihnen allen einen schönen guten Ta...,
dfs_batch_58.pkl,2,20,4,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,Die Sitzung ist eröffnet.,
dfs_batch_58.pkl,3,20,4,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,Vor Eintritt in die Tagesordnung: Liebe Kolleg...,
dfs_batch_58.pkl,4,20,4,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,"Liebe Kolleginnen und Kollegen, die Covid 19 P...",
dfs_batch_58.pkl,5,20,4,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,Laut Robert Koch Institut sind die aktuellen F...,
dfs_batch_58.pkl,6,20,4,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,Vor diesem Hintergrund haben sich die Fraktion...,
dfs_batch_58.pkl,7,20,4,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,In meiner gestern in Kraft gesetzten Allgemein...,
dfs_batch_58.pkl,8,20,4,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,Vielmehr gilt die Pflicht zum Tragen einer med...,
dfs_batch_58.pkl,9,20,4,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,Die Ihnen bekannten Vorschriften zur Befreiung...,
dfs_batch_58.pkl,10,20,4,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,Wir kommen nun zur Abstimmung über die erweite...,


# Schluss-Time

## Find Schluss Pattern

In [6]:
# Identify missing Schluss
missing_schluss_df = df[df['Schluss'].isna()]
missing_schluss_tuples = missing_schluss_df.groupby(['Wahlperiode', 'Sitzung']).size().reset_index(name='Missing_Count')

In [7]:
missing_schluss_tuples

Unnamed: 0,Wahlperiode,Sitzung,Missing_Count
0,15,042,1660
1,15,083,2665
2,16,075,2216
3,16,130,4387
4,16,186,6539
...,...,...,...
184,19,107,11118
185,19,108,4939
186,19,109,694
187,19,112,4663


## Find a way to replace multiple at the same time

In [8]:
# Let us get the text to look for other Schluss patterns:
def extract_text_from_pdf(pdf):
    """

    :param pdf: 
    :return: 
    """
    with open(pdf, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page_num in range(len(reader.pages)):
            text += reader.pages[page_num].extract_text()
    return text

In [9]:
def extract_schluss(path):
    pdf_text = extract_text_from_pdf(path)
    schluss_time = None

    # Search for the pattern "Die Sitzung ist geschlossen. (Schluss: <time>)"
    pattern = r"\Schluss\:*\s*(\d{2}\s*\d*)"
    match = re.search(pattern, pdf_text)

    if match:
        # Extract the captured group which contains the time
        schluss_time = match.group(1).strip()

    return schluss_time

In [10]:
from joblib import Parallel, delayed

def extract_schluss_from_row(row):
    wahlperiode = row['Wahlperiode']
    sitzung = row['Sitzung']
    
    pdf_path = f'/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Final_Project/speeches_bundestag/1998_2023/{wahlperiode}{sitzung}.pdf'
    schluss_time = extract_schluss(pdf_path)
    
    return (wahlperiode, sitzung, schluss_time)

results = Parallel(n_jobs=-2)(delayed(extract_schluss_from_row)(row) for _, row in missing_schluss_tuples.iterrows())

# Initialize a dictionary to hold new Schluss times
new_schluss = {}

for wahlperiode, sitzung, schluss_time in results:
    if schluss_time:
        new_schluss[(wahlperiode, sitzung)] = schluss_time

In [11]:
new_schluss

{('15', '042'): '16',
 ('16', '075'): '17',
 ('16', '186'): '160',
 ('18', '064'): '16',
 ('18', '124'): '22',
 ('18', '125'): '16',
 ('18', '126'): '16',
 ('18', '127'): '21',
 ('18', '128'): '15',
 ('18', '129'): '17',
 ('18', '131'): '14',
 ('18', '132'): '16',
 ('18', '133'): '21',
 ('18', '135'): '17',
 ('18', '137'): '15',
 ('18', '138'): '19',
 ('18', '139'): '18',
 ('18', '140'): '20',
 ('18', '142'): '16',
 ('18', '143'): '21',
 ('18', '144'): '15',
 ('18', '145'): '17',
 ('18', '146'): '21',
 ('18', '147'): '13',
 ('18', '148'): '17',
 ('18', '150'): '14',
 ('18', '151'): '16',
 ('18', '152'): '21',
 ('18', '153'): '14',
 ('18', '155'): '20',
 ('18', '156'): '14',
 ('18', '158'): '21',
 ('18', '159'): '15',
 ('18', '161'): '21',
 ('18', '162'): '13',
 ('18', '163'): '17',
 ('18', '165'): '15',
 ('18', '166'): '16',
 ('18', '167'): '22',
 ('18', '168'): '14',
 ('18', '169'): '16',
 ('18', '170'): '2009',
 ('18', '171'): '14',
 ('18', '172'): '17',
 ('18', '173'): '22',
 ('18',

In [12]:
len(new_schluss)

169

In [13]:
# Fill in the new Schluss times into the DataFrame
for (wahlperiode, sitzung), schluss_time in new_schluss.items():
    df.loc[(df['Wahlperiode'] == wahlperiode) & (df['Sitzung'] == sitzung), 'Schluss'] = schluss_time

In [14]:
# Check how many missing values left
# Identify missing Schluss
missing_schluss_df = df[df['Schluss'].isna()]
missing_schluss_tuples = missing_schluss_df.groupby(['Wahlperiode', 'Sitzung']).size().reset_index(name='Missing_Count')
missing_schluss_tuples

Unnamed: 0,Wahlperiode,Sitzung,Missing_Count
0,15,83,2665
1,16,130,4387
2,17,89,2698
3,18,234,11190
4,18,243,9434
5,19,26,8021
6,19,29,8870
7,19,30,2435
8,19,36,9072
9,19,39,7008


In [15]:
# 14 042
df.loc[(df['Wahlperiode'] == '15') & (df['Sitzung'] == '083')].tail(20)

Unnamed: 0,Unnamed: 1,Wahlperiode,Sitzung,Date,Start,Schluss,Speaker,Text_Spoken,Reactions
dfs_batch_4.pkl,2646,15,83,12. Dezember 2003,9:00,,Ekin Deligöz,"Deshalb mein Appell an die FDP im Bund, besond...",
dfs_batch_4.pkl,2647,15,83,12. Dezember 2003,9:00,,Ekin Deligöz,Selbstverständlichwerden wir ihre Anregungen p...,
dfs_batch_4.pkl,2648,15,83,12. Dezember 2003,9:00,,Ekin Deligöz,Im Bereich der Ta gespflege gibt es natürlich ...,
dfs_batch_4.pkl,2649,15,83,12. Dezember 2003,9:00,,Ekin Deligöz,Ich denke da beispielsweise an das Qualitätsma...,
dfs_batch_4.pkl,2650,15,83,12. Dezember 2003,9:00,,Ekin Deligöz,Hier gibt es gute Verbesserungsansätze.,
dfs_batch_4.pkl,2651,15,83,12. Dezember 2003,9:00,,Ekin Deligöz,Die Ta gespflege muss mit Nachdruck aus der Gr...,
dfs_batch_4.pkl,2652,15,83,12. Dezember 2003,9:00,,Ekin Deligöz,"Sie hat besondere Vorzüge, die ihr auchzukünft...",
dfs_batch_4.pkl,2653,15,83,12. Dezember 2003,9:00,,Ekin Deligöz,Hier muss aber ein schlüssiges Gesamtkonzept u...,
dfs_batch_4.pkl,2654,15,83,12. Dezember 2003,9:00,,Ekin Deligöz,Das ist eine große Herausforderung für unsalle.,
dfs_batch_4.pkl,2655,15,83,12. Dezember 2003,9:00,,Ekin Deligöz,Wenn wir unsere Gesellschaft innovativ umgesta...,


In [16]:
df1 = df.copy()

## Drop all after (Schluss including Schluss

In [17]:
# Initialize an empty list to collect cases where the sentence is not found
not_found_list = []

def truncate_after_session_closed(group):
    # Pattern to match variations of "Die Sitzung ist geschlossen."
    pattern = re.compile(r"\(Schluss", re.IGNORECASE)
    
    # Find index where pattern matches
    idx_closed = group[group['Text_Spoken'].apply(lambda x: bool(pattern.search(str(x))))].index
    
    # If pattern not found, append to not_found_list
    if len(idx_closed) == 0:
        not_found_list.append({'Wahlperiode': group['Wahlperiode'].iloc[0], 'Sitzung': group['Sitzung'].iloc[0]})
    
    return group.loc[:idx_closed[-1]] if len(idx_closed) > 0 else group

# Group by 'Wahlperiode' and 'Sitzung' and apply the function
df_1grouped = df1.groupby(['Wahlperiode', 'Sitzung'])
df_1truncated = df_1grouped.apply(truncate_after_session_closed).reset_index(drop=True)

# Convert not_found_list to a DataFrame
not_found_df = pd.DataFrame(not_found_list)

print(not_found_df)

     Wahlperiode Sitzung
0             15     001
1             15     002
2             15     003
3             15     004
4             15     005
...          ...     ...
1269          20     128
1270          20     129
1271          20     130
1272          20     131
1273          20     132

[1274 rows x 2 columns]


In [18]:
df2 = df_1truncated.copy()

In [19]:
df1.shape[0]-df2.shape[0]

1314

In [20]:
df2.isna().sum()

Wahlperiode          0
Sitzung              0
Date                 0
Start           864202
Schluss         150500
Speaker              0
Text_Spoken          0
Reactions      4322956
dtype: int64

# Start

## Find Missing Start Time

In [21]:
# Filter rows where the Start column is missing in df_1
missing_start_df = df2[df2['Start'].isna()]

# Group by Wahlperiode and Sitzung and count the number of missing starts for each group
grouped_missing_starts = missing_start_df.groupby(['Wahlperiode', 'Sitzung']).size().reset_index(name='Missing_Count')

# Display the result
print(grouped_missing_starts)

    Wahlperiode Sitzung  Missing_Count
0            18     124           7722
1            18     125           3743
2            18     126           2438
3            18     127           6675
4            18     128           3283
..          ...     ...            ...
172          19     104           9760
173          19     105           5218
174          19     107          11118
175          19     108           4939
176          19     109            694

[177 rows x 3 columns]


In [22]:
# Function to extract the start time
def extract_start(path):
    pdf_text = extract_text_from_pdf(path)
    start_time = None
    
    # Search for the word "Beginn:"
    start_idx = pdf_text.find("Beginn")
    
    if start_idx != -1:
        # Extract the following text (assuming time format as HH:MM, hence taking 5 characters)
        start_time = pdf_text[start_idx + len("Beginn"): start_idx + len("Beginn") + 5].strip()
        
    return start_time

In [23]:
# Step 1: Identify missing rows
missing_start_tuples = grouped_missing_starts[['Wahlperiode', 'Sitzung']].apply(tuple, axis=1)

# Initialize a dictionary to hold the new Start times
new_starts = {}

# Step 2: Loop through PDFs
for wahlperiode, sitzung in missing_start_tuples:
    pdf_path = f"/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Final_Project/speeches_bundestag/1998_2023/{wahlperiode}{sitzung}.pdf"  # Replace with your actual folder path
    
    try:
        # Extract Start time
        time = extract_start(pdf_path)
        
        if time:  # Check if time is not None or empty
            new_starts[(wahlperiode, sitzung)] = time
    except Exception as e:
        print(f"An error occurred while processing {pdf_path}. Error: {e}")

In [24]:
new_starts

{('18', '124'): ': 9',
 ('18', '125'): ': 9',
 ('18', '126'): ': 13',
 ('18', '127'): ': 9',
 ('18', '128'): ': 9',
 ('18', '129'): ': 13',
 ('18', '131'): ': 9',
 ('18', '132'): ': 13',
 ('18', '133'): ': 9',
 ('18', '135'): ': 13',
 ('18', '137'): ': 9',
 ('18', '138'): ': 10',
 ('18', '139'): ': 9',
 ('18', '140'): ': 9',
 ('18', '142'): 'der',
 ('18', '143'): ': 9',
 ('18', '144'): ': 9',
 ('18', '145'): ': 13',
 ('18', '146'): ': 9',
 ('18', '147'): ': 9',
 ('18', '148'): ': 13',
 ('18', '150'): ': 9',
 ('18', '151'): ': 13',
 ('18', '152'): ': 9',
 ('18', '153'): ': 9',
 ('18', '155'): ': 9',
 ('18', '156'): ': 9',
 ('18', '158'): ': 9',
 ('18', '159'): ': 9',
 ('18', '161'): ': 9',
 ('18', '162'): ': 9',
 ('18', '163'): ': 13',
 ('18', '165'): ': 9',
 ('18', '166'): ': 13',
 ('18', '167'): ': 9',
 ('18', '168'): ': 9',
 ('18', '169'): ': 13',
 ('18', '170'): ': 9',
 ('18', '171'): ': 9',
 ('18', '172'): ': 13',
 ('18', '173'): ': 9',
 ('18', '175'): ': 13',
 ('18', '176'): ': 9'

In [25]:
len(new_starts)

177

In [26]:
# Step 3: Update DataFrame
for (wahlperiode, sitzung), start_time in new_starts.items():
    df2.loc[(df2['Wahlperiode'] == wahlperiode) & (df2['Sitzung'] == sitzung), 'Start'] = start_time

## Drop All Before Beginn

In [27]:
def truncate_after_session_closed(group):
    # Pattern to match variations of "Die Sitzung ist geschlossen."
    pattern = re.compile(r"Beginn", re.IGNORECASE)
    
    # Find index where pattern matches
    idx_start = group[group['Text_Spoken'].apply(lambda x: bool(pattern.search(str(x))))].index
    
    # If pattern not found, append to not_found_list
    if len(idx_start) == 0:
        not_found_list.append({'Wahlperiode': group['Wahlperiode'].iloc[0], 'Sitzung': group['Sitzung'].iloc[0]})
    else:
        # Select the first index where the pattern is found and add one
        idx_start = idx_start[0] + 1
        # Return the truncated group starting from idx_start to the end
        return group.loc[idx_start:]
    
    # If pattern not found, return the group unmodified
    return group

# Group by 'Wahlperiode' and 'Sitzung' and apply the function
df_2grouped = df2.groupby(['Wahlperiode', 'Sitzung'])
df_2truncated = df_2grouped.apply(truncate_after_session_closed).reset_index(drop=True)

# Convert not_found_list to a DataFrame
not_found_df = pd.DataFrame(not_found_list)

print(not_found_df)

     Wahlperiode Sitzung
0             15     001
1             15     002
2             15     003
3             15     004
4             15     005
...          ...     ...
1277          16     232
1278          17     185
1279          18     035
1280          18     223
1281          19     168

[1282 rows x 2 columns]


In [28]:
df3 = df_2truncated.copy()

In [29]:
df3.shape

(4864724, 8)

In [30]:
df3.isna().sum()

Wahlperiode          0
Sitzung              0
Date                 0
Start                0
Schluss         146477
Speaker              0
Text_Spoken          0
Reactions      4105647
dtype: int64

# Datatpyes

In [31]:
df3.dtypes

Wahlperiode    object
Sitzung        object
Date           object
Start          object
Schluss        object
Speaker        object
Text_Spoken    object
Reactions      object
dtype: object

## Wahlperiode & Sitzung

In [32]:
df3['Wahlperiode'] = df3['Wahlperiode'].astype(int)
df3['Sitzung'] = df3['Sitzung'].astype(int)

In [33]:
df4 = df3.copy()

## Date

In [34]:
import calendar

In [35]:
# Create a mapping of German month names to English
german_to_english = {
    'Januar': 'January',
    'Februar': 'February',
    'März': 'March',
    'April': 'April',
    'Mai': 'May',
    'Juni': 'June',
    'Juli': 'July',
    'August': 'August',
    'September': 'September',
    'Oktober': 'October',
    'November': 'November',
    'Dezember': 'December'
}

In [36]:
# Replace German month names with English month names
df4['Date'] = df4['Date'].replace(german_to_english, regex=True)

# Now convert to datetime
df4['Temp_Date'] = pd.to_datetime(df4['Date'], format='%d. %B %Y', errors='coerce')

In [37]:

# Create a boolean mask for entries where the conversion failed
mask = df4['Temp_Date'].isna()

# Group by the original 'Date' column and additional columns for rows where the conversion failed
problematic_dates = df4[mask].groupby(['Wahlperiode', 'Sitzung', 'Date']).size().reset_index(name='Count')

# Sort by 'Count' to easily spot frequently occurring problematic entries
problematic_dates = problematic_dates.sort_values(by='Count', ascending=False)

# Drop the temporary 'Temp_Date' column
df4.drop(columns=['Temp_Date'], inplace=True)

# Print out the problematic dates along with 'Wahlperiode' and 'Sitzung'
print("Counts of problematic entries by Wahlperiode, Sitzung, and Date:")
print(problematic_dates)

Counts of problematic entries by Wahlperiode, Sitzung, and Date:
   Wahlperiode  Sitzung        Date  Count
0           15       16  12. und 13   4243


In [38]:
# Manually update the Date here
df4.loc[(df4['Wahlperiode'] == 15) & (df4['Sitzung'] == 16), 'Date'] = '19. December 2002'

In [39]:
df4['Date'] = pd.to_datetime(df4['Date'], format='%d. %B %Y')

In [40]:
df5 = df4.copy()

## Start

In [41]:
# First inspect how Start can looks like I think something like this
# dd needs to be lower than 24
# dd dd
# letters

In [42]:
# Strip
df5['Start'] = df5['Start'].str.strip()
df5['Start'] = df5['Start'].str.replace(r'\s+', '', regex=True)

In [43]:
def clean_start_time(t):
    # Check for any non-digit characters
        # Remove all whitespaces from the input
    t = re.sub(r'\s+', '', t)
    
    # Remove all characters except digits and colon
    t = re.sub(r'[^\d:]', '', t)
    if re.search('[a-zA-Z]', t):
        return None
    
    elif re.match(r'^:\d{1}$', t):
        return f"0{t[1]}:00:00"
        # Handle cases like "13:45" or 9:33 :9
    
    elif re.match(r'^:\d{2}$', t):
        return f"{t[1]}:00:00"
    
    elif ":" in t:
        parts = t.split(':')
        if len(parts) == 2:
            # Check if first part is empty and second part is a single digit
            if parts[0] == '' and parts[1].isdigit() and len(parts[1]) == 1:
                return f"0{parts[1]}:00:00"
            elif parts[0] == '' and parts[1].isdigit() and len(parts[1]) == 2:
                return f"{parts[1]}:00:00"
            # Existing checks for parts with length of 2 each
            elif all(part.isdigit() for part in parts):
                hours, minutes = parts
                if int(hours) < 24 and int(minutes) < 60:
                    return f"{hours.zfill(2)}:{minutes.zfill(2)}:00"
    
    # Length of 4 indicates HHMM format
    if len(t) == 4:
        return f"{t[:2]}:{t[2:]}:00"
    
    # Length of 3 indicates HMM format
    elif len(t) == 3:
        # Check if minutes exceed 59
        if int(t[1:]) < 60:
            return f"0{t[0]}:{t[1:]}:00"
        elif int(t[:1]) < 24:
            return f"0{t[:1]}:{t[2]}:00"
    
    # Length of 2 could be HH format
    elif len(t) == 2:
        # Check if it's a valid hour
        if int(t) <= 24:
            return f"{t}:00:00"

    # Length of 1 indicates a single digit hour
    elif len(t) == 1:
        return f"{t}:00:00"

    # Handle cases like "13 45"
    elif " " in t:
        hours, minutes = t.split()
        if int(hours) <= 24 and int(minutes) < 60:
            return f"{int(hours)}:{int(minutes)}:00"
        
    else:
        # If none of the above conditions met, return None
        return None

In [44]:
# Apply the function to the Start column
df5['Start'] = df5['Start'].apply(lambda x: clean_start_time(str(x)))

18 138

In [45]:
# Now convert the cleaned times to timedelta
df5['Start'] = pd.to_timedelta(df5['Start'].dropna())

In [46]:
df5.sample(10)

Unnamed: 0,Wahlperiode,Sitzung,Date,Start,Schluss,Speaker,Text_Spoken,Reactions
137593,15,54,2003-06-27,0 days 09:00:00,14:22,Werner Wittlich,Auf der einen Seite arbeiten in Deutschland 13...,
3462134,19,55,2018-10-11,0 days 09:00:00,,Markus Kurth,Daneben erfordert die erfreulicherweise höhere...,
1248050,16,221,2009-05-13,0 days 13:01:00,21:48,Omid Nouripour,Dasstimmt nicht.,
356248,15,132,2004-10-21,0 days 09:00:00,22:11,Vizepräsident Dr. Norbert Lammert,Nun hat der Kollege Georg Brunnhuber für die C...,Beifall bei der CDU/CSU Wilhelm Schmidt[Salzgi...
731678,16,70,2006-11-30,0 days 09:00:00,23:11,Irmingard Schewe Gerigk,Über die progressive Beitragssenkung wollen wi...,
2658436,18,103,2015-05-07,0 days 09:00:00,20:33,Johannes Röring,Die Initiative ist ein erster Ansatz.,
1905251,17,156,2012-01-27,0 days 10:30:00,15:00,Tabea Rößner,Wenn digitale Angebote jedoch nicht barrierefr...,
698879,16,61,2006-10-27,0 days 09:01:00,14:52,Renate Künast,"Sie haben gesagt, es solle ein Konzept für nac...",
4390326,20,28,2022-04-07,0 days 09:00:00,23:29,Dr. Christoph Plo,"Aber ich denke mal, zwischen den Parteien in d...",
2420488,18,33,2014-05-08,0 days 09:02:00,22:53,Omid Nouripour,Deswegen muss man diese Double Standards ge ra...,


In [47]:
df6 = df5.copy()

## Schluss

In [48]:
def clean_schluss_time(t):    # Check for any non-digit characters
        # Remove all whitespaces from the input
    t = re.sub(r'\s+', '', t)
    
    # Remove all characters except digits and colon
    t = re.sub(r'[^\d:]', '', t)
    if re.search('[a-zA-Z]', t):
        return None
    
    elif re.match(r'^:\d{1}$', t):
        return f"0{t[1]}:00:00"
        # Handle cases like "13:45" or 9:33 :9
    
    elif re.match(r'^:\d{2}$', t):
        return f"{t[1]}:00:00"
    
    elif ":" in t:
        parts = t.split(':')
        if len(parts) == 2:
            # Check if first part is empty and second part is a single digit
            if parts[0] == '' and parts[1].isdigit() and len(parts[1]) == 1:
                return f"0{parts[1]}:00:00"
            elif parts[0] == '' and parts[1].isdigit() and len(parts[1]) == 2:
                return f"{parts[1]}:00:00"
            # Existing checks for parts with length of 2 each
            elif all(part.isdigit() for part in parts):
                hours, minutes = parts
                if int(hours) < 24 and int(minutes) < 60:
                    return f"{hours.zfill(2)}:{minutes.zfill(2)}:00"
    
    # Length of 4 indicates HHMM format
    if len(t) == 4:
        return f"{t[:2]}:{t[2:]}:00"
    
    # Length of 3 indicates HMM format
    elif len(t) == 3:
        # Check if minutes exceed 59
        if int(t[1:]) < 60:
            return f"0{t[0]}:{t[1:]}:00"
        elif int(t[:1]) < 24:
            return f"0{t[:1]}:{t[2]}:00"
    
    # Length of 2 could be HH format
    elif len(t) == 2:
        # Check if it's a valid hour
        if int(t) <= 24:
            return f"{t}:00:00"

    # Length of 1 indicates a single digit hour
    elif len(t) == 1:
        return f"{t}:00:00"

    # Handle cases like "13 45"
    elif " " in t:
        hours, minutes = t.split()
        if int(hours) <= 24 and int(minutes) < 60:
            return f"{int(hours)}:{int(minutes)}:00"
        
    else:
        # If none of the above conditions met, return None
        return None

In [49]:
# Apply the function to the Start column
df6['Schluss'] = df6['Schluss'].apply(lambda x: clean_schluss_time(str(x)))

In [50]:
# Now convert the cleaned times to timedelta
df6['Schluss'] = pd.to_timedelta(df6['Schluss'].dropna())

In [51]:
df6.head()

Unnamed: 0,Wahlperiode,Sitzung,Date,Start,Schluss,Speaker,Text_Spoken,Reactions
0,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Meine sehr verehrten Damen und sehr geehrten H...,
1,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Von Paul Löbeüber Konrad Adenauerbis hin zu al...,
2,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,"Nur Willy Brandtwar 1983 acht Monate jünger, a...",
3,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Das Amt des Alterspräsiden ten blieb Willy Bra...,
4,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,"Den Hinweis daraufsollten Sie, was meine Leben...",Heiterkeit bei der SPD und dem BÜND NIS 90/DIE...


In [52]:
df7 = df6.copy()

# Impute Missing Values

In [53]:
df7.isna().sum()

Wahlperiode          0
Sitzung              0
Date                 0
Start             9004
Schluss         159060
Speaker              0
Text_Spoken          0
Reactions      4105647
dtype: int64

## Start

In [54]:
# Filter rows where the Start column is missing in df_1
missing_start_df = df7[df7['Start'].isna()]

# Group by Wahlperiode and Sitzung and count the number of missing starts for each group
grouped_missing_starts = missing_start_df.groupby(['Wahlperiode', 'Sitzung']).size().reset_index(name='Missing_Count')

# Display the result
print(grouped_missing_starts)

   Wahlperiode  Sitzung  Missing_Count
0           18      142           2157
1           19        6           1385
2           19       70           2486
3           19       82           2976


In [55]:
# Let me do Start by Hand
manual_starts = {
    (18, 142): '12:30:00',
    (19, 6): '13:00:00',
    (19, 70): '13:00:00',
    (19, 82): '13:00:00',
}

In [56]:
# Iterate over the manual_starts items
for (wahlperiode, sitzung), start_time in manual_starts.items():
    # Convert start_time to a timedelta object
    start_time_delta = pd.to_timedelta(start_time)
    print(start_time_delta)
    # Apply the timedelta where conditions are met
    df7.loc[(df7['Wahlperiode'] == wahlperiode) & (df7['Sitzung'] == sitzung), 'Start'] = start_time_delta

0 days 12:30:00
0 days 13:00:00
0 days 13:00:00
0 days 13:00:00


In [57]:
df7.isna().sum()

Wahlperiode          0
Sitzung              0
Date                 0
Start                0
Schluss         159060
Speaker              0
Text_Spoken          0
Reactions      4105647
dtype: int64

In [58]:
df8 = df7.copy()

## Schluss

In [59]:
df8.isna().sum()

Wahlperiode          0
Sitzung              0
Date                 0
Start                0
Schluss         159060
Speaker              0
Text_Spoken          0
Reactions      4105647
dtype: int64

In [60]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Calculate the duration only for rows where 'Schluss' is not None
df8['Dauer'] = df8.apply(lambda row: row['Schluss'] - row['Start'] if pd.notna(row['Schluss']) else None, axis=1)

# Now convert 'Dauer' to seconds only for non-None values
df8['Dauer_in_Sekunden'] = df8['Dauer'].dt.total_seconds()

# Prepare your features and target variable, excluding rows where 'Dauer' is None
X = df8[['Wahlperiode', 'Sitzung']]
y = df8['Dauer_in_Sekunden']

# Use only the rows with non-missing 'Dauer_in_Sekunden' to train the model
X_train = X[y.notna()]
y_train = y[y.notna()]

# Initialize the linear regression model and fit it to the data
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the 'Dauer_in_Sekunden' for missing values
X_missing = X[y.isna()]
print(X_missing)

         Wahlperiode  Sitzung
217323            15       83
217324            15       83
217325            15       83
217326            15       83
217327            15       83
...              ...      ...
3741134           19      112
3741135           19      112
3741136           19      112
3741137           19      112
3741138           19      112

[159060 rows x 2 columns]


In [61]:
predicted_durations = model.predict(X_missing)
predicted_durations

array([36555.16688178, 36555.16688178, 36555.16688178, ...,
       30464.01629555, 30464.01629555, 30464.01629555])

In [62]:

# Fill the missing 'Dauer_in_Sekunden' with the predicted values
df8.loc[y.isna(), 'Dauer_in_Sekunden'] = predicted_durations
df8.isna().sum()

Wahlperiode                0
Sitzung                    0
Date                       0
Start                      0
Schluss               159060
Speaker                    0
Text_Spoken                0
Reactions            4105647
Dauer                 159060
Dauer_in_Sekunden          0
dtype: int64

In [63]:
missing_indices = df8.loc[df8['Schluss'].isna()].index
# Impute the missing 'Schluss' values using the predicted durations
df8.loc[missing_indices, 'Schluss'] = df8.loc[missing_indices, 'Start'] + pd.to_timedelta(df8.loc[missing_indices, 'Dauer_in_Sekunden'], unit='s')
df8.isna().sum()

Wahlperiode                0
Sitzung                    0
Date                       0
Start                      0
Schluss                    0
Speaker                    0
Text_Spoken                0
Reactions            4105647
Dauer                 159060
Dauer_in_Sekunden          0
dtype: int64

In [64]:
# Clean up the DataFrame by dropping intermediate columns if desired
df8 = df8.drop(['Dauer', 'Dauer_in_Sekunden'], axis=1)

# View the updated DataFrame
df8.isna().sum()

Wahlperiode          0
Sitzung              0
Date                 0
Start                0
Schluss              0
Speaker              0
Text_Spoken          0
Reactions      4105647
dtype: int64

In [65]:
df9 = df8.copy()

In [66]:
df9.dtypes

Wahlperiode              int64
Sitzung                  int64
Date            datetime64[ns]
Start          timedelta64[ns]
Schluss        timedelta64[ns]
Speaker                 object
Text_Spoken             object
Reactions               object
dtype: object

In [67]:
df9.shape

(4864724, 8)

In [68]:
df9.isna().sum()

Wahlperiode          0
Sitzung              0
Date                 0
Start                0
Schluss              0
Speaker              0
Text_Spoken          0
Reactions      4105647
dtype: int64

In [69]:
df9.to_csv('/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Final_Project/Final_Data/final.csv', index=False, encoding='utf-8')

In [70]:
df9.loc[(df9['Wahlperiode'] == 15) & (df9['Sitzung']== 9)]

Unnamed: 0,Wahlperiode,Sitzung,Date,Start,Schluss,Speaker,Text_Spoken,Reactions
12957,15,9,2002-11-13,0 days 13:00:00,0 days 16:57:00,Heidemarie Wieczorek Zeul,"Es wirdvor allem darum gehen, ein Umdenken zus...",
12958,15,9,2002-11-13,0 days 13:00:00,0 days 16:57:00,Heidemarie Wieczorek Zeul,"Es geht auch darum, in der Zusammenarbeit mit ...",
12959,15,9,2002-11-13,0 days 13:00:00,0 days 16:57:00,Heidemarie Wieczorek Zeul,"Es geht ferner darum,beim Aufbau eines unabhän...",
12960,15,9,2002-11-13,0 days 13:00:00,0 days 16:57:00,Heidemarie Wieczorek Zeul,Daran haltenwir die afghanische Regierung fest.,
12961,15,9,2002-11-13,0 days 13:00:00,0 days 16:57:00,Heidemarie Wieczorek Zeul,"Wie ich schon sagte, habe ich heute Nachmittag...",
...,...,...,...,...,...,...,...,...
14597,15,9,2002-11-13,0 days 13:00:00,0 days 16:57:00,Dr. Rainer Wend,Wenn Sie die Trennung konsequent zu Ende bring...,
14598,15,9,2002-11-13,0 days 13:00:00,0 days 16:57:00,Dr. Rainer Wend,"Diesen Trennungsstrich müssen Sie ziehen, um b...",Beifall bei der SPD und dem BÜNDNIS 90/DIE GRÜ...
14599,15,9,2002-11-13,0 days 13:00:00,0 days 16:57:00,Vizepräsident Dr. Norbert Lammert,"Meine Damen und Herren, wir sind am Ende der A...",
14600,15,9,2002-11-13,0 days 13:00:00,0 days 16:57:00,Vizepräsident Dr. Norbert Lammert,Ich berufe die nächste Sitzung des Deutschen B...,


In [71]:
df9.to_csv('/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Final_Project/speeches_bundestag/Final_DF/final_text_speaker_df_2.csv', index=False, encoding='utf-8')