# Imports and pickle

In [6]:
import pandas as pd
import pickle
import re
import os
import PyPDF2
from joblib import Parallel, delayed

In [137]:
# Define the folder where your .pkl files are located
folder_path = "/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/pickel_22_copy"

# Initialize an empty dictionary to store loaded DataFrames
loaded_dfs = {}

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".pkl"):
        file_path = os.path.join(folder_path, filename)
        
        # Load the pickled object
        with open(file_path, 'rb') as f:
            loaded_object = pickle.load(f)
        
        # Check if the loaded object is a DataFrame
        if isinstance(loaded_object, pd.DataFrame):
            loaded_dfs[filename] = loaded_object
        elif isinstance(loaded_object, list):
            # Check if the list contains DataFrames
            if all(isinstance(item, pd.DataFrame) for item in loaded_object):
                # Convert list of DataFrames to a single DataFrame
                concatenated_df = pd.concat(loaded_object)
                loaded_dfs[filename] = concatenated_df
            else:
                print(f"Skipping {filename}, list does not contain DataFrames.")
        else:
            print(f"Skipping {filename}, not a DataFrame or list of DataFrames.")

# Concatenate individual DataFrames into one big DataFrame
if loaded_dfs:
    df = pd.concat(loaded_dfs.values(), keys=loaded_dfs.keys())
else:
    print("No DataFrames were loaded.")

# Missing values

In [138]:
df.shape

(5120245, 8)

In [139]:
df.isna().sum()

Wahlperiode          0
Sitzung              0
Date                 0
Start           865122
Schluss         921065
Speaker              0
Text_Spoken          0
Reactions      4324247
dtype: int64

In [140]:
df

Unnamed: 0,Unnamed: 1,Wahlperiode,Sitzung,Date,Start,Schluss,Speaker,Text_Spoken,Reactions
dfs_batch_58.pkl,1,20,004,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,Ich wünsche Ihnen allen einen schönen guten Ta...,
dfs_batch_58.pkl,2,20,004,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,Die Sitzung ist eröffnet.,
dfs_batch_58.pkl,3,20,004,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,Vor Eintritt in die Tagesordnung: Liebe Kolleg...,
dfs_batch_58.pkl,4,20,004,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,"Liebe Kolleginnen und Kollegen, die Covid 19 P...",
dfs_batch_58.pkl,5,20,004,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,Laut Robert Koch Institut sind die aktuellen F...,
...,...,...,...,...,...,...,...,...,...
dfs_batch_55.pkl,4078,19,201,16. Dezember 2020,13:00,20:24,Vizepräsident Dr. Hans Peter Friedrich,Dann verfahren wir wie vorgeschlagen.,
dfs_batch_55.pkl,4079,19,201,16. Dezember 2020,13:00,20:24,Vizepräsident Dr. Hans Peter Friedrich,Wir sind am Schluss der heutigen Tagesordnung.,
dfs_batch_55.pkl,4080,19,201,16. Dezember 2020,13:00,20:24,Vizepräsident Dr. Hans Peter Friedrich,Ich berufe die nächste Sitzung des Deutschen B...,
dfs_batch_55.pkl,4081,19,201,16. Dezember 2020,13:00,20:24,Vizepräsident Dr. Hans Peter Friedrich,Die Sitzung ist geschlossen.,Schluss: 2024 Uhr.


# Rename columns

In [141]:
# Renaming the columns, you wont need this if you named columns correctly from the start
#df.rename(columns={'Wahlperiode': 'Election_Period', 'Sitzung': 'Session', 'Schluss': 'End_Time'}, inplace=True)

In [142]:
df

Unnamed: 0,Unnamed: 1,Election_Period,Session,Date,Start,End_Time,Speaker,Text_Spoken,Reactions
dfs_batch_58.pkl,1,20,004,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,Ich wünsche Ihnen allen einen schönen guten Ta...,
dfs_batch_58.pkl,2,20,004,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,Die Sitzung ist eröffnet.,
dfs_batch_58.pkl,3,20,004,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,Vor Eintritt in die Tagesordnung: Liebe Kolleg...,
dfs_batch_58.pkl,4,20,004,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,"Liebe Kolleginnen und Kollegen, die Covid 19 P...",
dfs_batch_58.pkl,5,20,004,7. Dezember 2021,12:00,13:41,Präsidentin Bärbel Bas,Laut Robert Koch Institut sind die aktuellen F...,
...,...,...,...,...,...,...,...,...,...
dfs_batch_55.pkl,4078,19,201,16. Dezember 2020,13:00,20:24,Vizepräsident Dr. Hans Peter Friedrich,Dann verfahren wir wie vorgeschlagen.,
dfs_batch_55.pkl,4079,19,201,16. Dezember 2020,13:00,20:24,Vizepräsident Dr. Hans Peter Friedrich,Wir sind am Schluss der heutigen Tagesordnung.,
dfs_batch_55.pkl,4080,19,201,16. Dezember 2020,13:00,20:24,Vizepräsident Dr. Hans Peter Friedrich,Ich berufe die nächste Sitzung des Deutschen B...,
dfs_batch_55.pkl,4081,19,201,16. Dezember 2020,13:00,20:24,Vizepräsident Dr. Hans Peter Friedrich,Die Sitzung ist geschlossen.,Schluss: 2024 Uhr.


# Ending-Time

## Find Schluss Pattern

In [143]:
# Identify missing Schluss
missing_schluss_df = df[df['End_Time'].isna()]
missing_schluss_tuples = missing_schluss_df.groupby(['Election_Period', 'Session']).size().reset_index(name='Missing_Count')

In [144]:
missing_schluss_tuples

Unnamed: 0,Election_Period,Session,Missing_Count
0,15,042,1660
1,15,083,2665
2,16,075,2216
3,16,130,4387
4,16,186,6539
...,...,...,...
184,19,107,11118
185,19,108,4939
186,19,109,694
187,19,112,4663


## Find a way to replace multiple at the same time

In [145]:
# Let us get the text to look for other Schluss patterns:
def extract_text_from_pdf(pdf):
    """

    :param pdf: 
    :return: 
    """
    with open(pdf, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page_num in range(len(reader.pages)):
            text += reader.pages[page_num].extract_text()
    return text

In [146]:
def extract_schluss(path):
    pdf_text = extract_text_from_pdf(path)
    end_time = None

    # Search for the pattern "Die Sitzung ist geschlossen. (Schluss: <time>)"
    pattern = r"\Schluss\:*\s*(\d{2}\s*\d*)"
    match = re.search(pattern, pdf_text)

    if match:
        # Extract the captured group which contains the time
        end_time = match.group(1).strip()

    return end_time

In [147]:
def extract_schluss_from_row(row):
    election_period = row['Election_Period']
    session = row['Session']
    
    pdf_path = f'/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/1998_2023/{election_period}{session}.pdf'
    end_time = extract_schluss(pdf_path)
    
    return (election_period, session, end_time)

results = Parallel(n_jobs=-2)(delayed(extract_schluss_from_row)(row) for _, row in missing_schluss_tuples.iterrows())

# Initialize a dictionary to hold new Schluss times
new_schluss = {}

for election_period, session, end_time in results:
    if end_time:
        new_schluss[(election_period, session)] = end_time

In [148]:
# Fill in the new Schluss times into the DataFrame
for (election_period, session), end_time in new_schluss.items():
    df.loc[(df['Election_Period'] == election_period) & (df['Session'] == session), 'End_Time'] = end_time

In [149]:
# Check how many missing values left
# Identify missing Schluss
missing_schluss_df = df[df['End_Time'].isna()]
missing_schluss_tuples = missing_schluss_df.groupby(['Election_Period', 'Session']).size().reset_index(name='Missing_Count')
missing_schluss_tuples

Unnamed: 0,Election_Period,Session,Missing_Count
0,15,83,2665
1,16,130,4387
2,17,89,2698
3,18,234,11190
4,18,243,9434
5,19,26,8021
6,19,29,8870
7,19,30,2435
8,19,36,9072
9,19,39,7008


In [150]:
df1 = df.copy()

## Drop all after (Schluss including Schluss)

In [151]:
# Initialize an empty list to collect cases where the sentence is not found
not_found_list = []

def truncate_after_session_closed(group):
    # Pattern to match variations of "Die Sitzung ist geschlossen."
    pattern = re.compile(r"\(Schluss", re.IGNORECASE)
    
    # Find index where pattern matches
    idx_closed = group[group['Text_Spoken'].apply(lambda x: bool(pattern.search(str(x))))].index
    
    # If pattern not found, append to not_found_list
    if len(idx_closed) == 0:
        not_found_list.append({'Election_Period': group['Election_Period'].iloc[0], 'Session': group['Session'].iloc[0]})
    
    return group.loc[:idx_closed[-1]] if len(idx_closed) > 0 else group

# Group by 'Election_Period' and 'Session' and apply the function
df_1grouped = df1.groupby(['Election_Period', 'Session'])
df_1truncated = df_1grouped.apply(truncate_after_session_closed).reset_index(drop=True)

# Convert not_found_list to a DataFrame
not_found_df = pd.DataFrame(not_found_list)

print(not_found_df)

     Election_Period Session
0                 15     001
1                 15     002
2                 15     003
3                 15     004
4                 15     005
...              ...     ...
1269              20     128
1270              20     129
1271              20     130
1272              20     131
1273              20     132

[1274 rows x 2 columns]


In [152]:
df2 = df_1truncated.copy()

In [153]:
df1.shape[0]-df2.shape[0]

1314

In [154]:
df2.isna().sum()

Election_Period          0
Session                  0
Date                     0
Start               864202
End_Time            150500
Speaker                  0
Text_Spoken              0
Reactions          4322956
dtype: int64

# Start

## Find Missing Start Time

In [155]:
# Filter rows where the Start column is missing in df_1
missing_start_df = df2[df2['Start'].isna()]

# Group by Wahlperiode and Sitzung and count the number of missing starts for each group
grouped_missing_starts = missing_start_df.groupby(['Election_Period', 'Session']).size().reset_index(name='Missing_Count')

# Display the result
print(grouped_missing_starts)

    Election_Period Session  Missing_Count
0                18     124           7722
1                18     125           3743
2                18     126           2438
3                18     127           6675
4                18     128           3283
..              ...     ...            ...
172              19     104           9760
173              19     105           5218
174              19     107          11118
175              19     108           4939
176              19     109            694

[177 rows x 3 columns]


In [156]:
# Function to extract the start time
def extract_start(path):
    pdf_text = extract_text_from_pdf(path)
    start_time = None
    
    # Search for the word "Beginn:"
    start_idx = pdf_text.find("Beginn")
    
    if start_idx != -1:
        # Extract the following text (assuming time format as HH:MM, hence taking 5 characters)
        start_time = pdf_text[start_idx + len("Beginn"): start_idx + len("Beginn") + 5].strip()
        
    return start_time

In [157]:
# Step 1: Identify missing rows
missing_start_tuples = grouped_missing_starts[['Election_Period', 'Session']].apply(tuple, axis=1)

# Initialize a dictionary to hold the new Start times
new_starts = {}

# Step 2: Loop through PDFs
for election_period, session in missing_start_tuples:
    pdf_path = f"/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/1998_2023/{election_period}{session}.pdf"  # Replace with your actual folder path
    
    try:
        # Extract Start time
        time = extract_start(pdf_path)
        
        if time:  # Check if time is not None or empty
            new_starts[(election_period, session)] = time
    except Exception as e:
        print(f"An error occurred while processing {pdf_path}. Error: {e}")

In [158]:
new_starts

{('18', '124'): ': 9',
 ('18', '125'): ': 9',
 ('18', '126'): ': 13',
 ('18', '127'): ': 9',
 ('18', '128'): ': 9',
 ('18', '129'): ': 13',
 ('18', '131'): ': 9',
 ('18', '132'): ': 13',
 ('18', '133'): ': 9',
 ('18', '135'): ': 13',
 ('18', '137'): ': 9',
 ('18', '138'): ': 10',
 ('18', '139'): ': 9',
 ('18', '140'): ': 9',
 ('18', '142'): 'der',
 ('18', '143'): ': 9',
 ('18', '144'): ': 9',
 ('18', '145'): ': 13',
 ('18', '146'): ': 9',
 ('18', '147'): ': 9',
 ('18', '148'): ': 13',
 ('18', '150'): ': 9',
 ('18', '151'): ': 13',
 ('18', '152'): ': 9',
 ('18', '153'): ': 9',
 ('18', '155'): ': 9',
 ('18', '156'): ': 9',
 ('18', '158'): ': 9',
 ('18', '159'): ': 9',
 ('18', '161'): ': 9',
 ('18', '162'): ': 9',
 ('18', '163'): ': 13',
 ('18', '165'): ': 9',
 ('18', '166'): ': 13',
 ('18', '167'): ': 9',
 ('18', '168'): ': 9',
 ('18', '169'): ': 13',
 ('18', '170'): ': 9',
 ('18', '171'): ': 9',
 ('18', '172'): ': 13',
 ('18', '173'): ': 9',
 ('18', '175'): ': 13',
 ('18', '176'): ': 9'

In [159]:
len(new_starts)

177

In [160]:
# Step 3: Update DataFrame
for (election_period, session), start_time in new_starts.items():
    df2.loc[(df2['Election_Period'] == election_period) & (df2['Session'] == session), 'Start'] = start_time

## Drop All Before Beginn

In [161]:
def truncate_after_session_closed(group):
    # Pattern to match variations of "Die Sitzung ist geschlossen."
    pattern = re.compile(r"Beginn", re.IGNORECASE)
    
    # Find index where pattern matches
    idx_start = group[group['Text_Spoken'].apply(lambda x: bool(pattern.search(str(x))))].index
    
    # If pattern not found, append to not_found_list
    if len(idx_start) == 0:
        not_found_list.append({'Election_Period': group['Election_Period'].iloc[0], 'Session': group['Session'].iloc[0]})
    else:
        # Select the first index where the pattern is found and add one
        idx_start = idx_start[0] + 1
        # Return the truncated group starting from idx_start to the end
        return group.loc[idx_start:]
    
    # If pattern not found, return the group unmodified
    return group

# Group by 'Election_Period' and 'Session' and apply the function
df_2grouped = df2.groupby(['Election_Period', 'Session'])
df_2truncated = df_2grouped.apply(truncate_after_session_closed).reset_index(drop=True)

# Convert not_found_list to a DataFrame
not_found_df = pd.DataFrame(not_found_list)

print(not_found_df)

     Election_Period Session
0                 15     001
1                 15     002
2                 15     003
3                 15     004
4                 15     005
...              ...     ...
1277              16     232
1278              17     185
1279              18     035
1280              18     223
1281              19     168

[1282 rows x 2 columns]


In [162]:
df3 = df_2truncated.copy()

In [163]:
df3.shape

(4864724, 8)

In [164]:
df3.isna().sum()

Election_Period          0
Session                  0
Date                     0
Start                    0
End_Time            146477
Speaker                  0
Text_Spoken              0
Reactions          4105647
dtype: int64

# Datatpyes

In [85]:
df3.dtypes

Election_Period    object
Session            object
Date               object
Start              object
End_Time           object
Speaker            object
Text_Spoken        object
Reactions          object
dtype: object

## Wahlperiode & Sitzung

In [165]:
df3['Election_Period'] = df3['Election_Period'].astype(int)
df3['Session'] = df3['Session'].astype(int)

In [166]:
df4 = df3.copy()

## Date

In [167]:
import calendar

In [168]:
# Create a mapping of German month names to English
german_to_english = {
    'Januar': 'January',
    'Februar': 'February',
    'März': 'March',
    'April': 'April',
    'Mai': 'May',
    'Juni': 'June',
    'Juli': 'July',
    'August': 'August',
    'September': 'September',
    'Oktober': 'October',
    'November': 'November',
    'Dezember': 'December'
}

In [169]:
# Replace German month names with English month names
df4['Date'] = df4['Date'].replace(german_to_english, regex=True)

# Now convert to datetime
df4['Temp_Date'] = pd.to_datetime(df4['Date'], format='%d. %B %Y', errors='coerce')

In [170]:

# Create a boolean mask for entries where the conversion failed
mask = df4['Temp_Date'].isna()

# Group by the original 'Date' column and additional columns for rows where the conversion failed
problematic_dates = df4[mask].groupby(['Election_Period', 'Session', 'Date']).size().reset_index(name='Count')

# Sort by 'Count' to easily spot frequently occurring problematic entries
problematic_dates = problematic_dates.sort_values(by='Count', ascending=False)

# Drop the temporary 'Temp_Date' column
df4.drop(columns=['Temp_Date'], inplace=True)

# Print out the problematic dates along with 'Election_Period' and 'Session'
print("Counts of problematic entries by Wahlperiode, Sitzung, and Date:")
print(problematic_dates)

Counts of problematic entries by Wahlperiode, Sitzung, and Date:
   Election_Period  Session        Date  Count
0               15       16  12. und 13   4243


In [171]:
# Manually update the Date here
df4.loc[(df4['Election_Period'] == 15) & (df4['Session'] == 16), 'Date'] = '19. December 2002'

In [172]:
df4['Date'] = pd.to_datetime(df4['Date'], format='%d. %B %Y')

In [173]:
df5 = df4.copy()

## Start

In [174]:
# First inspect how Start can looks like I think something like this
# dd needs to be lower than 24
# dd dd
# letters

In [175]:
# Strip
df5['Start'] = df5['Start'].str.strip()
df5['Start'] = df5['Start'].str.replace(r'\s+', '', regex=True)

In [176]:
def clean_start_time(t):
    # Check for any non-digit characters
        # Remove all whitespaces from the input
    t = re.sub(r'\s+', '', t)
    
    # Remove all characters except digits and colon
    t = re.sub(r'[^\d:]', '', t)
    if re.search('[a-zA-Z]', t):
        return None
    
    elif re.match(r'^:\d{1}$', t):
        return f"0{t[1]}:00:00"
        # Handle cases like "13:45" or 9:33 :9
    
    elif re.match(r'^:\d{2}$', t):
        return f"{t[1]}:00:00"
    
    elif ":" in t:
        parts = t.split(':')
        if len(parts) == 2:
            # Check if first part is empty and second part is a single digit
            if parts[0] == '' and parts[1].isdigit() and len(parts[1]) == 1:
                return f"0{parts[1]}:00:00"
            elif parts[0] == '' and parts[1].isdigit() and len(parts[1]) == 2:
                return f"{parts[1]}:00:00"
            # Existing checks for parts with length of 2 each
            elif all(part.isdigit() for part in parts):
                hours, minutes = parts
                if int(hours) < 24 and int(minutes) < 60:
                    return f"{hours.zfill(2)}:{minutes.zfill(2)}:00"
    
    # Length of 4 indicates HHMM format
    if len(t) == 4:
        return f"{t[:2]}:{t[2:]}:00"
    
    # Length of 3 indicates HMM format
    elif len(t) == 3:
        # Check if minutes exceed 59
        if int(t[1:]) < 60:
            return f"0{t[0]}:{t[1:]}:00"
        elif int(t[:1]) < 24:
            return f"0{t[:1]}:{t[2]}:00"
    
    # Length of 2 could be HH format
    elif len(t) == 2:
        # Check if it's a valid hour
        if int(t) <= 24:
            return f"{t}:00:00"

    # Length of 1 indicates a single digit hour
    elif len(t) == 1:
        return f"{t}:00:00"

    # Handle cases like "13 45"
    elif " " in t:
        hours, minutes = t.split()
        if int(hours) <= 24 and int(minutes) < 60:
            return f"{int(hours)}:{int(minutes)}:00"
        
    else:
        # If none of the above conditions met, return None
        return None

In [177]:
# Apply the function to the Start column
df5['Start'] = df5['Start'].apply(lambda x: clean_start_time(str(x)))

In [178]:
# Now convert the cleaned times to timedelta
df5['Start'] = pd.to_timedelta(df5['Start'].dropna())

In [179]:
df5.sample(10)

Unnamed: 0,Election_Period,Session,Date,Start,End_Time,Speaker,Text_Spoken,Reactions
667546,16,51,2006-09-21,0 days 09:01:00,20:05,Michael Glos,Das Prognosespektrumfür die Konjunktur geht na...,
2890597,18,169,2016-05-11,0 days 01:00:00,16,Dr. Joachim Pfeiffer,"Dafür gibt es ein Ver handlungsmandat, das auc...","Heike Hänsel [DIE LINKE]: Ja, die Bundes regie..."
3254531,19,5,2017-12-13,0 days 01:00:00,19,Ursula Groden Kranich,Lassen Sie mich an diesem letzten Sitzungstag ...,
3854191,19,139,2020-01-15,0 days 13:00:00,19:28,Vizepräsident Wolfgang Kubicki,"Vielen Dank, Herr Kollege Lucassen.",
4783088,20,115,2023-07-06,0 days 09:00:00,23:12,Hannes Gnauck,Und der Grund ist auch of fensichtlich: Der od...,Lachen des Abg. Manfred Todtenhausen [FDP].
3733144,19,111,2019-09-11,0 days 17:52:00,17:52,Katrin Göring Eckardt,"Das wäre die erste Erkenntnis, die man braucht.",
3014450,18,197,2016-10-21,0 days 09:00:00,34,Johann Saathoff,Liebe Kolleginnen und Kollegen!,
2415822,18,31,2014-04-11,0 days 09:01:00,13:11,Swen Schulz,"Ich will noch ein Thema ansprechen, dessen wir...",
3626210,19,90,2019-03-22,0 days 09:00:00,15,Friedrich Straetmanns,In diesem Sinne begrüßen wir die Schaffung ein...,
752514,16,78,2007-01-31,0 days 13:00:00,16:47,Vizepräsident Dr.Hermann Otto Solms,Wir kommen zur Frage15 des Kollegen Dr Edmund ...,


In [180]:
df6 = df5.copy()

## Schluss

In [181]:
def clean_end_time(t):    # Check for any non-digit characters
        # Remove all whitespaces from the input
    t = re.sub(r'\s+', '', t)
    
    # Remove all characters except digits and colon
    t = re.sub(r'[^\d:]', '', t)
    if re.search('[a-zA-Z]', t):
        return None
    
    elif re.match(r'^:\d{1}$', t):
        return f"0{t[1]}:00:00"
        # Handle cases like "13:45" or 9:33 :9
    
    elif re.match(r'^:\d{2}$', t):
        return f"{t[1]}:00:00"
    
    elif ":" in t:
        parts = t.split(':')
        if len(parts) == 2:
            # Check if first part is empty and second part is a single digit
            if parts[0] == '' and parts[1].isdigit() and len(parts[1]) == 1:
                return f"0{parts[1]}:00:00"
            elif parts[0] == '' and parts[1].isdigit() and len(parts[1]) == 2:
                return f"{parts[1]}:00:00"
            # Existing checks for parts with length of 2 each
            elif all(part.isdigit() for part in parts):
                hours, minutes = parts
                if int(hours) < 24 and int(minutes) < 60:
                    return f"{hours.zfill(2)}:{minutes.zfill(2)}:00"
    
    # Length of 4 indicates HHMM format
    if len(t) == 4:
        return f"{t[:2]}:{t[2:]}:00"
    
    # Length of 3 indicates HMM format
    elif len(t) == 3:
        # Check if minutes exceed 59
        if int(t[1:]) < 60:
            return f"0{t[0]}:{t[1:]}:00"
        elif int(t[:1]) < 24:
            return f"0{t[:1]}:{t[2]}:00"
    
    # Length of 2 could be HH format
    elif len(t) == 2:
        # Check if it's a valid hour
        if int(t) <= 24:
            return f"{t}:00:00"

    # Length of 1 indicates a single digit hour
    elif len(t) == 1:
        return f"{t}:00:00"

    # Handle cases like "13 45"
    elif " " in t:
        hours, minutes = t.split()
        if int(hours) <= 24 and int(minutes) < 60:
            return f"{int(hours)}:{int(minutes)}:00"
        
    else:
        # If none of the above conditions met, return None
        return None

In [182]:
# Apply the function to the Start column
df6['End_Time'] = df6['End_Time'].apply(lambda x: clean_end_time(str(x)))

In [183]:
# Now convert the cleaned times to timedelta
df6['End_Time'] = pd.to_timedelta(df6['End_Time'].dropna())

In [184]:
df6.head()

Unnamed: 0,Election_Period,Session,Date,Start,End_Time,Speaker,Text_Spoken,Reactions
0,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Meine sehr verehrten Damen und sehr geehrten H...,
1,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Von Paul Löbeüber Konrad Adenauerbis hin zu al...,
2,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,"Nur Willy Brandtwar 1983 acht Monate jünger, a...",
3,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Das Amt des Alterspräsiden ten blieb Willy Bra...,
4,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,"Den Hinweis daraufsollten Sie, was meine Leben...",Heiterkeit bei der SPD und dem BÜND NIS 90/DIE...


In [185]:
df7 = df6.copy()

# Impute Missing Values

In [186]:
df7.isna().sum()

Election_Period          0
Session                  0
Date                     0
Start                 9004
End_Time            159060
Speaker                  0
Text_Spoken              0
Reactions          4105647
dtype: int64

## Start

In [187]:
# Filter rows where the Start column is missing in df_1
missing_start_df = df7[df7['Start'].isna()]

# Group by Wahlperiode and Sitzung and count the number of missing starts for each group
grouped_missing_starts = missing_start_df.groupby(['Election_Period', 'Session']).size().reset_index(name='Missing_Count')

# Display the result
print(grouped_missing_starts)

   Election_Period  Session  Missing_Count
0               18      142           2157
1               19        6           1385
2               19       70           2486
3               19       82           2976


In [188]:
# Let me do Start by Hand
manual_starts = {
    (18, 142): '12:30:00',
    (19, 6): '13:00:00',
    (19, 70): '13:00:00',
    (19, 82): '13:00:00',
}

In [189]:
# Iterate over the manual_starts items
for (election_period, session), start_time in manual_starts.items():
    # Convert start_time to a timedelta object
    start_time_delta = pd.to_timedelta(start_time)
    print(start_time_delta)
    # Apply the timedelta where conditions are met
    df7.loc[(df7['Election_Period'] == election_period) & (df7['Session'] == session), 'Start'] = start_time_delta

0 days 12:30:00
0 days 13:00:00
0 days 13:00:00
0 days 13:00:00


In [190]:
df7.isna().sum()

Election_Period          0
Session                  0
Date                     0
Start                    0
End_Time            159060
Speaker                  0
Text_Spoken              0
Reactions          4105647
dtype: int64

In [191]:
df8 = df7.copy()

## Schluss

In [192]:
df8.isna().sum()

Election_Period          0
Session                  0
Date                     0
Start                    0
End_Time            159060
Speaker                  0
Text_Spoken              0
Reactions          4105647
dtype: int64

In [193]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Calculate the duration only for rows where 'End_Time' is not None
df8['Dauer'] = df8.apply(lambda row: row['End_Time'] - row['Start'] if pd.notna(row['End_Time']) else None, axis=1)

# Now convert 'Dauer' to seconds only for non-None values
df8['Dauer_in_Sekunden'] = df8['Dauer'].dt.total_seconds()

# Prepare your features and target variable, excluding rows where 'Dauer' is None
X = df8[['Election_Period', 'Session']]
y = df8['Dauer_in_Sekunden']

# Use only the rows with non-missing 'Dauer_in_Sekunden' to train the model
X_train = X[y.notna()]
y_train = y[y.notna()]

# Initialize the linear regression model and fit it to the data
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the 'Dauer_in_Sekunden' for missing values
X_missing = X[y.isna()]
print(X_missing)

         Election_Period  Session
217323                15       83
217324                15       83
217325                15       83
217326                15       83
217327                15       83
...                  ...      ...
3741134               19      112
3741135               19      112
3741136               19      112
3741137               19      112
3741138               19      112

[159060 rows x 2 columns]


In [194]:
predicted_durations = model.predict(X_missing)
predicted_durations

array([36555.16688178, 36555.16688178, 36555.16688178, ...,
       30464.01629555, 30464.01629555, 30464.01629555])

In [195]:

# Fill the missing 'Dauer_in_Sekunden' with the predicted values
df8.loc[y.isna(), 'Dauer_in_Sekunden'] = predicted_durations
df8.isna().sum()

Election_Period            0
Session                    0
Date                       0
Start                      0
End_Time              159060
Speaker                    0
Text_Spoken                0
Reactions            4105647
Dauer                 159060
Dauer_in_Sekunden          0
dtype: int64

In [196]:
missing_indices = df8.loc[df8['End_Time'].isna()].index
# Impute the missing 'End_Time' values using the predicted durations
df8.loc[missing_indices, 'End_Time'] = df8.loc[missing_indices, 'Start'] + pd.to_timedelta(df8.loc[missing_indices, 'Dauer_in_Sekunden'], unit='s')
df8.isna().sum()

Election_Period            0
Session                    0
Date                       0
Start                      0
End_Time                   0
Speaker                    0
Text_Spoken                0
Reactions            4105647
Dauer                 159060
Dauer_in_Sekunden          0
dtype: int64

In [197]:
# Clean up the DataFrame by dropping intermediate columns if desired
df8 = df8.drop(['Dauer', 'Dauer_in_Sekunden'], axis=1)

# View the updated DataFrame
df8.isna().sum()

Election_Period          0
Session                  0
Date                     0
Start                    0
End_Time                 0
Speaker                  0
Text_Spoken              0
Reactions          4105647
dtype: int64

In [198]:
df9 = df8.copy()

In [199]:
df9.dtypes

Election_Period              int64
Session                      int64
Date                datetime64[ns]
Start              timedelta64[ns]
End_Time           timedelta64[ns]
Speaker                     object
Text_Spoken                 object
Reactions                   object
dtype: object

In [200]:
df9.shape

(4864724, 8)

In [201]:
df9.isna().sum()

Election_Period          0
Session                  0
Date                     0
Start                    0
End_Time                 0
Speaker                  0
Text_Spoken              0
Reactions          4105647
dtype: int64

# Some final sentence cleaning

In [202]:
# Splitting each string in 'your_column' at the pattern and keeping only the part before it
df9['Text_Spoken'] = df9['Text_Spoken'].apply(lambda x: x.split('Geschiedenen Drucksachen')[0] + '.' if 'Geschiedenen Drucksachen' in x else x)
df9['Text_Spoken'] = df9['Text_Spoken'].apply(lambda x: x.split('Drucksachen')[0] + '.' if 'Geschiedenen Drucksachen' in x else x)
df9['Text_Spoken'] = df9['Text_Spoken'].replace('\d{2}\/\d{4}\,', '', regex=True)
df9['Text_Spoken'] = df9['Text_Spoken'].replace('\d{2}\/\d{4}', '', regex=True)

In [203]:
print('This worked')

This worked


In [204]:
df9.head()

Unnamed: 0,Election_Period,Session,Date,Start,End_Time,Speaker,Text_Spoken,Reactions
0,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Meine sehr verehrten Damen und sehr geehrten H...,
1,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Von Paul Löbeüber Konrad Adenauerbis hin zu al...,
2,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,"Nur Willy Brandtwar 1983 acht Monate jünger, a...",
3,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Das Amt des Alterspräsiden ten blieb Willy Bra...,
4,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,"Den Hinweis daraufsollten Sie, was meine Leben...",Heiterkeit bei der SPD und dem BÜND NIS 90/DIE...


In [205]:
df9.to_csv('/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/Final_DF/final_text_speaker_df.csv', index=False, encoding='utf-8')