In [1]:
from germansentiment import SentimentModel
import pandas as pd
import os
# Set TOKENIZERS_PARALLELISM to false before importing the tokenizers
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from transformers import pipeline
from transformers import AutoTokenizer
from datetime import datetime

In [2]:
# Instantiate the germansentiment model (constructed with no arguments).
# NOTE(review): `model` is not referenced by any later cell shown here; the
# analysis below goes through `sentiment_pipeline` — confirm whether this
# object is still needed.
model = SentimentModel()

In [3]:
# Load the combined speeches dataframe from disk.
# The path starts with a single slash, like every other path in this notebook;
# a leading "//" has implementation-defined meaning under POSIX path resolution.
df = pd.read_csv("/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Final_Project/speeches_bundestag/Final_DF/combined_df_2.csv")

# Prepare dataframe

In [4]:
# Replace the current index with a fresh 0..n-1 index and discard the old one
# (drop=True prevents the old index from being kept as a column).
# NOTE(review): read_csv above is called without index_col, so the index is
# presumably already the default one — confirm whether this cell is needed.
df = df.reset_index(drop=True)

In [5]:
# Remove columns that are not needed for the sentiment analysis.
# NOTE(review): 'Unnamed: 0' is presumably a saved CSV index and the _x/_y
# columns leftovers of an earlier merge — confirm against the source data.
# Rebinding `df` with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
df = df.drop(
    columns=['Unnamed: 0', 'Wahlperiode_x', 'Fraktion_y', 'Wahlperiode_y'],
    errors='ignore',
)

# Sentiment pipeline and processing functions

In [6]:
# Build a Hugging Face pipeline from the multilingual sentiment checkpoint.
# No task name is passed, so `pipeline` has to infer the task from the model.
# NOTE(review): top_k=None presumably makes the pipeline return the scores of
# all labels instead of only the best one — unpack_sentiments below iterates
# over several label/score dicts per input. Confirm against the transformers docs.
sentiment_pipeline = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", 
    top_k=None
)

In [7]:
# Load the tokenizer for the same checkpoint name that `sentiment_pipeline` uses.
# NOTE(review): `tokenizer` is not referenced by any later cell shown here —
# confirm whether it is still needed.
tokenizer = AutoTokenizer.from_pretrained('lxyuan/distilbert-base-multilingual-cased-sentiments-student')

In [8]:
# Final version of the sentiment helpers used for the full run.
# `sentiment_pipeline` is assumed to be a callable that, for a single input string, returns a list whose first element is a list of {'label': ..., 'score': ...} dictionaries.

def split_into_chunks(text, chunk_size=512):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    The split is done on characters, not on tokens or word boundaries.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per piece.

    Returns:
        A list of strings. Text that already fits into one piece (including
        the empty string) is returned unchanged as a one-element list.
    """
    # Short input: nothing to split, hand it back as a single piece.
    if len(text) <= chunk_size:
        return [text]

    pieces = []
    for start in range(0, len(text), chunk_size):
        pieces.append(text[start:start + chunk_size])
    return pieces

def get_sentiment(text, sentiment_pipeline):
    """Run the sentiment pipeline on every chunk of ``text``.

    Args:
        text: The string to analyse; it is split with ``split_into_chunks``
            using that function's default chunk size.
        sentiment_pipeline: Callable that is invoked once per chunk with the
            chunk string as its only argument.

    Returns:
        A tuple ``(chunk_sentiments, chunks)`` where ``chunk_sentiments[i]`` is
        whatever ``sentiment_pipeline`` returned for ``chunks[i]``.
    """
    chunks = split_into_chunks(text)
    chunk_sentiments = []
    # One pipeline call per chunk, results kept in chunk order.
    for piece in chunks:
        chunk_sentiments.append(sentiment_pipeline(piece))
    return chunk_sentiments, chunks

def unpack_sentiments(sentiment_data):
    """Collapse per-chunk sentiment scores into one length-weighted score set.

    Args:
        sentiment_data: Tuple ``(chunk_sentiments, chunks)`` as returned by
            ``get_sentiment``. ``chunk_sentiments[i][0]`` must be an iterable of
            dicts with a ``'label'`` and a ``'score'`` key; ``chunks[i]`` is the
            text the scores belong to.

    Returns:
        A ``pd.Series`` indexed by ``'positive'``, ``'negative'`` and
        ``'neutral'``. Each value is the sum of the chunk scores for that label,
        weighted by the chunk's share of the total character count.

    Raises:
        KeyError: If a label (lower-cased) is not one of the three expected keys.
    """
    chunk_sentiments, chunks = sentiment_data
    # Total character count across all chunks determines the weights.
    total_length = sum(len(chunk) for chunk in chunks)
    weighted_scores = {'positive': 0, 'negative': 0, 'neutral': 0}

    for sentiments, chunk in zip(chunk_sentiments, chunks):
        if total_length > 0:
            chunk_weight = len(chunk) / total_length
        else:
            # Empty text (e.g. chunks == [""]): there is no length to weight by,
            # so weight all chunks equally instead of dividing by zero.
            chunk_weight = 1 / len(chunks)
        for sentiment in sentiments[0]:
            # Labels are lower-cased so they match the keys of weighted_scores.
            label = sentiment['label'].lower()
            weighted_scores[label] += sentiment['score'] * chunk_weight

    return pd.Series(weighted_scores)

def weighted_sentiment_analysis_3(df, sentiment_pipeline):
    """Add length-weighted sentiment score columns to a dataframe.

    For every value in the ``'Text_Spoken'`` column the text is chunked and
    scored via ``get_sentiment`` and the chunk scores are merged via
    ``unpack_sentiments``.

    Args:
        df: Dataframe with a ``'Text_Spoken'`` column of strings. It is NOT
            modified; no temporary column is written into it, so passing a
            slice of another dataframe does not trigger SettingWithCopyWarning.
        sentiment_pipeline: Callable forwarded to ``get_sentiment``.

    Returns:
        A new dataframe: ``df`` joined (on the index) with one column per key
        of the Series that ``unpack_sentiments`` returns.
    """
    # Score and unpack in one pass; because the applied function returns a
    # Series, the result of .apply is a dataframe with one column per label.
    sentiments_df = df['Text_Spoken'].apply(
        lambda text: unpack_sentiments(get_sentiment(text, sentiment_pipeline))
    )
    return df.join(sentiments_df)

# Process the dataframe in chunks of 100 000 rows and combine the results later

In [None]:
# Process the dataframe in fixed-size row chunks and write each chunk to its own CSV.
# CHUNK_SIZE is used for BOTH the loop step and the slice width; previously the
# loop advanced by 100000 rows but only sliced 10000, silently skipping the rest.
CHUNK_SIZE = 100000
# NOTE(review): presumably the rows before this offset were already processed in
# an earlier run — set START_ROW to 0 to process the whole dataframe.
START_ROW = 1100000

for i in range(START_ROW, len(df), CHUNK_SIZE):
    print(f'Started chunk: {i}')
    now = datetime.now()
    starting_time = now.strftime("%H:%M:%S")
    print("Starting Time =", starting_time)
    # Positional slice of the current window; .copy() makes the chunk an
    # independent frame rather than a view-like slice of `df`.
    chunk = df.iloc[i:i + CHUNK_SIZE].copy()
    chunk = weighted_sentiment_analysis_3(chunk, sentiment_pipeline)
    print(f'Writing chunk: {i}')
    # The row index is written as the first CSV column (default index=True).
    chunk.to_csv(f'/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Final_Project/sentiment_dataframes/chunk_{i}.csv')
    print(f'Successfully finished chunk: {i}')
    now = datetime.now()
    ending_time = now.strftime("%H:%M:%S")
    print("Ending Time =", ending_time)

Started chunk: 1100000
Starting Time = 11:44:43


In [None]:
# Combine the per-chunk CSV files back into one dataframe.
# pd.read_csv does not expand wildcards such as '*.csv', so the directory is
# listed explicitly and every (non-hidden) CSV file in it is read.
chunk_dir = '/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Final_Project/sentiment_dataframes'
chunk_files = sorted(
    os.path.join(chunk_dir, file_name)
    for file_name in os.listdir(chunk_dir)
    if file_name.endswith('.csv') and not file_name.startswith('.')
)
if not chunk_files:
    raise FileNotFoundError(f'No chunk CSV files found in {chunk_dir}')

# The chunk files are written with their row index as the first column, so it
# is restored via index_col=0 and used to put the rows back into their original
# order regardless of the order in which the files are listed.
# NOTE(review): assumes every CSV in the directory was written that way — confirm.
combined_df = pd.concat(pd.read_csv(path, index_col=0) for path in chunk_files).sort_index()
combined_df

In [None]:
# Continue with a copy of the combined frame, bound to a separate name from `combined_df`.
sentiment_df = combined_df.copy()

In [None]:
# Persist the combined sentiment results; index=False keeps the row index out of the CSV.
sentiment_df.to_csv('/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Final_Project/speeches_bundestag/Final_DF/sentiment_analysis_all.csv', index=False)