In [1]:
from germansentiment import SentimentModel
import pandas as pd
import os
# Set TOKENIZERS_PARALLELISM to false before importing the tokenizers
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from transformers import pipeline
from transformers import AutoTokenizer
from datetime import datetime
import glob

In [2]:
model = SentimentModel()

In [5]:
df = pd.read_csv("/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Sentiment_In_Session/Final_DF/combined_df.csv")

In [6]:
df.head()

Unnamed: 0,Election_Period,Session,Date,Start,End_Time,Speaker,Text_Spoken,Reactions,Name,Party,Position
0,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Meine sehr verehrten Damen und sehr geehrten H...,,Otto Schily,SPD,Abgeordnete*r
1,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Von Paul Löbeüber Konrad Adenauerbis hin zu al...,,Otto Schily,SPD,Abgeordnete*r
2,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,"Nur Willy Brandtwar 1983 acht Monate jünger, a...",,Otto Schily,SPD,Abgeordnete*r
3,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,Das Amt des Alterspräsiden ten blieb Willy Bra...,,Otto Schily,SPD,Abgeordnete*r
4,15,1,2002-10-17,0 days 11:00:00,0 days 15:40:00,Alterspräsident Otto Schily,"Den Hinweis daraufsollten Sie, was meine Leben...",Heiterkeit bei der SPD und dem BÜND NIS 90/DIE...,Otto Schily,SPD,Abgeordnete*r


# Prepare dataframe

In [4]:
# Reset the index of the dataframe
df = df.reset_index(drop=True)

# Pipeline and function to process

In [6]:
sentiment_pipeline = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", 
    top_k=None
)

In [7]:
#tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('lxyuan/distilbert-base-multilingual-cased-sentiments-student')

In [8]:
# This is the final one I can use!
# Assuming sentiment_pipeline is a predefined function that outputs the sentiment analysis in the list of dictionaries format

def split_into_chunks(text, chunk_size=512):
    # Split text into chunks of max_length only if it's longer than max_length
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)] if len(text) > chunk_size else [text]

def get_sentiment(text, sentiment_pipeline):
    # Split the text into chunks if necessary and get sentiment for each chunk
    chunks = split_into_chunks(text)
    chunk_sentiments = [sentiment_pipeline(chunk) for chunk in chunks]
    return chunk_sentiments, chunks

def unpack_sentiments(sentiment_data):
    chunk_sentiments, chunks = sentiment_data
    # Calculate the total length of the text to determine the weights
    total_length = sum(len(chunk) for chunk in chunks)
    # Initialize a dictionary with default scores
    weighted_scores = {'positive': 0, 'negative': 0, 'neutral': 0}
    # Calculate weighted scores
    for sentiments, chunk in zip(chunk_sentiments, chunks):
        chunk_weight = len(chunk) / total_length
        for sentiment in sentiments[0]:
            label = sentiment['label'].lower()  # Convert label to lowercase to match the keys in scores
            # Add the weighted score
            weighted_scores[label] += sentiment['score'] * chunk_weight
    return pd.Series(weighted_scores)

def weighted_sentiment_analysis_3(df, sentiment_pipeline):
    # Apply the sentiment pipeline to the 'Text_Spoken' column, splitting into chunks if necessary
    df['Sentiment_Data'] = df['Text_Spoken'].apply(lambda text: get_sentiment(text, sentiment_pipeline))

    # Split the sentiment scores into separate columns with weighted averaging
    sentiments_df = df['Sentiment_Data'].apply(unpack_sentiments)
    df = df.join(sentiments_df)

    # Drop the 'Sentiment_Data' column as it's no longer needed
    df.drop(columns=['Sentiment_Data'], inplace=True)

    return df

# Use chunks of 100 000 sentences and combine them later

In [11]:
# Split the dataframe into chunks of 100000 lines
for i in range(3500000, len(df), 100000):
    print(f'Started chunk: {i}')
    now = datetime.now()
    starting_time = now.strftime("%H:%M:%S")
    print("Starting Time =", starting_time)
    chunk = df[i:i+100000]
    chunk = weighted_sentiment_analysis_3(chunk, sentiment_pipeline)
    print(f'Writing chunk: {i}')
    chunk.to_csv(f'/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Final_Project/sentiment_dataframes/chunk_{i}.csv')
    print(f'Successfully finished chunk: {i}')
    now = datetime.now()
    ending_time = now.strftime("%H:%M:%S")
    print("Ending Time =", ending_time)

Started chunk: 3500000
Starting Time = 22:39:29


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment_Data'] = df['Text_Spoken'].apply(lambda text: get_sentiment(text, sentiment_pipeline))


Writing chunk: 3500000
Successfully finished chunk: 3500000
Ending Time = 01:22:12
Started chunk: 3600000
Starting Time = 01:22:12


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment_Data'] = df['Text_Spoken'].apply(lambda text: get_sentiment(text, sentiment_pipeline))


Writing chunk: 3600000
Successfully finished chunk: 3600000
Ending Time = 04:41:09
Started chunk: 3700000
Starting Time = 04:41:09


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment_Data'] = df['Text_Spoken'].apply(lambda text: get_sentiment(text, sentiment_pipeline))


Writing chunk: 3700000
Successfully finished chunk: 3700000
Ending Time = 08:01:37
Started chunk: 3800000
Starting Time = 08:01:37


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment_Data'] = df['Text_Spoken'].apply(lambda text: get_sentiment(text, sentiment_pipeline))


Writing chunk: 3800000
Successfully finished chunk: 3800000
Ending Time = 08:59:59
Started chunk: 3900000
Starting Time = 08:59:59


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment_Data'] = df['Text_Spoken'].apply(lambda text: get_sentiment(text, sentiment_pipeline))


Writing chunk: 3900000
Successfully finished chunk: 3900000
Ending Time = 09:30:16
Started chunk: 4000000
Starting Time = 09:30:16


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment_Data'] = df['Text_Spoken'].apply(lambda text: get_sentiment(text, sentiment_pipeline))


Writing chunk: 4000000
Successfully finished chunk: 4000000
Ending Time = 10:00:47
Started chunk: 4100000
Starting Time = 10:00:47


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment_Data'] = df['Text_Spoken'].apply(lambda text: get_sentiment(text, sentiment_pipeline))


Writing chunk: 4100000
Successfully finished chunk: 4100000
Ending Time = 10:31:50
Started chunk: 4200000
Starting Time = 10:31:50


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment_Data'] = df['Text_Spoken'].apply(lambda text: get_sentiment(text, sentiment_pipeline))


Writing chunk: 4200000
Successfully finished chunk: 4200000
Ending Time = 11:02:38
Started chunk: 4300000
Starting Time = 11:02:38


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment_Data'] = df['Text_Spoken'].apply(lambda text: get_sentiment(text, sentiment_pipeline))


Writing chunk: 4300000
Successfully finished chunk: 4300000
Ending Time = 11:34:15
Started chunk: 4400000
Starting Time = 11:34:15


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment_Data'] = df['Text_Spoken'].apply(lambda text: get_sentiment(text, sentiment_pipeline))


Writing chunk: 4400000
Successfully finished chunk: 4400000
Ending Time = 12:06:03
Started chunk: 4500000
Starting Time = 12:06:03


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sentiment_Data'] = df['Text_Spoken'].apply(lambda text: get_sentiment(text, sentiment_pipeline))


Writing chunk: 4500000
Successfully finished chunk: 4500000
Ending Time = 12:15:27


In [13]:
# Path to the directory
path = '/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Final_Project/sentiment_dataframes/'

# List all CSV files in the directory
all_files = glob.glob(path + "*.csv")

# List to hold dataframes
dfs = []

# Read each CSV file and append to the list
for filename in all_files:
    df = pd.read_csv(filename)
    dfs.append(df)

In [14]:
# Combine all dataframes into a single dataframe
combined_df = pd.concat(dfs, ignore_index=True)

In [15]:
sentiment_df = combined_df.copy()

In [16]:
sentiment_df.shape

(4530623, 15)

In [19]:
sentiment_df.isna().sum()

Unnamed: 0           0
Sitzung              0
Date                 0
Start                0
Schluss              0
Speaker              0
Text_Spoken          0
Reactions      3816713
Name                 0
Fraktion_x           0
Position             0
Wahlperiode          0
positive             0
negative             0
neutral              0
dtype: int64

In [20]:
sentiment_df.to_csv('/Users/merlesteffen/Documents/Education/WBS_Coding_School/Bootcamp/Final_Project/speeches_bundestag/Final_DF/sentiment_analysis_all.csv', index=False)