In [30]:
!pip install nltk



In [31]:
import pandas as pd

df_raw = pd.read_csv('Dyadic_PELD.tsv', sep='\t', header=0)
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6510 entries, 0 to 6509
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Speaker_1    6510 non-null   object
 1   Speaker_2    6510 non-null   object
 2   Personality  6510 non-null   object
 3   Utterance_1  6510 non-null   object
 4   Utterance_2  6510 non-null   object
 5   Utterance_3  6510 non-null   object
 6   Emotion_1    6510 non-null   object
 7   Emotion_2    6510 non-null   object
 8   Emotion_3    6510 non-null   object
 9   Sentiment_1  6510 non-null   object
 10  Sentiment_2  6510 non-null   object
 11  Sentiment_3  6510 non-null   object
dtypes: object(12)
memory usage: 610.4+ KB


In [32]:

# Check the types of all columns in df_raw
print(df_raw.dtypes)

# Split df_raw into separate dataframes according to Speaker_1 column values
speaker_dfs = {}
for speaker in df_raw['Speaker_1'].unique():
    speaker_df = df_raw[df_raw['Speaker_1'] == speaker]
    speaker_dfs[speaker] = speaker_df

# Print the resulting dataframes
for speaker, df in speaker_dfs.items():
    print(f"Speaker: {speaker}, Rows: {len(df)}")

# Save each speaker's dataframe to a separate CSV file
for speaker, df in speaker_dfs.items():
    df.to_csv(f"{speaker}_data.csv", index=False)


Speaker_1      object
Speaker_2      object
Personality    object
Utterance_1    object
Utterance_2    object
Utterance_3    object
Emotion_1      object
Emotion_2      object
Emotion_3      object
Sentiment_1    object
Sentiment_2    object
Sentiment_3    object
dtype: object
Speaker: Chandler, Rows: 1085
Speaker: Joey, Rows: 1123
Speaker: Rachel, Rows: 1156
Speaker: Monica, Rows: 1051
Speaker: Phoebe, Rows: 972
Speaker: Ross, Rows: 1123


In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# Preprocess your data (split into sentences)
speaker_dfs = {}
for speaker in df_raw['Speaker_1'].unique():
    speaker_df = df_raw[df_raw['Speaker_1'] == speaker]
    speaker_dfs[speaker] = speaker_df

for speaker, df in speaker_dfs.items():
    print(f"Speaker: {speaker}, Rows: {len(df)}")

    # Calculate and print vocabulary set, vocab size, and num of tokens for each speaker
    tokenized_text = []
    for utterance in df['Utterance_1']:
        sentences = sent_tokenize(utterance)
        for sentence in sentences:
            tokenized_text.append(word_tokenize(sentence))

    vocab_set = set()
    for utterance in tokenized_text:
        vocab_set.update(set(utterance))
    vocab_size = len(vocab_set)

    num_tokens = sum(len(utterance) for utterance in tokenized_text)

    print(f"Speaker: {speaker}, Vocabulary Set: {len(vocab_set)}, Vocab Size: {vocab_size}, Num Tokens: {num_tokens}\n")

    # Define confirmation and negation word lists
    confirmation_words = ['yes', 'yeah', 'yep', 'okay', 'alright', 'sure', 'absolutely', 'certainly', 'of course', 'indeed']
    negation_words = ['no', 'not', 'never', 'none', 'nor', 'nobody', 'neither', "don't", "isn't", "wasn't", "weren't", "doesn't", "won't", "haven't"]

    # Tokenize the text and count confirmation and negation words
    tokenized_text = [word_tokenize(utterance) for utterance in df['Utterance_1']]
    confirmation_count = sum(any(word in confirmation_words for word in utterance) for utterance in tokenized_text)
    negation_count = sum(any(word in negation_words for word in utterance) for utterance in tokenized_text)

    print(f"Speaker: {speaker}, Confirmation Count: {confirmation_count}, Negation Count: {negation_count}\n")

    # Print the tokenized text
    for i, utterance in enumerate(tokenized_text):
        print(f"Utterance {i+1}: {utterance}")
    
    print("\n")

# Save each speaker's dataframe to a separate CSV file
for speaker, df in speaker_dfs.items():
    df.to_csv(f"{speaker}_data.csv", index=False)


Speaker: Chandler, Rows: 1085
Speaker: Chandler, Vocabulary Set: 1801, Vocab Size: 1801, Num Tokens: 12505

Speaker: Chandler, Confirmation Count: 42, Negation Count: 93

Utterance 1: ['also', 'I', 'was', 'the', 'point', 'person', 'on', 'my', 'company', 's', 'transition', 'from', 'the', 'KL-5', 'to', 'GR-6', 'system', '.']
Utterance 2: ['That', 'I', 'did', '.', 'That', 'I', 'did', '.']
Utterance 3: ['My', 'duties', '?', 'All', 'right', '.']
Utterance 4: ['I', 'see', '.']
Utterance 5: ['Good', 'to', 'know', '.']
Utterance 6: ['No', 'don', 't', 'I', 'beg', 'of', 'you', '!']
Utterance 7: ['Hey', ',', 'Mon', '.']
Utterance 8: ['Do', 'I', 'ever', '.']
Utterance 9: ['No', 'way', '!']
Utterance 10: ['Just', 'coffee', '!', 'Where', 'are', 'we', 'gon', 'na', 'hang', 'out', 'now', '?']
Utterance 11: ['Can', 'I', 'get', 'a', 'beer', '.']
Utterance 12: ['You', 'betcha', '!']
Utterance 13: ['Good', 'job', 'Joe', '!', 'Well', 'done', '!', 'Top', 'notch', '!']
Utterance 14: ['Oh-ho-ho', ',', 'yeah', 

In [52]:
import numpy as np
import ast

# Define a function to extract the dominant personality
def get_dominant_personality(row):
    # Convert the string representation of the list to an actual list of floats
    trait_probabilities = ast.literal_eval(row['Personality'])
    max_trait_index = np.argmax(trait_probabilities)
    
    return max_trait_index

# Apply this function to each row in df_raw and assign result to 'Dominant Personality' column
df_raw['Dominant Personality'] = df_raw.apply(get_dominant_personality, axis=1)

# Print the value counts of the 'Dominant Personality' column
print(df_raw['Dominant Personality'].value_counts())

# Convert the values in the 'Dominant Personality' column to int and store them in a list
dominant_personalities = df_raw['Dominant Personality'].tolist()
dominant_personalities = [int(x) for x in dominant_personalities]

print(dominant_personalities)


Dominant Personality
0    5387
1    1123
Name: count, dtype: int64
[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,