In [69]:
import pandas as pd
from openai import OpenAI
import glob
import os
from wandb.sdk.data_types.trace_tree import Trace
import wandb
import configparser
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
import sys
import os
import inspect
import json
import random
from collections import Counter
import ast
import math
# Make the parent of the notebooks directory importable so that project modules
# can be imported from notebook cells.
# A notebook cell has no real source file: inspect.getfile() on the current frame
# returns a kernel-generated pseudo file name, which on recent ipykernel versions
# can point into a temporary directory rather than next to the notebook. The
# working directory is used instead.
# NOTE(review): assumes the kernel was started in the notebooks directory (the
# Jupyter default) -- confirm if notebooks are launched from elsewhere.
currentdir = os.path.abspath(os.getcwd())
parentdir = os.path.dirname(currentdir)
# Guard the insert so re-running this cell does not stack duplicate entries.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load API keys and SURFdrive URLs from config.ini and define the output folders.
config = configparser.ConfigParser()
config_path = 'config.ini'
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed. Fail here with a clear message instead of letting the
# first config.get() raise a misleading NoSectionError.
if not config.read(config_path):
    raise FileNotFoundError(
        f"Could not read '{config_path}' (working directory: {os.getcwd()!r}). "
        "It must contain a [credentials] section."
    )
api_key_openai = config.get('credentials', 'api_key_openai')
api_key_mistral = config.get('credentials', 'api_key_mistral')
surfdrive_url_input_sentences = config.get('credentials', 'surfdrive_url_input_sentences')
surfdrive_url_prompts = config.get('credentials', 'surfdrive_url_prompts')
# Relative output folders (trailing slash included so file names can be appended).
output_parallel_data = 'output_parallel_data/'
output_llm_folder_path = 'output_llm_data/'

### Create the context string

In [83]:
# Keys that may be filled in for every user while scanning the chat-data folder.
CONTEXT_KEYS = (
    "word_distribution", "adjectives", "functional_words", "punctuations",
    "emojis", "style_mistral", "style_gpt",
)

# CSV column holding stringified Python lists -> key under which its distribution is stored.
# NOTE(review): 'fuctional_words' (sic) is the column name as read from the CSV;
# presumably it matches the upstream file header -- confirm before renaming.
LITERAL_LIST_COLUMNS = (
    ("adjectives", "adjectives"),
    ("fuctional_words", "functional_words"),
    ("punctuations", "punctuations"),
)

ZERO_WIDTH_JOINER = '\u200d'


def _is_regional_indicator(char):
    """Return True for the letter-like symbols that pair up to form flag emojis."""
    return 0x1F1E6 <= ord(char) <= 0x1F1FF


def _attaches_to_previous(char):
    """Return True if `char` only modifies or joins the emoji before it."""
    code = ord(char)
    return (
        char == ZERO_WIDTH_JOINER
        or code in (0xFE0E, 0xFE0F, 0x20E3)   # variation selectors, enclosing keycap
        or 0x1F3FB <= code <= 0x1F3FF          # skin-tone modifiers
        or 0xE0020 <= code <= 0xE007F          # tag characters (subdivision flags)
    )


def split_emoji_clusters(text):
    """Split a string into user-visible emojis instead of raw code points.

    Plain `list(text)` tears multi-code-point emojis apart (a thumbs-up with a
    skin tone becomes the thumbs-up plus an orphan modifier). Here modifiers,
    variation selectors, ZWJ sequences and flag pairs stay attached to their
    base character; every other character is still emitted on its own.

    Args:
        text: String to split.

    Returns:
        List of strings, one per emoji cluster (or single character).
    """
    clusters = []
    glue_next = False  # True right after a zero-width joiner
    for char in text:
        completes_flag = (
            bool(clusters)
            and _is_regional_indicator(char)
            and len(clusters[-1]) == 1
            and _is_regional_indicator(clusters[-1])
        )
        if clusters and (glue_next or _attaches_to_previous(char) or completes_flag):
            clusters[-1] += char
        else:
            clusters.append(char)
        glue_next = char == ZERO_WIDTH_JOINER
    return clusters


def separate_emojis(cells):
    """Flatten emoji cells into individual emojis / emoticons.

    Args:
        cells: Iterable of strings; missing values must already be removed.

    Returns:
        List in which pure-ASCII cells (emoticons such as ':)') are kept whole
        and any cell containing non-ASCII characters is split into clusters.
    """
    separated = []
    for cell in cells:
        if cell.isascii():
            separated.append(cell)
        else:
            separated.extend(split_emoji_clusters(cell))
    return separated


def flatten_literal_lists(cells):
    """Parse cells holding Python list literals and merge them into one flat list."""
    return [item for cell in cells for item in ast.literal_eval(cell)]


def frequency_table(values, label):
    """Count each distinct value and its share of all values.

    Args:
        values: Iterable of hashable items.
        label: Name of the column that holds the distinct items.

    Returns:
        DataFrame with columns [label, 'frequency', 'percentage'], most frequent
        first. 'percentage' is relative to the total number of items and rounded
        to two decimals. Empty input yields an empty frame.
    """
    table = pd.Series(list(values), dtype=object).value_counts().reset_index()
    table.columns = [label, 'frequency']
    total = table['frequency'].sum()
    table['percentage'] = ((table['frequency'] / total) * 100).round(2)
    return table


def distribution_string(items, percentages):
    """Render two parallel Series as the text of a list of (item, percentage) tuples.

    `.tolist()` converts NumPy scalars to built-in Python numbers, so the text
    stays e.g. "('a', 50.0)" rather than "('a', np.float64(50.0))" under NumPy 2.
    """
    return str(list(zip(items.tolist(), percentages.tolist())))


# One empty context per user; entries are added as the matching files are found.
context_dict = {f"U{user_index}": {} for user_index in range(10)}

output_chat_data = "output_chat_data/"
csv_files = glob.glob(output_chat_data + '*')
for file in csv_files:
    file_name = os.path.basename(file)
    # The first two characters of the file name are taken as the user id.
    # NOTE(review): this would truncate ids longer than two characters (e.g. 'U10')
    # -- confirm the naming scheme if more users are added.
    username = file_name[:2]

    if 'word_distribution' in file_name:
        df_words = pd.read_csv(file)
        word_count_distribution = distribution_string(df_words['word_length'], df_words['percentage'])
        context_dict[username]['word_distribution'] = word_count_distribution

    elif 'chat_llm' in file_name:
        df_chat = pd.read_csv(file)

        # ADJECTIVES, FUNCTIONAL WORDS, PUNCTUATION: same pipeline for each column.
        for csv_column, context_key in LITERAL_LIST_COLUMNS:
            column_table = frequency_table(flatten_literal_lists(df_chat[csv_column]), context_key)
            context_dict[username][context_key] = distribution_string(
                column_table[context_key], column_table['percentage']
            )

        # EMOJIS (rows without any emoji are NaN in the CSV and are dropped)
        all_emojis = df_chat['emojis'].dropna().tolist()
        separated_emojis = separate_emojis(all_emojis)
        emojis_df = frequency_table(separated_emojis, 'emojis')

        print('Rows',df_chat.shape[0])
        print('emojis_df ', emojis_df[['emojis', 'frequency']])
        emojis_distribution = distribution_string(emojis_df['emojis'], emojis_df['percentage'])
        context_dict[username]['emojis'] = emojis_distribution
        print(emojis_distribution)

        # TODO: decide whether percentages should be relative to the number of
        # messages instead of the number of values in the column (current
        # behaviour). Example: a user with 2 emojis in total and a user with 20
        # can end up with the same percentages.

    # STYLE KEYWORDS
    elif 'mistral' in file_name:
        df_mistral = pd.read_csv(file)
        word_count_distribution = distribution_string(df_mistral['keyword'], df_mistral['percentage'].round(2))
        context_dict[username]['style_mistral'] = word_count_distribution
    elif 'gpt' in file_name:
        df_gpt = pd.read_csv(file)
        word_count_distribution = distribution_string(df_gpt['keyword'], df_gpt['percentage'].round(2))
        context_dict[username]['style_gpt'] = word_count_distribution
        

Rows 40
emojis_df    emojis  frequency
0      🥲          2
1      😂          1
[('🥲', 66.67), ('😂', 33.33)]
Rows 42
emojis_df    emojis  frequency
0     :)          1
[(':)', 100.0)]
Rows 125
emojis_df  Empty DataFrame
Columns: [emojis, frequency]
Index: []
[]
Rows 153
emojis_df     emojis  frequency
0       😂          8
1      :)          2
2       🙈          2
3       😊          1
4       🥰          1
5       😱          1
6       😬          1
7       🥹          1
8       😭          1
9       🥲          1
10      🥳          1
[('😂', 40.0), (':)', 10.0), ('🙈', 10.0), ('😊', 5.0), ('🥰', 5.0), ('😱', 5.0), ('😬', 5.0), ('🥹', 5.0), ('😭', 5.0), ('🥲', 5.0), ('🥳', 5.0)]
Rows 122
emojis_df    emojis  frequency
0      😂          5
1     :           2
2     :/          1
3      💆          1
[('😂', 55.56), (': ', 22.22), (':/', 11.11), ('💆', 11.11)]
Rows 135
emojis_df    emojis  frequency
0     ;)         10
1      🙂          6
2     :)          4
3     :D          3
4      🙁          2
5      🤫   

In [65]:
# Scratch check: what happens when grouped emojis are split character by character?
emojis = ['😂😩', '😊', '👍🏻', '❤️', '🎉🎊', '😁']

# Iterating a str yields one code point at a time, so every entry is flattened
# into its individual code points (modifiers come out as separate items).
separated_emojis = [code_point for entry in emojis for code_point in entry]

print(separated_emojis)

['😂', '😩', '😊', '👍', '🏻', '❤', '️', '🎉', '🎊', '😁']


In [84]:
# Display the assembled context dictionary (one entry per user id) for inspection.
context_dict

{'U0': {'style_gpt': "[('informal', 18.95), ('casual', 15.79), ('colloquial', 8.95), ('conversational', 6.84), ('humorous', 6.32), ('personal', 4.21), ('friendly', 3.68), ('direct', 3.16), ('slang', 2.11), ('reflective', 1.58), ('concise', 1.58), ('abbreviated', 1.58), ('inquisitive', 1.05), ('emotive', 1.05), ('apologetic', 1.05), ('uncertain', 1.05), ('indirect', 1.05), ('self-deprecating', 1.05), ('expressive', 1.05), ('informative', 1.05), ('questioning', 1.05), ('enthusiastic', 1.05), ('interactive', 0.53), ('hesitant', 0.53), ('opinionated', 0.53), ('reassuring', 0.53), ('adventurous', 0.53), ('spontaneous', 0.53), ('considerate', 0.53), ('self-referential', 0.53), ('repetitive', 0.53), ('active', 0.53), ('professional', 0.53), ('clear', 0.53), ('internet language', 0.53), ('pragmatic', 0.53), ('explorative', 0.53), ('formal', 0.53), ('moderate', 0.53), ('descriptive', 0.53), ('storytelling', 0.53), ('disorganized', 0.53), ('factual', 0.53), ('neutral', 0.53), ('open-minded', 0.5