In [1]:
import json
import numpy as np
import pandas as pd

In [63]:
# Works for entries whose segments carry start_time/end_time as well as those that do not
def create_sentences(segments_data, MIN_WORDS, MAX_WORDS):
  """Merge consecutive segments into sentence-sized groups.

  Segments are accumulated in order. The current group is closed and emitted
  as soon as either
    * it holds at least MIN_WORDS words and the segment just added ends with
      a period, or
    * it holds at least MAX_WORDS words. The cut always falls at the end of a
      segment, so an emitted group can be longer than MAX_WORDS.

  Words are counted by splitting each segment's text on single spaces.

  Args:
    segments_data: Indexable sequence of dicts. Each dict has a 'text' string
      and may have 'start_time' / 'end_time' entries.
    MIN_WORDS: Minimum word count before a trailing period closes a group.
    MAX_WORDS: Word count at which a group is closed regardless of punctuation.

  Returns:
    A list of dicts, one per emitted group, with keys 'sentence_num',
    'text' (segment texts joined by a space), 'sentence_length' (word count),
    'start_chunk_id' / 'end_chunk_id' (inclusive indices into segments_data)
    and 'start_time' / 'end_time' (None when the segment lacks them).
    Trailing segments that never meet a closing condition are not returned.
  """
  sentences = []
  sentence_segments = []
  sentence_length = 0

  for i, segment in enumerate(segments_data):
    text = segment['text']
    sentence_segments.append(text)
    sentence_length += len(text.split(' '))

    # endswith() instead of text[-1]: an empty segment must not raise IndexError
    ends_with_period = text.endswith('.')
    if (sentence_length >= MIN_WORDS and ends_with_period) or sentence_length >= MAX_WORDS:
      start_chunk_id = i - len(sentence_segments) + 1
      sentences.append({
        # Sentences are numbered in emission order, starting at 0
        'sentence_num': len(sentences),
        'text': ' '.join(sentence_segments),
        'sentence_length': sentence_length,
        'start_chunk_id': start_chunk_id,
        'end_chunk_id': i,
        'start_time': segments_data[start_chunk_id].get('start_time', None),
        'end_time': segment.get('end_time', None)
      })
      # Start a fresh group
      sentence_segments = []
      sentence_length = 0

  return sentences

In [67]:
# Entry files are named 1.json ... 13.json and are opened relative to the working directory
ENTRY_IDS = range(1, 14)
# Word-count bounds handed to create_sentences
MIN_SENTENCE_WORDS = 20
MAX_SENTENCE_WORDS = 30


def sentence_emotions(row, chunks_df):
  """Return the duration-weighted average of the emotions of one sentence's chunks.

  The chunks of a sentence are the rows labelled start_chunk_id..end_chunk_id
  of chunks_df. .loc slices by label and includes both end points.
  """
  # Slice once and reuse it for both the values and the weights
  span = chunks_df.loc[row['start_chunk_id']:row['end_chunk_id']]
  return np.average(span['emotions'], weights=span['duration'])


# sentences_df of every entry, keyed by entry id. The loop variables below are
# rebound on every pass, so without this only the last entry's result survives.
sentences_by_entry = {}

for i in ENTRY_IDS:
  with open(f'{i}.json') as f:
    entry_data = json.load(f)

  chunks_data = entry_data['chunks']
  # NOTE(review): `chunks` is not read again in this cell; kept in case a later cell uses it
  chunks = [ chunk['text'] for chunk in chunks_data ]

  chunks_df = pd.DataFrame(chunks_data)
  chunks_df['emotions'] = chunks_df['emotions'].apply(lambda x: np.array(x))
  chunks_df['index'] = chunks_df.index
  # Weight of a chunk: end_time - start_time when both columns exist,
  # otherwise the number of space-separated words in its text
  if 'start_time' in chunks_df.columns and 'end_time' in chunks_df.columns:
    chunks_df['duration'] = chunks_df['end_time'] - chunks_df['start_time']
  else:
    chunks_df['duration'] = chunks_df['text'].apply(lambda x: len(x.split(' ')))

  # Combines chunks (some are too short to be meaningful) into sentences of
  # roughly MIN_SENTENCE_WORDS-MAX_SENTENCE_WORDS words
  sentences = create_sentences(chunks_data, MIN_WORDS=MIN_SENTENCE_WORDS, MAX_WORDS=MAX_SENTENCE_WORDS)
  sentences_df = pd.DataFrame(sentences)

  # For each sentence, average the emotions of the chunks that make it up
  # (start_chunk_id to end_chunk_id inclusive), weighted by chunk duration
  sentences_df['emotions'] = sentences_df.apply(lambda row: sentence_emotions(row, chunks_df), axis=1)

  sentences_by_entry[i] = sentences_df

In [68]:
# Inspect the list returned by the most recent create_sentences call
# NOTE(review): if this runs right after the loop cell, that is the last entry processed only
sentences

[{'sentence_num': 0,
  'text': "Yeah. Happy New year. Guess, I haven't driven million in a wall. Yeah we've got like ten more days two school starts.",
  'sentence_length': 22,
  'start_chunk_id': 0,
  'end_chunk_id': 1,
  'start_time': 0.6982051,
  'end_time': 6.903538},
 {'sentence_num': 1,
  'text': "I don't know how to feel about that. I think it's nice to be at home but I I miss... I'm miss my friends and I miss Kayla and Yeah. I...",
  'sentence_length': 31,
  'start_chunk_id': 2,
  'end_chunk_id': 5,
  'start_time': 8.454273,
  'end_time': 18.39785},
 {'sentence_num': 2,
  'text': 'Yeah. I think it just be nice to see K in. I, Acknowledge is nice you get your own space. You get your own room nice to hang out. And,',
  'sentence_length': 30,
  'start_chunk_id': 6,
  'end_chunk_id': 8,
  'start_time': 18.736076,
  'end_time': 28.313456},
 {'sentence_num': 3,
  'text': "yeah. I maybe spending too much time at home can also get a little bit dull. But it's it's mine. It's been nice. 

### Getting Hume Emotions

In [18]:
# Parse ../embeddings/audio1.json into `data`
# NOTE(review): presumably the Hume predictions for one audio entry — confirm
with open('../embeddings/audio1.json') as f:
  data = json.load(f)

In [51]:
# Walk down to the first prediction of the first language-model group and
# collect the 'name' of every emotion entry it lists
first_prediction = data[0]['results']['predictions'][0]
language_groups = first_prediction['models']['language']['grouped_predictions']
emotions = language_groups[0]['predictions'][0]['emotions']
emotion_names = [entry['name'] for entry in emotions]

In [53]:
# Keep only the first 48 names
# NOTE(review): 48 is a magic number — presumably the length of the per-chunk emotion vectors; confirm
emotion_names = emotion_names[:48]

In [54]:
# Show the retained emotion names
emotion_names

['Admiration',
 'Adoration',
 'Aesthetic Appreciation',
 'Amusement',
 'Anger',
 'Annoyance',
 'Anxiety',
 'Awe',
 'Awkwardness',
 'Boredom',
 'Calmness',
 'Concentration',
 'Confusion',
 'Contemplation',
 'Contempt',
 'Contentment',
 'Craving',
 'Desire',
 'Determination',
 'Disappointment',
 'Disapproval',
 'Disgust',
 'Distress',
 'Doubt',
 'Ecstasy',
 'Embarrassment',
 'Empathic Pain',
 'Enthusiasm',
 'Entrancement',
 'Envy',
 'Excitement',
 'Fear',
 'Gratitude',
 'Guilt',
 'Horror',
 'Interest',
 'Joy',
 'Love',
 'Nostalgia',
 'Pain',
 'Pride',
 'Realization',
 'Relief',
 'Romance',
 'Sadness',
 'Sarcasm',
 'Satisfaction',
 'Shame']