In [1]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
from nltk import *
from nltk.text import Text
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import FreqDist

In [2]:
file_path = "ESConv.json"

with open(file_path, "r", encoding="utf-8") as file:
    dataset = json.load(file)

In [3]:
dataset[0]

{'dialog': [{'annotation': {}, 'content': 'Hello\n', 'speaker': 'seeker'},
  {'annotation': {'strategy': 'Question'},
   'content': 'Hello, what would you like to talk about?',
   'speaker': 'supporter'},
  {'annotation': {},
   'content': 'I am having a lot of anxiety about quitting my current job. It is too stressful but pays well\n',
   'speaker': 'seeker'},
  {'annotation': {'strategy': 'Question'},
   'content': 'What makes your job stressful for you?',
   'speaker': 'supporter'},
  {'annotation': {'feedback': '5'},
   'content': 'I have to deal with many people in hard financial situations and it is upsetting \n',
   'speaker': 'seeker'},
  {'annotation': {'strategy': 'Question'},
   'content': 'Do you help your clients to make it to a better financial situation?',
   'speaker': 'supporter'},
  {'annotation': {},
   'content': 'I do, but often they are not going to get back to what they want. Many people are going to lose their home when safeguards are lifted \n',
   'speaker': '

In [4]:
# Extract conversations from the corpus (columns: index, dialog -> type(object -> list, dict))
corpus_data = []
for corpus in dataset:
    dialog = corpus['dialog']
    corpus_data.append(dialog)

df = pd.DataFrame({'Dialog': pd.Series(corpus_data)})

df

Unnamed: 0,Dialog
0,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
1,"[{'speaker': 'supporter', 'annotation': {'stra..."
2,"[{'speaker': 'supporter', 'annotation': {'stra..."
3,"[{'speaker': 'supporter', 'annotation': {'stra..."
4,"[{'speaker': 'supporter', 'annotation': {'stra..."
5,"[{'speaker': 'supporter', 'annotation': {'stra..."
6,"[{'speaker': 'supporter', 'annotation': {'stra..."
7,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
8,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
9,"[{'speaker': 'supporter', 'annotation': {'stra..."


In [5]:
df.loc[2]

Dialog    [{'speaker': 'supporter', 'annotation': {'stra...
Name: 2, dtype: object

In [6]:
# Extract the seeker conversations only from the corpus (columns: index, seeker dialog -> type(string))
seeker_df = pd.DataFrame(columns=['ConversationID', 'Seeker Dialog'])
for index, row in df.iterrows():
    dialog_list = row['Dialog']
    for dialog in dialog_list:
        if dialog['speaker'] == 'seeker':
            seeker_df = seeker_df.append({'ConversationID': index, 'Seeker Dialog': dialog['content']}, ignore_index=True)            
seeker_df

Unnamed: 0,ConversationID,Seeker Dialog
0,0,Hello\n
1,0,I am having a lot of anxiety about quitting my...
2,0,I have to deal with many people in hard financ...
3,0,"I do, but often they are not going to get back..."
4,0,That is true but sometimes I feel like I shoul...
5,0,Probably not. I was with the same company for ...
6,0,I could try. It mostly gets to me at the end o...
7,0,That is also true. Sometimes I wonder if it re...
8,0,That is true. Maybe I just need to sit down an...
9,0,It really is a big decision \n


In [7]:
seeker_df['Seeker Dialog'].dtype # object

dtype('O')

In [8]:
print(type(seeker_df['Seeker Dialog'].iloc[0]))

<class 'str'>


In [9]:
print(seeker_df['Seeker Dialog'].apply(type).value_counts())

<class 'str'>    19989
Name: Seeker Dialog, dtype: int64


In [10]:
def lexical_diversity(text_list):
    all_words = []
    for text in text_list:
        words = text.split()
        all_words.extend(words)
    word_count = len(all_words)
    unique_words = len(set(all_words))
    return unique_words / word_count

In [11]:
seeker_df['Lexical Diversity'] = seeker_df['Seeker Dialog'].apply(lexical_diversity)
seeker_df

Unnamed: 0,ConversationID,Seeker Dialog,Lexical Diversity
0,0,Hello\n,0.800000
1,0,I am having a lot of anxiety about quitting my...,0.333333
2,0,I have to deal with many people in hard financ...,0.318182
3,0,"I do, but often they are not going to get back...",0.233645
4,0,That is true but sometimes I feel like I shoul...,0.323077
5,0,Probably not. I was with the same company for ...,0.300000
6,0,I could try. It mostly gets to me at the end o...,0.428571
7,0,That is also true. Sometimes I wonder if it re...,0.388889
8,0,That is true. Maybe I just need to sit down an...,0.381818
9,0,It really is a big decision \n,0.681818


In [12]:
# Group the data by 'ConversationID' and calculate lexical diversity for each group
lexical_diversity_df = seeker_df.groupby('ConversationID')['Seeker Dialog'].apply(lambda x: lexical_diversity(x)).reset_index(name='Lexical Diversity')

lexical_diversity_df

Unnamed: 0,ConversationID,Lexical Diversity
0,0,0.647799
1,1,0.535326
2,2,0.520270
3,3,0.768595
4,4,0.561922
5,5,0.666667
6,6,0.733813
7,7,0.661417
8,8,0.718310
9,9,0.636735


In [13]:
# Apply the process_single_words function to the 'Seeker Dialog' column for each ConversationID
processed_dialogs = seeker_df.groupby('ConversationID')['Seeker Dialog'].apply(lambda x: ' '.join(x).split())
processed_dialogs

ConversationID
0       [Hello, I, am, having, a, lot, of, anxiety, ab...
1       [hello, im, looking, for, someone, to, talk, t...
2       [Hello, I'm, concerned, about, my, job., I, ha...
3       [I, am, dong, good., You?, I, have, been, stay...
4       [Infinitely, complicated., Too, many, decision...
5       [could, be, better, thank, you,, I, just, don'...
6       [hi!, I, am, good., you?, Yes, I, have, been, ...
7       [Hey, there, How, are, you?, I, am, ok,, I'm, ...
8       [I, need, some, tips, to, overcome, from, that...
9       [Hello., Not, doing, too, well., Bleak, and, d...
10      [Hey, You, there?, I'm, okay, I, guess., How, ...
11      [Hi!, I, am, trying, to, find, a, new, job, fo...
12      [Hi, Ok, I, guess, I, do, not, know, how, to, ...
13      [I'm,, okay,, thanks,, and, you?, I'm, feeling...
14      [Hi, Hope, you, are, doing, well, I, am, not, ...
15      [I, recently, lost, my, job, due, to, Covid, 1...
16      [Good, afternoon, I, am, not, good, at, all, N...

In [18]:
len(processed_dialogs[0])

159

In [15]:
def word_type(text):
    return len(set(text))

In [22]:
lexical_diversity_df['Word Type'] = seeker_df.groupby('ConversationID')['Seeker Dialog'].apply(word_type)
lexical_diversity_df['Dialog Length'] = seeker_df.groupby('ConversationID')['Seeker Dialog'].apply(len) # number od dialogues in each convesation
lexical_diversity_df['Total Words'] = seeker_df.groupby('ConversationID')['Seeker Dialog'].apply(lambda x: ' '.join(x).split()).apply(len)

In [23]:
lexical_diversity_df

Unnamed: 0,ConversationID,Lexical Diversity,Word Type,Dialog Length,Total Words
0,0,0.647799,13,13,159
1,1,0.535326,35,35,368
2,2,0.520270,18,18,592
3,3,0.768595,12,12,121
4,4,0.561922,51,51,541
5,5,0.666667,10,10,192
6,6,0.733813,16,16,139
7,7,0.661417,14,14,127
8,8,0.718310,9,9,71
9,9,0.636735,15,15,245
