In [1]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
from nltk import *
from nltk.text import Text
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [2]:
file_path = "ESConv.json"

with open(file_path, "r", encoding="utf-8") as file:
    dataset = json.load(file)

In [3]:
dataset[0]

{'dialog': [{'annotation': {}, 'content': 'Hello\n', 'speaker': 'seeker'},
  {'annotation': {'strategy': 'Question'},
   'content': 'Hello, what would you like to talk about?',
   'speaker': 'supporter'},
  {'annotation': {},
   'content': 'I am having a lot of anxiety about quitting my current job. It is too stressful but pays well\n',
   'speaker': 'seeker'},
  {'annotation': {'strategy': 'Question'},
   'content': 'What makes your job stressful for you?',
   'speaker': 'supporter'},
  {'annotation': {'feedback': '5'},
   'content': 'I have to deal with many people in hard financial situations and it is upsetting \n',
   'speaker': 'seeker'},
  {'annotation': {'strategy': 'Question'},
   'content': 'Do you help your clients to make it to a better financial situation?',
   'speaker': 'supporter'},
  {'annotation': {},
   'content': 'I do, but often they are not going to get back to what they want. Many people are going to lose their home when safeguards are lifted \n',
   'speaker': '

In [4]:
# Extract conversations from the corpus (columns: index, dialog -> type(object -> list, dict))
corpus_data = []
for corpus in dataset:
    dialog = corpus['dialog']
    corpus_data.append(dialog)

df = pd.DataFrame({'Dialog': pd.Series(corpus_data)})

df

Unnamed: 0,Dialog
0,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
1,"[{'speaker': 'supporter', 'annotation': {'stra..."
2,"[{'speaker': 'supporter', 'annotation': {'stra..."
3,"[{'speaker': 'supporter', 'annotation': {'stra..."
4,"[{'speaker': 'supporter', 'annotation': {'stra..."
5,"[{'speaker': 'supporter', 'annotation': {'stra..."
6,"[{'speaker': 'supporter', 'annotation': {'stra..."
7,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
8,"[{'speaker': 'seeker', 'annotation': {}, 'cont..."
9,"[{'speaker': 'supporter', 'annotation': {'stra..."


In [5]:
df.loc[2]

Dialog    [{'speaker': 'supporter', 'annotation': {'stra...
Name: 2, dtype: object

In [36]:
# Extract the seeker conversations only from the corpus (columns: index, seeker dialog -> type(string))
seeker_df = pd.DataFrame(columns=['ConversationID', 'Seeker Dialog'])
for index, row in df.iterrows():
    dialog_list = row['Dialog']
    for dialog in dialog_list:
        if dialog['speaker'] == 'seeker':
            seeker_df = seeker_df.append({'ConversationID': index, 'Seeker Dialog': dialog['content']}, ignore_index=True)            
seeker_df

Unnamed: 0,ConversationID,Seeker Dialog
0,0,Hello\n
1,0,I am having a lot of anxiety about quitting my...
2,0,I have to deal with many people in hard financ...
3,0,"I do, but often they are not going to get back..."
4,0,That is true but sometimes I feel like I shoul...
5,0,Probably not. I was with the same company for ...
6,0,I could try. It mostly gets to me at the end o...
7,0,That is also true. Sometimes I wonder if it re...
8,0,That is true. Maybe I just need to sit down an...
9,0,It really is a big decision \n


In [37]:
seeker_df['Seeker Dialog'].dtype # object

dtype('O')

In [38]:
print(type(seeker_df['Seeker Dialog'].iloc[0]))

<class 'str'>


In [39]:
print(seeker_df['Seeker Dialog'].apply(type).value_counts())

<class 'str'>    19989
Name: Seeker Dialog, dtype: int64


In [40]:
def word_type(text):
    return len(set(text))

def lexical_diversity(text):
    return len(text) / word_type(text)

seeker_df['Lexical Diversity'] = seeker_df['Seeker Dialog'].apply(lexical_diversity)

In [41]:
seeker_df

Unnamed: 0,ConversationID,Seeker Dialog,Lexical Diversity
0,0,Hello\n,1.200000
1,0,I am having a lot of anxiety about quitting my...,3.481481
2,0,I have to deal with many people in hard financ...,3.565217
3,0,"I do, but often they are not going to get back...",5.000000
4,0,That is true but sometimes I feel like I shoul...,3.565217
5,0,Probably not. I was with the same company for ...,4.000000
6,0,I could try. It mostly gets to me at the end o...,2.850000
7,0,That is also true. Sometimes I wonder if it re...,3.043478
8,0,That is true. Maybe I just need to sit down an...,3.086957
9,0,It really is a big decision \n,1.705882


In [42]:
lexical_diversity_df = seeker_df.groupby('ConversationID')['Seeker Dialog'].apply(lambda x: lexical_diversity(x.tolist())).reset_index()
lexical_diversity_df.rename(columns={'Seeker Dialog': 'Lexical Diversity'}, inplace=True)

In [44]:
lexical_diversity_df

Unnamed: 0,ConversationID,Lexical Diversity
0,0,1.000000
1,1,1.000000
2,2,1.000000
3,3,1.000000
4,4,1.000000
5,5,1.000000
6,6,1.000000
7,7,1.000000
8,8,1.000000
9,9,1.000000
