In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input mount, one full path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%time
import matplotlib.pyplot as plt
import json

# Peek at the dataset: parse only the first few JSONL records so the preview
# stays cheap regardless of how large the file is.
PREVIEW_ROWS = 5

lines = []

# JSON text is UTF-8, so the encoding is stated explicitly instead of relying
# on the platform default.
with open('/kaggle/input/pippa-unfiltered/pippa_metharme.jsonl', 'r', encoding='utf-8') as f:
    # zip() stops as soon as range() is exhausted, so at most PREVIEW_ROWS
    # lines are pulled from the file.
    for _, line in zip(range(PREVIEW_ROWS), f):
        if line.strip():  # json.loads() raises on blank lines, so skip them
            lines.append(json.loads(line))

# Convert the list of dictionaries to a Pandas DataFrame
df = pd.DataFrame(lines)

# Display the head of the DataFrame
print(df.head())


In [None]:
%%time
import seaborn as sns

# Load the complete dataset: one JSON object per line, one DataFrame row per object.
# The encoding is explicit (JSON text is UTF-8) and blank lines are skipped
# because json.loads() raises on them.
with open('/kaggle/input/pippa-unfiltered/pippa_metharme.jsonl', 'r', encoding='utf-8') as f:
    lines = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(lines)


In [None]:
%%time
# Display basic information about the DataFrame.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Number of unique prompts
num_unique_prompts = df['prompt'].nunique()
print(f"Number of unique prompts: {num_unique_prompts}")

# Average length of generations.
# .str.len() yields NaN for missing entries (which mean() then ignores),
# whereas apply(len) would raise a TypeError on them.
avg_gen_length = df['generation'].str.len().mean()
print(f"Average length of generations: {avg_gen_length}")


In [None]:
%%time

import pandas as pd
from nltk.tokenize import sent_tokenize

# Split every prompt into sentences and flatten the pieces into one list,
# keeping the prompts' original order.
sentences_list = [
    sentence
    for prompt_text in df['prompt']
    for sentence in sent_tokenize(prompt_text)
]

# One row per sentence, in a single 'sentences' column.
df_sentences = pd.DataFrame({'sentences': sentences_list})

# Summarise the new frame and preview its first rows.
df_sentences.info()
df_sentences.head()

In [None]:
%%time
# Keep only sentences shorter than 100 characters. The explicit copy makes
# df_short independent of df_sentences so columns can be added to it safely.
is_short = df_sentences['sentences'].map(len) < 100
df_short = df_sentences.loc[is_short].copy()

# Character count of each remaining sentence.
df_short['gen_length'] = df_short['sentences'].map(len)

# Histogram (with KDE overlay) of the sentence lengths.
ax = sns.histplot(df_short['gen_length'], kde=True)
ax.set_title('Distribution of Prompt Lengths')
ax.set_xlabel('Text Length')
ax.set_ylabel('Frequency')
plt.show()

# Summarise the filtered frame and preview its first rows.
df_short.info()
df_short.head()

In [None]:
# Drop repeated sentences from 'df_short' (the frame of sentences shorter than
# 100 characters that was filtered out of 'df_sentences'), then inspect the result.

# Remove duplicate entries based on the 'sentences' column. With pandas'
# default keep='first' the first occurrence of each sentence survives, and
# inplace=True modifies df_short itself rather than returning a new frame.
df_short.drop_duplicates(subset=['sentences'], inplace=True)

# Show basic information about the DataFrame after removing duplicates
df_short.info()

# Show the first few entries for inspection
df_short.head()

In [None]:
%%time
# Text-cleaning helpers: strip <|...|> control tokens and special characters, then build the prompt dataset
import re

# Function to remove special characters like <|START|>, <|END|>, etc.
def clean_text(text):
    """Remove ``<|...|>`` control tokens from ``text`` and trim the result.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every ``<|...|>`` span deleted (shortest match, so
        separate tokens on one line are removed individually) and leading
        and trailing whitespace stripped. Whitespace left between words
        where a token used to be is not collapsed.
    """
    # Raw string literal: "\|" is not a valid Python string escape, so the
    # non-raw form triggers an invalid-escape warning on modern interpreters.
    return re.sub(r'<\|.*?\|>', '', text).strip()


# Function to remove emojis and other special characters
def raw_text(text):
    """Delete every character that is not an ASCII letter, digit, space, newline or full stop.

    Besides emojis this also removes apostrophes, commas, question marks and
    exclamation marks, so e.g. "I'm" becomes "Im".

    Args:
        text: The string to filter.

    Returns:
        ``text`` with all disallowed characters removed; nothing is inserted
        in their place.
    """
    # Raw string literal: "\." is not a valid Python string escape, so the
    # non-raw form triggers an invalid-escape warning on modern interpreters.
    # Inside the raw pattern the regex engine itself reads \n as a newline.
    return re.sub(r'[^a-zA-Z0-9 \n\.]', '', text)

# Strip the <|...|> control tokens from every sentence; the cleaned text is
# stored on df_short as a new 'prompt' column.
df_short['prompt'] = df_short['sentences'].map(clean_text)

# Build the working dataset: a single 'prompt' column holding the cleaned text
# with special characters removed, re-indexed so labels run 0..n-1.
dataset = (
    df_short['prompt']
    .map(raw_text)
    .to_frame()
    .reset_index(drop=True)
)

dataset

In [None]:
# Summarise the cleaned prompt table, then preview its first five rows.
dataset.info()
dataset.head(n=5)

In [None]:
# Function to check if a sentence is properly formed
def is_proper_sentence(sentence):
    """Return True when ``sentence`` passes four cheap well-formedness checks.

    The checks are: at least 5 characters long, first character is upper
    case, last character is ".", "?" or "!", and at least two spaces
    (a rough stand-in for "contains several words").
    """
    # The length guard comes first so the index lookups below are always valid.
    if len(sentence) < 5:
        return False

    starts_capitalised = sentence[0].isupper()
    ends_with_terminator = sentence[-1] in (".", "?", "!")
    has_two_spaces = sentence.count(" ") >= 2
    return starts_capitalised and ends_with_terminator and has_two_spaces

# Keep only the prompts that pass is_proper_sentence.
# reset_index() is called without inplace so it returns a new, independent
# frame whose index runs 0..n-1. Resetting a boolean-indexed slice in place
# leaves pandas' copy flag set, which triggers SettingWithCopyWarning as soon
# as a column is assigned to the result.
dataset_filtered = dataset[dataset['prompt'].apply(is_proper_sentence)].reset_index(drop=True)


dataset_filtered

# DEPLOY ELIZA MENTOR @PROFESSOR_HENRY_HIGGINS

In [None]:
# ELIZA Keywords

# Pre-substitution table: word-for-word replacements that normalise
# contractions and synonyms to the spellings the keyword patterns expect.
PRES = dict([
    # contractions written without the apostrophe
    ("don't", "dont"),
    ("can't", "cant"),
    ("won't", "wont"),
    ("ain't", "aint"),
    # synonyms folded onto a single keyword
    ("recollect", "remember"),
    ("recall", "remember"),
    ("dreamt", "dreamed"),
    ("dreams", "dream"),
    ("maybe", "perhaps"),
    ("certainly", "yes"),
    ("machine", "computer"),
    ("machines", "computer"),
    ("were", "was"),
    # contractions expanded to two words
    ("you're", "you are"),
    ("it's", "it is"),
    ("i'm", "i am"),
    # similarity words
    ("same", "alike"),
    ("identical", "alike"),
    ("equivalent", "alike"),
])

# Post-substitution table: word-for-word replacements that flip the speaker's
# point of view before a captured fragment is echoed back in a reply.
POSTS = dict([
    # first person -> second person
    ("am", "are"),
    ("was", "were"),
    ("i", "you"),
    ("i'm", "you are"),
    ("i'd", "you would"),
    ("i've", "you have"),
    ("i'll", "you will"),
    ("my", "your"),
    # second person -> first person
    ("are", "am"),
    ("you've", "I have"),
    ("you'll", "I will"),
    ("your", "my"),
    ("yours", "mine"),
    ("you", "I"),
    ("me", "you"),
])

# Natural Language Toolkit: Keyword Pairs
#
# Each entry is [regex pattern, list of reply templates]. analyze() tries the
# entries top to bottom with re.match (anchored at the start of the statement)
# and answers from the FIRST pattern that matches, so order matters. "{0}",
# "{1}" in a template are filled with the pattern's capture groups.
#
# Patterns are raw strings so that regex escapes such as "\?" are not read as
# (invalid) Python string escapes.
#
# NOTE(review): the later 'why dont you (.*)' and 'why cant i (.*)' entries can
# never be selected because the earlier 'why dont you ...', 'why cant i ...'
# and 'why (.*)' entries match the same statements first. They are kept as-is.
KEYWORDS = [
    [r'i desire (.*)', ["Why do you need {0}?", "Would it really help you to get {0}?", "Are you sure you need {0}?"]],
    [r'(.*) juice (.*)', ["It's nice and sweet. It's a really good batch!", "I have blueberry juice, apple juice, lemon juice...", "It's really good. You're going to love it."]],
    [r'(.*) i forget (.*)', ["Can you think of why you might forget {1}?", "Why can't you remember {1}?", "How often do you think of {1}?", "Does it bother you to forget that?", "Could it be a mental block?", "Are you generally forgetful?", "Do you think you are suppressing {1}?"]],
    [r'(.*) did you forget (.*)', ["Why do you ask?", "Are you sure you told me?", "Would it bother you if I forgot {1}?", "Why should I recall {1} just now?", "Tell me more about {1}."]],
    [r'(.*) name (.*)', ["I am not interested in names.", "I've told you before, I don't care about names -- please continue."]],
    [r'why dont you ([^\?]*)\??', ["Do you really think I don't {0}?", "Perhaps eventually I will {0}.", "Do you really want me to {0}?"]],
    [r'why cant i ([^\?]*)\??', ["Do you think you should be able to {0}?", "If you could {0}, what would you do?", "I don't know -- why can't you {0}?", "Have you really tried?"]],
    [r'i cant (.*)', ["How do you know you can't {0}?", "Perhaps you could {0} if you tried.", "What would it take for you to {0}?"]],
    [r'(.*) i am sad (.*)', ["I am sorry to hear that you are unhappy {1}.", "Do you think coming here will help you not to be depressed {1}?", "I'm sure it's not pleasant to be unhappy {1}.", "Can you explain what made you unhappy {1}?", "You sound really depressed.", "Do you feel inadequate because you are unhappy {1}?"]],
    [r'i am (.*)', ["Did you come to me because you are {0}?", "How long have you been {0}?", "How do you feel about being {0}?"]],
    [r'im (.*)', ["How does being {0} make you feel?", "Do you enjoy being {0}?", "Why do you tell me you're {0}?", "Why do you think you're {0}?"]],
    [r'are you ([^\?]*)\??', ["Why does it matter whether I am {0}?", "Would you prefer it if I were not {0}?", "Perhaps you believe I am {0}.", "I may be {0} -- what do you think?", "Why are you interested in whether I am {0} or not ?", "Would you prefer if I weren't {0} ?", "Perhaps I am {0} in your fantasies.", "Do you sometimes think I am {0} ?", "Would it matter to you ?", "What if I were {0} ?"]],
    [r'what (.*)', ["Why do you ask?", "How would an answer to that help you?", "What do you think?", "Does that question interest you?", "What is it you really want to know?", "Are such questions much on your mind?", "What answer would please you most?", "What comes to mind when you ask that?", "Have you asked such questions before?", "Have you asked anyone else?"]],
    [r'how (.*)', ["How do you suppose?", "Perhaps you can answer your own question.", "What is it you're really asking?"]],
    [r'because (.*)', ["Is that the real reason?", "What other reasons come to mind?", "Does that reason apply to anything else?", "If {0}, what else must be true?", "Is that the real reason?", "Don't any other reasons come to mind?", "Does that reason seem to explain anything else?", "What other reasons might there be?"]],
    [r'(.*) sorry (.*)', ["There are many times when no apology is needed.", "Apologies are not necessary.", "I have told you that apologies are not required.", "It did not bother me.  Please continue.", "What feelings do you have when you apologize?"]],
    [r'hello(.*)', ["Hello... I'm glad you could drop by today.", "Hi there... how are you today?", "Hello, how are you feeling today?"]],
    [r'i think (.*)', ["Do you doubt {0}?", "Do you really think so?", "But you're not sure {0}?"]],
    [r'(.*) friend (.*)', ["Tell me more about your friends.", "When you think of a friend, what comes to mind?", "Why don't you tell me about a childhood friend?"]],
    [r'yes', ["You seem quite sure.", "OK, but can you elaborate a bit?"]],
    [r'(.*) computer(.*)', ["Are you really talking about me?", "Does it seem strange to talk to a computer?", "How do computers make you feel?", "Do you feel threatened by computers?"]],
    [r'is it (.*)', ["Do you think it is {0}?", "Perhaps it's {0} -- what do you think?", "If it were {0}, what would you do?", "It could well be that {0}."]],
    [r'it is (.*)', ["You seem very certain.", "If I told you that it probably isn't {0}, what would you feel?"]],
    [r'can you ([^\?]*)\??', ["What makes you think I can't {0}?", "If I could {0}, then what?", "Why do you ask if I can {0}?"]],
    [r'can i ([^\?]*)\??', ["Perhaps you don't want to {0}.", "Do you want to be able to {0}?", "If you could {0}, would you?"]],
    [r'you are (.*)', ["Why do you think I am {0}?", "Does it please you to think that I am {0}?", "Perhaps you would like me to be {0}.", "Perhaps you're really talking about yourself?"]],
    [r'youre (.*)', ["Why do you say I am {0}?", "Why do you think I am {0}?", "Are we talking about you, or me?"]],
    [r'i dont (.*)', ["Don't you really {0}?", "Why don't you {0}?", "Do you want to {0}?"]],
    [r'i feel (.*)', ["Good, tell me more about these feelings.", "Do you often feel {0}?", "When do you usually feel {0}?", "When you feel {0}, what do you do?"]],
    [r'i have (.*)', ["Why do you tell me that you've {0}?", "Have you really {0}?", "Now that you have {0}, what will you do next?"]],
    [r'i would (.*)', ["Could you explain why you would {0}?", "Why would you {0}?", "Who else knows that you would {0}?"]],
    [r'is there (.*)', ["Do you think there is {0}?", "It's likely that there is {0}.", "Would you like there to be {0}?"]],
    [r'my (.*)', ["I see, your {0}.", "Why do you say that your {0}?", "When your {0}, how do you feel?"]],
    [r'you (.*)', ["We should be discussing you, not me.", "Why do you say that about me?", "Why do you care whether I {0}?"]],
    [r'why (.*)', ["Why don't you tell me the reason why {0}?", "Why do you think {0}?"]],
    [r'why dont you (.*)', ["Do you believe I do not {0}?", "Perhaps I will {0} in good time.", "Should you {0} yourself?", "You want me to {0}?"]],
    [r'why cant i (.*)', ["Do you think you should be able to {0}?", "Do you want to be able to {0}?", "Do you believe this will help you to {0}?", "Have you any idea why you can't {0}?"]],
    [r'everyone (.*)', ["Really, {0}?", "Surely not {0}.", "Can you think of anyone in particular?", "Who, for example?", "Are you thinking of a very special person?", "Who, may I ask?", "Someone special perhaps?", "You have a particular person in mind, yes?", "Who do you think you're talking about?"]],
    [r'i want (.*)', ["What would it mean to you if you got {0}?", "Why do you want {0}?", "What would you do if you got {0}?", "If you got {0}, then what would you do?"]],
    [r'(.*) mother (.*)', ["Tell me more about your mother.", "What was your relationship with your mother like?", "How do you feel about your mother?", "How does this relate to your feelings today?", "Good family relations are important."]],
    [r'(.*) father (.*)', ["Tell me more about your father.", "How did your father make you feel?", "How do you feel about your father?", "Does your relationship with your father relate to your feelings today?", "Do you have trouble showing affection with your family?"]],
    [r'(.*) child (.*)', ["Did you have close friends as a child?", "What is your favorite childhood memory?", "Do you remember any dreams or nightmares from childhood?", "Did the other children sometimes tease you?", "How do you think your childhood experiences relate to your feelings today?"]],
    [r'am i (.*)', ["Do you believe you are {0}?", "Would you want to be {0}?", "Do you wish I would tell you you are {0}?", "What would it mean if you were {0}?"]],
    [r'(.*) if (.*)', ["Do you think it's likely that {1}?", "Do you wish that {1}?", "What do you know about {1}?", "Really, if {1}?", "What would you do if {1}?", "But what are the chances that {1}?", "What does this speculation lead to?"]],
    [r'(.*) always (.*)', ["Can you think of a specific example?", "When?", "What incident are you thinking of?", "Really, always?"]],
    [r'(.*) alike', ["In what way?", "What resemblance do you see?", "What does that similarity suggest to you?", "What other connections do you see?", "What do you suppose that resemblence means?", "What is the connection, do you suppose?", "Could there really be some connection?", "How?"]],
    [r'like', ["In what way?", "What resemblance do you see?", "What does that similarity suggest to you?", "What other connections do you see?", "What do you suppose that resemblence means?", "What is the connection, do you suppose?", "Could there really be some connection?", "How?"]],
    [r'(.*) my family (.*)', ["Tell me more about your family.", "Who else in your family {1}?", "Your {0}?", "What else comes to your mind when you think of your {1}?"]],
    [r'(.*) my (.*)', ["Your {1}?", "Why do you say your {1}?", "Is it important to you that your {1}?"]],
    # Question catch-all. The "?" is escaped so this entry only matches text
    # that really contains a question mark; unescaped, "(.*)?" is an optional
    # group that matches every input and hides the general catch-all below.
    [r'(.*)\?', ["Why do you ask that?", "Please consider whether you can answer your own question.", "Perhaps the answer lies within yourself?", "Why don't you tell me?"]],
    # General catch-all: matches anything, so it must stay last.
    [r'(.*)', ["Please tell me more.", "Let's change focus a bit... Tell me about your family.", "Can you elaborate on that?", "Why do you say that {0}?", "I see.", "Very interesting.", "{0}.", "I see.  And what does that tell you?", "How does that make you feel?", "How do you feel when you say that?", "I'm not sure I understand you fully.", "Please go on.", "What does that suggest to you?", "Do you feel strongly about discussing such things?", "That is interesting.  Please continue.", "Tell me more about that.", "Does talking about this bother you?", "Why not? You should have some more juice!"]],
]

In [None]:
# ELIZA Main Script - TEAM MIND INTERFACES (LABLAB 24-HOUR FINE-TUNING HACKATHON)

import re
import random


# Shared word-replacement step used by both pre() and post()
def substitute_words(text, table):
    """Lower-case ``text`` and replace each whitespace-separated word found in ``table``.

    Args:
        text: The string to rewrite.
        table: Mapping from a lower-case word to its replacement.

    Returns:
        The words of ``text`` joined by single spaces, with every word that is
        a key of ``table`` swapped for its value. A word must match a key
        exactly, so a word with punctuation attached (e.g. "you.") is left
        unchanged.
    """
    return ' '.join(table.get(word, word) for word in text.lower().split())


# Function to apply pre-substitution on the statement
def pre(statement):
    """Normalise ``statement`` with the PRES table before pattern matching.

    Returns the lower-cased statement with PRES replacements applied.
    """
    return substitute_words(statement, PRES)


# Function to apply post-substitution on the statement
def post(fragment):
    """Flip the point of view of a captured ``fragment`` using the POSTS table.

    Returns the lower-cased fragment with POSTS replacements applied.
    """
    return substitute_words(fragment, POSTS)

# Function to analyze the statement and generate a response
def analyze(statement):
    """Produce an ELIZA-style reply to ``statement``.

    The statement is normalised with pre(), trailing "." and "!" characters
    are removed, and the KEYWORDS patterns are tried in order with re.match.
    For the first pattern that matches, one of its reply templates is chosen
    at random and its placeholders are filled with the post()-processed
    capture groups.

    Returns:
        The formatted reply, or None when no pattern matches.
    """
    # The text being matched is the same for every pattern, so build it once.
    cleaned = pre(statement).rstrip(".!")
    for pattern, responses in KEYWORDS:
        found = re.match(pattern, cleaned)
        if not found:
            continue
        template = random.choice(responses)
        reflected = [post(group) for group in found.groups()]
        return template.format(*reflected)
    return None

In [None]:
%%time
from tqdm import tqdm
# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# analyze() picks its reply with random.choice, so the generator is seeded
# here to make the generated dataset identical on every run of this cell.
random.seed(1)

# Create the 'generation' column from the 'prompt' column, with a progress bar.
# assign() returns a new frame, so no column is written into something pandas
# may still regard as a slice of 'dataset' (the SettingWithCopyWarning case).
dataset_filtered = dataset_filtered.assign(
    generation=dataset_filtered['prompt'].progress_apply(analyze)
)


In [None]:
# Inspect the frame now that the 'generation' column has been added.
dataset_filtered

In [None]:
%%time
# CHECKPOINT: save the prompt/generation pairs as JSON Lines (one record per line).
checkpoint_path = 'ELIZA_dataset.json'
dataset_filtered.to_json(checkpoint_path, orient='records', lines=True)

In [None]:
%%time
# Build the plotting frame separately so the length column never lands on
# dataset_filtered; assign() returns a new frame with the extra column.
dataset_filtered_copy = dataset_filtered.assign(
    gen_length=dataset_filtered['generation'].map(len)
)

# Histogram (with KDE overlay) of response lengths in characters.
ax = sns.histplot(dataset_filtered_copy['gen_length'], kde=True)
ax.set_title('Distribution of Response Lengths')
ax.set_xlabel('Text Length')
ax.set_ylabel('Frequency')
plt.show()

# Summarise the plotting frame and preview its first rows.
dataset_filtered_copy.info()
dataset_filtered_copy.head()


In [None]:
%%time
# Display the 10 most frequent responses in the 'generation' column
# (value_counts() sorts by count, highest first).
response_counts = dataset_filtered_copy['generation'].value_counts()
top_10_responses = response_counts.iloc[:10]
top_10_responses

In [None]:
# The four replies to be down-sampled.
overrepresented_responses = [
    'Why do you ask that?',
    'Please consider whether you can answer your own question.',
    'Perhaps the answer lies within yourself?',
    "Why don't you tell me?",
]

# True for every row whose response is one of the four above.
mask = dataset_filtered['generation'].isin(overrepresented_responses)

# Split the data into the over-represented rows and everything else.
top_4_df = dataset_filtered.loc[mask]
other_df = dataset_filtered.loc[~mask]

# Keep a reproducible random 10% of the over-represented rows.
top_4_df_reduced = top_4_df.sample(frac=0.1, random_state=1)

# Reduced rows first, then the untouched rows, with a fresh 0..n-1 index.
balanced_dataset = pd.concat([top_4_df_reduced, other_df], ignore_index=True)

# Show some details about the balanced dataset
balanced_dataset.info()

In [None]:
# Build the plotting frame from the balanced dataset; assign() returns a new
# frame, so balanced_dataset itself is left without the length column.
dataset_filtered_copy = balanced_dataset.assign(
    gen_length=balanced_dataset['generation'].map(len)
)

# Histogram (with KDE overlay) of response lengths in characters.
ax = sns.histplot(dataset_filtered_copy['gen_length'], kde=True)
ax.set_title('Distribution of Response Lengths')
ax.set_xlabel('Text Length')
ax.set_ylabel('Frequency')
plt.show()

# Summarise the plotting frame and preview its first rows.
dataset_filtered_copy.info()
dataset_filtered_copy.head()

In [None]:
%%time
# Display the 10 most frequent responses in the 'generation' column
# (value_counts() sorts by count, highest first).
response_counts = dataset_filtered_copy['generation'].value_counts()
top_10_responses = response_counts.iloc[:10]
top_10_responses

In [None]:
# Responses whose row counts are cut to roughly one seventh.
responses_to_reduce_7th = [
    "Perhaps the answer lies within yourself?",
    "Please consider whether you can answer your own question.",
    "We should be discussing you, not me.",
    "Why do you say that about me?",
    "Why don't you tell me?",
    "Why do you ask that?"
]

# Process one response at a time: set its rows aside, keep a reproducible
# random 1/7 of them, and append those after the untouched rows. The append
# order is kept exactly because the seeded samples depend on row order.
dataset_reduced_7th = dataset_filtered_copy.copy()
for target_response in responses_to_reduce_7th:
    mask = dataset_reduced_7th['generation'] == target_response
    untouched_rows = dataset_reduced_7th.loc[~mask]
    rows_to_keep = dataset_reduced_7th.loc[mask].sample(frac=1/7, random_state=1)
    dataset_reduced_7th = pd.concat([untouched_rows, rows_to_keep])

# Shuffle so the kept rows are mixed in with the rest, then renumber the index.
shuffled = dataset_reduced_7th.sample(frac=1, random_state=1)
dataset_reduced_7th = shuffled.reset_index(drop=True)

# Row count after the reduction, plus the ten most frequent responses.
len(dataset_reduced_7th), dataset_reduced_7th['generation'].value_counts().head(10)

In [None]:
# Build the plotting frame from the reduced dataset; assign() returns a new
# frame and (re)computes the length column from the current responses.
dataset_filtered_copy = dataset_reduced_7th.assign(
    gen_length=dataset_reduced_7th['generation'].map(len)
)

# Histogram (with KDE overlay) of response lengths in characters.
ax = sns.histplot(dataset_filtered_copy['gen_length'], kde=True)
ax.set_title('Distribution of Response Lengths')
ax.set_xlabel('Text Length')
ax.set_ylabel('Frequency')
plt.show()

# Summarise the plotting frame and preview its first rows.
dataset_filtered_copy.info()
dataset_filtered_copy.head()

In [None]:
%%time
# CHECKPOINT: save the final dataset as JSON Lines (one record per line).
# NOTE(review): dataset_filtered_copy also carries the 'gen_length' column that
# was added for plotting, so that column is exported too -- confirm this is intended.
instruct_path = 'ELIZA_INSTRUCT.json'
dataset_filtered_copy.to_json(instruct_path, orient='records', lines=True)