In [1]:
import os
import json

In [None]:
# Directory containing JSON files
json_dir = "Raw_data"  # Change this to your actual directory

# List to store all merged data
merged_data = []

# Iterate over all JSON files in the directory
for filename in os.listdir(json_dir):
    if filename.endswith(".json"):  # Ensure it's a JSON file
        file_path = os.path.join(json_dir, filename)

        # Load JSON file
        with open(file_path, "r", encoding="utf-8") as file:
            json_data = json.load(file)

            # Append each file's data into merged_data
            merged_data.extend(json_data)

# Save the merged data to a single JSON file
with open("merged_questionnaires.json", "w", encoding="utf-8") as output_file:
    json.dump(merged_data, output_file, indent=4)

print("Merged JSON data with all information saved successfully!")

Merged JSON data with all information saved successfully!


In [3]:
import pandas as pd

In [None]:
with open("merged_questionnaires.json", "r", encoding="utf-8") as file:
    json_data = json.load(file)

# Extract relevant fields
data = []
for entry in json_data:
    question_type = entry["type"]
    question = entry["question"]
    for option in entry["options"]:
        data.append([question_type, question, option["option"]])

# Create DataFrame
df_questionnaires = pd.DataFrame(data, columns=["Type", "Question", "Label"])
print(df_questionnaires)


              Type                  Question              Label
0    SINGLE_SELECT             Customer type       New customer
1    SINGLE_SELECT             Customer type  Existing customer
2    SINGLE_SELECT             Customer type            Partner
3    SINGLE_SELECT             Customer type          Applicant
4    SINGLE_SELECT     Customer satisfaction     Very satisfied
..             ...                       ...                ...
111   MULTI_SELECT  Who to copy in follow up      Sandro Kalter
112   MULTI_SELECT  Who to copy in follow up     Jens Roschmann
113   MULTI_SELECT  Who to copy in follow up       Domiki Stein
114   MULTI_SELECT  Who to copy in follow up        Sean Kennin
115   MULTI_SELECT  Who to copy in follow up        Tim Persson

[116 rows x 3 columns]


In [16]:
df_single_select_questions = df_questionnaires[df_questionnaires["Type"] == "SINGLE_SELECT"]
df_single_select_questions

Unnamed: 0,Type,Question,Label
0,SINGLE_SELECT,Customer type,New customer
1,SINGLE_SELECT,Customer type,Existing customer
2,SINGLE_SELECT,Customer type,Partner
3,SINGLE_SELECT,Customer type,Applicant
4,SINGLE_SELECT,Customer satisfaction,Very satisfied
5,SINGLE_SELECT,Customer satisfaction,Satisfied
6,SINGLE_SELECT,Customer satisfaction,Unsatisfied
7,SINGLE_SELECT,Customer satisfaction,Very unsatisfied
8,SINGLE_SELECT,Size of the trade fair team (on average),1-5
9,SINGLE_SELECT,Size of the trade fair team (on average),6-10


In [5]:
df_multi_select_questions = df_questionnaires[df_questionnaires["Type"] == "MULTI_SELECT"]
df_multi_select_questions

Unnamed: 0,Type,Question,Label
23,MULTI_SELECT,Productinterests,BusinessCards
24,MULTI_SELECT,Productinterests,DataEnrichment
25,MULTI_SELECT,Productinterests,VisitReport
26,MULTI_SELECT,Productinterests,Data Cleansing
27,MULTI_SELECT,Productinterests,DataQuality
28,MULTI_SELECT,Searches a solution for,Scan business cards
29,MULTI_SELECT,Searches a solution for,Clean up CRM
30,MULTI_SELECT,Searches a solution for,Extract data from emails
31,MULTI_SELECT,Searches a solution for,Improve CRM data quality
32,MULTI_SELECT,Searches a solution for,Capture trade fair contacts


In [6]:
import google.generativeai as genai

In [83]:
import os

In [85]:
genai.configure(api_key=os.getenv('gemini_api'))

In [8]:
def api_call_for_generating_question(question):
    try:
        model = genai.GenerativeModel("gemini-1.5-flash")
        #prompt = f"Generate diverse responses for the following question with the answer label '{label}': {question}"
        prompt = f"Generate a full understandable and short question based on the following: {question}. Direct the message to me. Print the question only!"

        response = model.generate_content(prompt)
        #print(response.text.strip())
        return response.text.strip()
    except Exception as e:
        print(f"Error with Gemini API: {e}")
        return prompt  # Fallback to original if API fails

In [9]:
import time

In [10]:
def generate_question(df):
    generated_questions = dict()

    for question in df["Question"]:

        if question not in generated_questions.keys():
            full_question = api_call_for_generating_question(question)
            generated_questions[question] = full_question
            print(f"{question}: {full_question}")
            time.sleep(3)
            
    df['Question'] = df['Question'].map(generated_questions)
    #print("Questions in dataframe with new Questions replaced.")

    return df


In [22]:
df_single_select_questions = generate_question(df_single_select_questions)

Customer type: What type of customer are you?
Customer satisfaction: How satisfied are you with our service?
Size of the trade fair team (on average): What's the average size of your trade fair team?
CRM-System: What CRM system are you currently using?
Next steps: What are the next steps?
Which language is wanted for communication? : What language should we use to communicate?
What type of company is it?: What kind of company is this?
What is the size of your company?: How many people work at your company?
Would you like to receive marketing information from via e-mail?: Want email marketing updates?
What industry are you operating in?: What industry are you in?
Data processing consent: Do you consent to the processing of your data?
Customer group: What is the specific customer group you're targeting?


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Question'] = df['Question'].map(generated_questions)


In [23]:
df_single_select_questions

Unnamed: 0,Type,Question,Label
0,SINGLE_SELECT,What type of customer are you?,New customer
1,SINGLE_SELECT,What type of customer are you?,Existing customer
2,SINGLE_SELECT,What type of customer are you?,Partner
3,SINGLE_SELECT,What type of customer are you?,Applicant
4,SINGLE_SELECT,How satisfied are you with our service?,Very satisfied
5,SINGLE_SELECT,How satisfied are you with our service?,Satisfied
6,SINGLE_SELECT,How satisfied are you with our service?,Unsatisfied
7,SINGLE_SELECT,How satisfied are you with our service?,Very unsatisfied
8,SINGLE_SELECT,What's the average size of your trade fair team?,1-5
9,SINGLE_SELECT,What's the average size of your trade fair team?,6-10


In [11]:
df_multi_select_questions = generate_question(df_multi_select_questions)

Productinterests: What are your product interests?
Searches a solution for: What problem are you trying to solve?
What is the type of contact?: What type of contact is it?
What is the contact person interested in?: What are you interested in?
When does the contact person wish to receive a follow up?: When would you like a follow-up?
What products are you interested in?: What products interest you?
Products interested in: What products are you interested in?
What kind of follow up is planned: What follow-up is planned?
Who to copy in follow up: Who should I CC on the follow-up?


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Question'] = df['Question'].map(generated_questions)


In [12]:
df_multi_select_questions

Unnamed: 0,Type,Question,Label
23,MULTI_SELECT,What are your product interests?,BusinessCards
24,MULTI_SELECT,What are your product interests?,DataEnrichment
25,MULTI_SELECT,What are your product interests?,VisitReport
26,MULTI_SELECT,What are your product interests?,Data Cleansing
27,MULTI_SELECT,What are your product interests?,DataQuality
28,MULTI_SELECT,What problem are you trying to solve?,Scan business cards
29,MULTI_SELECT,What problem are you trying to solve?,Clean up CRM
30,MULTI_SELECT,What problem are you trying to solve?,Extract data from emails
31,MULTI_SELECT,What problem are you trying to solve?,Improve CRM data quality
32,MULTI_SELECT,What problem are you trying to solve?,Capture trade fair contacts


In [26]:
def make_api_call_for_answers(question, label):
    try:
        model = genai.GenerativeModel("gemini-1.5-flash")
        #prompt = f"Generate diverse responses for the following question with the answer label '{label}': {question}"
        prompt = f"Generate 100 full diverse answers as one sentence split in rows for the following context '{question}' with the answer label : '{label}'. Print the answers ONLY. If the label is yes or no also include answers without the label."
        
        response = model.generate_content(prompt)
        print(f"Answers for Question \"{question}\" with label \"{label}\" generated.")
        return response.text.strip()
    except Exception as e:
        print(f"Error with Gemini API: {e}")
        return prompt  # Fallback to original if API fails

In [13]:
def make_api_call_for_answers_multiselect(question, labels):
    try:
        model = genai.GenerativeModel("gemini-1.5-flash")
        #prompt = f"Generate diverse responses for the following question with the answer label '{label}': {question}"
        #prompt = f"Generate 100 full diverse answers as one sentence split in rows for the following context '{question}' with the possible answer labels : '{labels}'. Consider that multiple labels can be selected for answering. Print the answers followed up by the labels you used to answer in a new line ONLY. Also include answers without the label."
                
        prompt = f"Generate 100 full diverse answers as one sentence split in rows for the following context '{question}' with the possible answer labels : '{labels}'. Consider that multiple labels can be selected for answering, so include answers with all the possible combinations. Print in the generated answer followd up by the used labels in brackets ONLY. Also include answers without the label. Do not print additional information."

        response = model.generate_content(prompt).text.strip()
        print(f"Answers for Question \"{question}\" with label \"{labels}\" generated.")
        return response
    except Exception as e:
        print(f"Error with Gemini API: {e}")
        return prompt  # Fallback to original if API fails

In [14]:
import datetime

In [28]:
def generate_diverse_answers(df):

    generated_answers = []

    for _, row in df.iterrows():
        type = row["Type"]
        question = row["Question"]
        label = row["Label"]

        answers = make_api_call_for_answers(question, label)

        each_response = answers.split("\n")

        for curr_response in each_response:
                generated_answers.append({
                    #'question_id': row['question_id'],
                    'question': question,
                    'type': type,
                    'answer_text': curr_response,
                    'answer_label': label,
                    'timestamp': datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")  # Aktueller Timestamp
                })
        
        time.sleep(3)

    return generated_answers

In [15]:
import re

In [27]:
import pdb


In [35]:
def generate_diverse_answers_multi_select(df):

    generated_answers = []
    processed_questions = set()

    for _, row in df.iterrows():
        type = row["Type"]
        question = row["Question"]
        labels = df[df['Question'] == question]['Label'].tolist()

        if question not in processed_questions:

            processed_questions.add(question)

            answers = make_api_call_for_answers_multiselect(question, labels)

            #answers.pop(0)
            #answers.pop()

            answers_with_tags = []
            pattern = r'^(.*?)\s+\[([^\]]+)\]$'

            for answer in answers.split("\n"):
                
                # Using re.match to extract both text and tags
                match = re.match(pattern, answer)
                #print(match.group(0))
                #print(match.group(1))
                #
                #print(match.group(2))

                try:
                    # Your code that may raise an exception
                    if match is not None:
                        answers_with_tags.append({
                        "answer": match.group(1),
                        "label": match.group(2)
                        })
                except AttributeError as e:  # Replace TypeError with the specific exception you want to catch
                    pdb.set_trace()  # Opens an interactive debugger session
                    print(f"Caught an exception: {e}")
                                
                #answers_with_tags[match.group(1)] = match.group(2)
            
            for row in answers_with_tags:
                generated_answers.append({
                        #'question_id': row['question_id'],
                        'question': question,
                        'type': type,
                        'answer_text': row["answer"],
                        'answer_label': row["label"],
                        'timestamp': datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")  # Aktueller Timestamp
                })
            
            time.sleep(3)

    return generated_answers

In [53]:
df_multi_select_with_new_q_and_a = generate_diverse_answers_multi_select(df_multi_select_questions)

Answers for Question "What are your product interests?" with label "['BusinessCards', 'DataEnrichment', 'VisitReport', 'Data Cleansing', 'DataQuality']" generated.
Answers for Question "What problem are you trying to solve?" with label "['Scan business cards', 'Clean up CRM', 'Extract data from emails', 'Improve CRM data quality', 'Capture trade fair contacts']" generated.
Answers for Question "What type of contact is it?" with label "['Existing customer', 'Supplier', 'New customer / Prospect', 'Press / media', 'Competitor']" generated.
Answers for Question "What are you interested in?" with label "['100 Additive Manufacturing', '200 Automation', '300 Advanced Manufacturing', '234 Assembly Systems', '256 Joining Systems for large components', 'Others']" generated.
Answers for Question "When would you like a follow-up?" with label "['1 week', '2 weeks', '3 weeks']" generated.
Answers for Question "What products interest you?" with label "['Automotive radar target simulation', 'Noise fig

In [70]:
for item in df_multi_select_with_new_q_and_a:
    item['answer_label'] = item['answer_label'].replace('/', ',')

In [71]:
df_multi_select_with_new_q_and_a 

[{'question': 'What are your product interests?',
  'type': 'MULTI_SELECT',
  'answer_text': "I'm interested in improving data quality and generating business cards.",
  'answer_label': 'DataQuality, BusinessCards',
  'timestamp': '2025-01-25T21:31:30.976867'},
 {'question': 'What are your product interests?',
  'type': 'MULTI_SELECT',
  'answer_text': 'My priorities are data enrichment and visit reports.',
  'answer_label': 'DataEnrichment, VisitReport',
  'timestamp': '2025-01-25T21:31:30.976977'},
 {'question': 'What are your product interests?',
  'type': 'MULTI_SELECT',
  'answer_text': 'I need data cleansing and improved data quality.',
  'answer_label': 'Data Cleansing, DataQuality',
  'timestamp': '2025-01-25T21:31:30.977001'},
 {'question': 'What are your product interests?',
  'type': 'MULTI_SELECT',
  'answer_text': "I'm focused on business cards and data cleansing.",
  'answer_label': 'BusinessCards, Data Cleansing',
  'timestamp': '2025-01-25T21:31:30.977012'},
 {'question

In [29]:
df_single_select_with_new_q_and_a = generate_diverse_answers(df_single_select_questions)

Answers for Question "What type of customer are you?" with label "New customer" generated.
Answers for Question "What type of customer are you?" with label "Existing customer" generated.
Answers for Question "What type of customer are you?" with label "Partner" generated.
Answers for Question "What type of customer are you?" with label "Applicant" generated.
Answers for Question "How satisfied are you with our service?" with label "Very satisfied" generated.
Answers for Question "How satisfied are you with our service?" with label "Satisfied" generated.
Answers for Question "How satisfied are you with our service?" with label "Unsatisfied" generated.
Answers for Question "How satisfied are you with our service?" with label "Very unsatisfied" generated.
Answers for Question "What's the average size of your trade fair team?" with label "1-5" generated.
Answers for Question "What's the average size of your trade fair team?" with label "6-10" generated.
Answers for Question "What's the ave

In [32]:
df_single_select_with_new_q_and_a

[{'question': 'What type of customer are you?',
  'type': 'SINGLE_SELECT',
  'answer_text': "I'm a first-time buyer exploring your offerings.",
  'answer_label': 'New customer',
  'timestamp': '2025-01-24T00:52:21.492617'},
 {'question': 'What type of customer are you?',
  'type': 'SINGLE_SELECT',
  'answer_text': 'This is my initial purchase from your company.',
  'answer_label': 'New customer',
  'timestamp': '2025-01-24T00:52:21.492675'},
 {'question': 'What type of customer are you?',
  'type': 'SINGLE_SELECT',
  'answer_text': "I'm a new customer looking for information.",
  'answer_label': 'New customer',
  'timestamp': '2025-01-24T00:52:21.492687'},
 {'question': 'What type of customer are you?',
  'type': 'SINGLE_SELECT',
  'answer_text': "I've never used your services before.",
  'answer_label': 'New customer',
  'timestamp': '2025-01-24T00:52:21.492696'},
 {'question': 'What type of customer are you?',
  'type': 'SINGLE_SELECT',
  'answer_text': "I'm a brand-new customer, exc

In [33]:
for entry in data:
    question_id = entry["question_id"]
    question = entry["question"]
    question_type = entry["type"]
    for answer in entry["answers"]:
        rows.append({
            "question_id": question_id,
            "question": question,
            "type": question_type,
            "answer_text": answer["text"],
            "answer_label": answer["label"],
            "timestamp": answer["timestamp"]
        })

{'question': 'What type of customer are you?', 'type': 'SINGLE_SELECT', 'answer_text': "I'm a first-time buyer exploring your offerings.", 'answer_label': 'New customer', 'timestamp': '2025-01-24T00:52:21.492617'}
{'question': 'What type of customer are you?', 'type': 'SINGLE_SELECT', 'answer_text': 'This is my initial purchase from your company.', 'answer_label': 'New customer', 'timestamp': '2025-01-24T00:52:21.492675'}
{'question': 'What type of customer are you?', 'type': 'SINGLE_SELECT', 'answer_text': "I'm a new customer looking for information.", 'answer_label': 'New customer', 'timestamp': '2025-01-24T00:52:21.492687'}
{'question': 'What type of customer are you?', 'type': 'SINGLE_SELECT', 'answer_text': "I've never used your services before.", 'answer_label': 'New customer', 'timestamp': '2025-01-24T00:52:21.492696'}
{'question': 'What type of customer are you?', 'type': 'SINGLE_SELECT', 'answer_text': "I'm a brand-new customer, excited to try your product.", 'answer_label': '

In [75]:
from collections import defaultdict

In [81]:
# Grouping data

def save_in_json(data, filename):
    grouped_data = defaultdict(lambda: {'type': None, 'answers': []})

    for entry in data:
        question = entry['question']
        if grouped_data[question]['type'] is None:
            grouped_data[question]['type'] = entry['type']
        grouped_data[question]['answers'].append({
            'answer_text': entry['answer_text'],
            'answer_label': entry['answer_label'].replace("  ", " ").strip(),
            'timestamp': entry['timestamp']
        })

    # Convert to final JSON structure
    final_json = [
        {
            'question': question,
            'type': details['type'],
            'answers': details['answers']
        }
        for question, details in grouped_data.items()
    ]

    # Save to file
    with open(filename, 'w') as f:
        json.dump(final_json, f, indent=4)

    # Print output
    print(json.dumps(final_json, indent=4))

In [41]:
save_in_json(df_single_select_with_new_q_and_a, 'final_single_question_data.json')

[
    {
        "question": "What type of customer are you?",
        "type": "SINGLE_SELECT",
        "answers": [
            {
                "answer_text": "I'm a first-time buyer exploring your offerings.",
                "answer_label": "New customer",
                "timestamp": "2025-01-24T00:52:21.492617"
            },
            {
                "answer_text": "This is my initial purchase from your company.",
                "answer_label": "New customer",
                "timestamp": "2025-01-24T00:52:21.492675"
            },
            {
                "answer_text": "I'm a new customer looking for information.",
                "answer_label": "New customer",
                "timestamp": "2025-01-24T00:52:21.492687"
            },
            {
                "answer_text": "I've never used your services before.",
                "answer_label": "New customer",
                "timestamp": "2025-01-24T00:52:21.492696"
            },
            {
                

In [82]:
save_in_json(df_multi_select_with_new_q_and_a, "final_multi_question_data.json")

[
    {
        "question": "What are your product interests?",
        "type": "MULTI_SELECT",
        "answers": [
            {
                "answer_text": "I'm interested in improving data quality and generating business cards.",
                "answer_label": "DataQuality, BusinessCards",
                "timestamp": "2025-01-25T21:31:30.976867"
            },
            {
                "answer_text": "My priorities are data enrichment and visit reports.",
                "answer_label": "DataEnrichment, VisitReport",
                "timestamp": "2025-01-25T21:31:30.976977"
            },
            {
                "answer_text": "I need data cleansing and improved data quality.",
                "answer_label": "Data Cleansing, DataQuality",
                "timestamp": "2025-01-25T21:31:30.977001"
            },
            {
                "answer_text": "I'm focused on business cards and data cleansing.",
                "answer_label": "BusinessCards, Data Cleansi