In [12]:
import os
import json

In [13]:
# Directory containing JSON files
json_dir = "Raw_data"  # Change this to your actual directory

# List to store all merged data
merged_data = []

# Iterate over all JSON files in the directory
for filename in os.listdir(json_dir):
    if filename.endswith(".json"):  # Ensure it's a JSON file
        file_path = os.path.join(json_dir, filename)

        # Load JSON file
        with open(file_path, "r", encoding="utf-8") as file:
            json_data = json.load(file)

            # Append each file's data into merged_data
            merged_data.extend(json_data)

# Save the merged data to a single JSON file
with open("merged_questionnaires.json", "w", encoding="utf-8") as output_file:
    json.dump(merged_data, output_file, indent=4)

print("Merged JSON data with all information saved successfully!")

Merged JSON data with all information saved successfully!


In [14]:
import pandas as pd

In [15]:
with open("merged_questionnaires.json", "r", encoding="utf-8") as file:
    json_data = json.load(file)

# Extract relevant fields
data = []
for entry in json_data:
    question_type = entry["type"]
    question = entry["question"]
    for option in entry["options"]:
        data.append([question_type, question, option["option"]])

# Create DataFrame
df_questionnaires = pd.DataFrame(data, columns=["Type", "Question", "Label"])
print(df_questionnaires)


              Type                  Question              Label
0    SINGLE_SELECT             Customer type       New customer
1    SINGLE_SELECT             Customer type  Existing customer
2    SINGLE_SELECT             Customer type            Partner
3    SINGLE_SELECT             Customer type          Applicant
4    SINGLE_SELECT     Customer satisfaction     Very satisfied
..             ...                       ...                ...
111   MULTI_SELECT  Who to copy in follow up      Sandro Kalter
112   MULTI_SELECT  Who to copy in follow up     Jens Roschmann
113   MULTI_SELECT  Who to copy in follow up       Domiki Stein
114   MULTI_SELECT  Who to copy in follow up        Sean Kennin
115   MULTI_SELECT  Who to copy in follow up        Tim Persson

[116 rows x 3 columns]


In [16]:
df_single_select_questions = df_questionnaires[df_questionnaires["Type"] == "SINGLE_SELECT"]
df_single_select_questions

Unnamed: 0,Type,Question,Label
0,SINGLE_SELECT,Customer type,New customer
1,SINGLE_SELECT,Customer type,Existing customer
2,SINGLE_SELECT,Customer type,Partner
3,SINGLE_SELECT,Customer type,Applicant
4,SINGLE_SELECT,Customer satisfaction,Very satisfied
5,SINGLE_SELECT,Customer satisfaction,Satisfied
6,SINGLE_SELECT,Customer satisfaction,Unsatisfied
7,SINGLE_SELECT,Customer satisfaction,Very unsatisfied
8,SINGLE_SELECT,Size of the trade fair team (on average),1-5
9,SINGLE_SELECT,Size of the trade fair team (on average),6-10


In [17]:
import google.generativeai as genai

In [18]:
genai.configure(api_key="")

In [19]:
def api_call_for_generating_question(question):
    try:
        model = genai.GenerativeModel("gemini-1.5-flash")
        #prompt = f"Generate diverse responses for the following question with the answer label '{label}': {question}"
        prompt = f"Generate a full understandable and short question based on the following: {question}. Direct the message to me. Print the question only!"

        response = model.generate_content(prompt)
        #print(response.text.strip())
        return response.text.strip()
    except Exception as e:
        print(f"Error with Gemini API: {e}")
        return prompt  # Fallback to original if API fails

In [20]:
import time

In [21]:
def generate_question(df):
    generated_questions = dict()

    for question in df["Question"]:

        if question not in generated_questions.keys():
            full_question = api_call_for_generating_question(question)
            generated_questions[question] = full_question
            print(f"{question}: {full_question}")
            time.sleep(3)
            
    df['Question'] = df['Question'].map(generated_questions)
    #print("Questions in dataframe with new Questions replaced.")

    return df


In [22]:
df_single_select_questions = generate_question(df_single_select_questions)

Customer type: What type of customer are you?
Customer satisfaction: How satisfied are you with our service?
Size of the trade fair team (on average): What's the average size of your trade fair team?
CRM-System: What CRM system are you currently using?
Next steps: What are the next steps?
Which language is wanted for communication? : What language should we use to communicate?
What type of company is it?: What kind of company is this?
What is the size of your company?: How many people work at your company?
Would you like to receive marketing information from via e-mail?: Want email marketing updates?
What industry are you operating in?: What industry are you in?
Data processing consent: Do you consent to the processing of your data?
Customer group: What is the specific customer group you're targeting?


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Question'] = df['Question'].map(generated_questions)


In [23]:
df_single_select_questions

Unnamed: 0,Type,Question,Label
0,SINGLE_SELECT,What type of customer are you?,New customer
1,SINGLE_SELECT,What type of customer are you?,Existing customer
2,SINGLE_SELECT,What type of customer are you?,Partner
3,SINGLE_SELECT,What type of customer are you?,Applicant
4,SINGLE_SELECT,How satisfied are you with our service?,Very satisfied
5,SINGLE_SELECT,How satisfied are you with our service?,Satisfied
6,SINGLE_SELECT,How satisfied are you with our service?,Unsatisfied
7,SINGLE_SELECT,How satisfied are you with our service?,Very unsatisfied
8,SINGLE_SELECT,What's the average size of your trade fair team?,1-5
9,SINGLE_SELECT,What's the average size of your trade fair team?,6-10


In [26]:
def make_api_call_for_answers(question, label):
    try:
        model = genai.GenerativeModel("gemini-1.5-flash")
        #prompt = f"Generate diverse responses for the following question with the answer label '{label}': {question}"
        prompt = f"Generate 100 full diverse answers as one sentence split in rows for the following context '{question}' with the answer label : '{label}'. Print the answers ONLY. If the label is yes or no also include answers without the label."
        
        response = model.generate_content(prompt)
        print(f"Answers for Question \"{question}\" with label \"{label}\" generated.")
        return response.text.strip()
    except Exception as e:
        print(f"Error with Gemini API: {e}")
        return prompt  # Fallback to original if API fails

In [27]:
import datetime

In [28]:
def generate_diverse_answers(df):

    generated_answers = []

    for _, row in df.iterrows():
        type = row["Type"]
        question = row["Question"]
        label = row["Label"]

        answers = make_api_call_for_answers(question, label)

        each_response = answers.split("\n")

        for curr_response in each_response:
                generated_answers.append({
                    #'question_id': row['question_id'],
                    'question': question,
                    'type': type,
                    'answer_text': curr_response,
                    'answer_label': label,
                    'timestamp': datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")  # Aktueller Timestamp
                })
        
        time.sleep(3)

    return generated_answers

In [29]:
df_single_select_with_new_q_and_a = generate_diverse_answers(df_single_select_questions)

Answers for Question "What type of customer are you?" with label "New customer" generated.
Answers for Question "What type of customer are you?" with label "Existing customer" generated.
Answers for Question "What type of customer are you?" with label "Partner" generated.
Answers for Question "What type of customer are you?" with label "Applicant" generated.
Answers for Question "How satisfied are you with our service?" with label "Very satisfied" generated.
Answers for Question "How satisfied are you with our service?" with label "Satisfied" generated.
Answers for Question "How satisfied are you with our service?" with label "Unsatisfied" generated.
Answers for Question "How satisfied are you with our service?" with label "Very unsatisfied" generated.
Answers for Question "What's the average size of your trade fair team?" with label "1-5" generated.
Answers for Question "What's the average size of your trade fair team?" with label "6-10" generated.
Answers for Question "What's the ave

In [32]:
df_single_select_with_new_q_and_a

[{'question': 'What type of customer are you?',
  'type': 'SINGLE_SELECT',
  'answer_text': "I'm a first-time buyer exploring your offerings.",
  'answer_label': 'New customer',
  'timestamp': '2025-01-24T00:52:21.492617'},
 {'question': 'What type of customer are you?',
  'type': 'SINGLE_SELECT',
  'answer_text': 'This is my initial purchase from your company.',
  'answer_label': 'New customer',
  'timestamp': '2025-01-24T00:52:21.492675'},
 {'question': 'What type of customer are you?',
  'type': 'SINGLE_SELECT',
  'answer_text': "I'm a new customer looking for information.",
  'answer_label': 'New customer',
  'timestamp': '2025-01-24T00:52:21.492687'},
 {'question': 'What type of customer are you?',
  'type': 'SINGLE_SELECT',
  'answer_text': "I've never used your services before.",
  'answer_label': 'New customer',
  'timestamp': '2025-01-24T00:52:21.492696'},
 {'question': 'What type of customer are you?',
  'type': 'SINGLE_SELECT',
  'answer_text': "I'm a brand-new customer, exc

In [33]:
for entry in data:
    question_id = entry["question_id"]
    question = entry["question"]
    question_type = entry["type"]
    for answer in entry["answers"]:
        rows.append({
            "question_id": question_id,
            "question": question,
            "type": question_type,
            "answer_text": answer["text"],
            "answer_label": answer["label"],
            "timestamp": answer["timestamp"]
        })

{'question': 'What type of customer are you?', 'type': 'SINGLE_SELECT', 'answer_text': "I'm a first-time buyer exploring your offerings.", 'answer_label': 'New customer', 'timestamp': '2025-01-24T00:52:21.492617'}
{'question': 'What type of customer are you?', 'type': 'SINGLE_SELECT', 'answer_text': 'This is my initial purchase from your company.', 'answer_label': 'New customer', 'timestamp': '2025-01-24T00:52:21.492675'}
{'question': 'What type of customer are you?', 'type': 'SINGLE_SELECT', 'answer_text': "I'm a new customer looking for information.", 'answer_label': 'New customer', 'timestamp': '2025-01-24T00:52:21.492687'}
{'question': 'What type of customer are you?', 'type': 'SINGLE_SELECT', 'answer_text': "I've never used your services before.", 'answer_label': 'New customer', 'timestamp': '2025-01-24T00:52:21.492696'}
{'question': 'What type of customer are you?', 'type': 'SINGLE_SELECT', 'answer_text': "I'm a brand-new customer, excited to try your product.", 'answer_label': '

In [36]:
from collections import defaultdict

In [40]:
# Grouping data

def save_in_json(data):
    grouped_data = defaultdict(lambda: {'type': None, 'answers': []})

    for entry in data:
        question = entry['question']
        if grouped_data[question]['type'] is None:
            grouped_data[question]['type'] = entry['type']
        grouped_data[question]['answers'].append({
            'answer_text': entry['answer_text'],
            'answer_label': entry['answer_label'],
            'timestamp': entry['timestamp']
        })

    # Convert to final JSON structure
    final_json = [
        {
            'question': question,
            'type': details['type'],
            'answers': details['answers']
        }
        for question, details in grouped_data.items()
    ]

    # Save to file
    with open('final_single_question_data.json', 'w') as f:
        json.dump(final_json, f, indent=4)

    # Print output
    print(json.dumps(final_json, indent=4))

In [41]:
save_in_json(df_single_select_with_new_q_and_a)

[
    {
        "question": "What type of customer are you?",
        "type": "SINGLE_SELECT",
        "answers": [
            {
                "answer_text": "I'm a first-time buyer exploring your offerings.",
                "answer_label": "New customer",
                "timestamp": "2025-01-24T00:52:21.492617"
            },
            {
                "answer_text": "This is my initial purchase from your company.",
                "answer_label": "New customer",
                "timestamp": "2025-01-24T00:52:21.492675"
            },
            {
                "answer_text": "I'm a new customer looking for information.",
                "answer_label": "New customer",
                "timestamp": "2025-01-24T00:52:21.492687"
            },
            {
                "answer_text": "I've never used your services before.",
                "answer_label": "New customer",
                "timestamp": "2025-01-24T00:52:21.492696"
            },
            {
                

In [None]:
#df_single_select_questions['Question'] = df_single_select_questions['Question'].map(generated_single_select_questions)

In [None]:
#df_single_select_questions