In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import google.generativeai as genai
import json
import time
import re

In [3]:
genai.configure(api_key='AIzaSyDTgtMv4zB-01k1yyjShjmMR3E9h2vQpjI')
cfg = genai.types.GenerationConfig(max_output_tokens=4000)
sys_msg_train = (
'''
  Create 4 question and answer pairs for the text where the questions make sense, have proper context, avoid mentioning non-existent algorithms, equations, images.Format the output as:
    Q: [Question]
    A: [Answer].
'''
)
print(sys_msg_train)
model_train = genai.GenerativeModel('gemini-1.5-flash', system_instruction=sys_msg_train)



  Create 4 question and answer pairs for the text where the questions make sense, have proper context, avoid mentioning non-existent algorithms, equations, images.Format the output as:
    Q: [Question]
    A: [Answer].



In [4]:
input_folder = "/content/drive/MyDrive/DeepLearning_FinalProject/Final Report Content/Data_Post-Restructuring_with_Additional_Contextual_Tags_for_Question_Answer_Pairs/"
output_folder = "/content/drive/MyDrive/DeepLearning_FinalProject/Final Report Content/GeminiPrompt_QuestionGeneration_and_Training_Validation_Split/"

qa_pairs_training = []
qa_pairs_validation = []

In [5]:
def extract_qa_pairs(text):
    # Initialize lists to hold questions and answers
    questions = []
    answers = []

    # Define a regex pattern for question-answer formats
    pattern_question = r"Q:.*\nA:.*"  # Matches Q: and A: pairs

    # Find all matches of question-answer pairs
    qa_matches = re.findall(pattern_question, text)

    for match in qa_matches:
        # Split each match into question and answer based on the "A:" marker
        question_answer = match.split("\nA:")

        if len(question_answer) == 2:  # Ensure it's a valid question-answer pair
            question = question_answer[0].replace("Q: ", "").strip()
            answer = question_answer[1].strip()

            questions.append(question)
            answers.append(answer)

    # Combine questions and answers into pairs
    qa_pairs = [{"question": q, "answer": a} for q, a in zip(questions, answers)]

    return qa_pairs


In [6]:
#Test Extaract Q and A Pairs
sample_generated_data = '''
Please provide the text you wish me to create questions and answers for.  I need the content of the post from andrewgelman.com to formulate relevant questions and answers.

Processed Q&A for entry 0
Q: What is the primary optimization method discussed in the provided text?
A: Gradient descent.

Q: What is mentioned regarding the broader field of optimization?
A: There is a vast and interesting body of work on the mathematical and algorithmic foundations of optimization.

Q:  Does the text delve deeply into the complexities of optimization algorithms?
A: No, the text states it will focus on a simplified method instead.

Q: What is the characteristic of the gradient descent method described in the text?
A: It is described as one of the simplest methods.

Unexpected error for entry 1: list index out of range
Q: What is the purpose of the hypothesis 'h' in the given context?
A: The hypothesis 'h' serves as a model to solve a regression problem, mapping inputs (x) to outputs (ŷ).

Q: What type of problem is being addressed using the hypothesis 'h'?
A: A regression problem.

Q: What does the hypothesis 'h' do with the inputs 'x'?
A: It maps the inputs 'x' to outputs 'ŷ'.

Q:  What is represented by 'ŷ'?
A: 'ŷ' represents the outputs predicted by the hypothesis 'h'.

Unexpected error for entry 2: list index out of range
Here are four question-answer pairs based on the provided text, which appears to represent a simple input-process-output model:
'''

extract_qa_pairs(sample_generated_data)

[{'question': 'What is the primary optimization method discussed in the provided text?',
  'answer': 'Gradient descent.'},
 {'question': 'What is mentioned regarding the broader field of optimization?',
  'answer': 'There is a vast and interesting body of work on the mathematical and algorithmic foundations of optimization.'},
 {'question': 'Does the text delve deeply into the complexities of optimization algorithms?',
  'answer': 'No, the text states it will focus on a simplified method instead.'},
 {'question': 'What is the characteristic of the gradient descent method described in the text?',
  'answer': 'It is described as one of the simplest methods.'},
 {'question': "What is the purpose of the hypothesis 'h' in the given context?",
  'answer': "The hypothesis 'h' serves as a model to solve a regression problem, mapping inputs (x) to outputs (ŷ)."},
 {'question': "What type of problem is being addressed using the hypothesis 'h'?",
  'answer': 'A regression problem.'},
 {'question'

In [7]:
max_retries = 1  # Maximum number of retry attempts

t = 0
for i in range(6, 7):
    with open(input_folder + f"ContextualTags_MIT6390Notes_Chapter{i}", "r") as file:
        json_data = json.load(file)
        print("On Chapter "+ f"ContextualTags_MIT6390Notes_Chapter{i}, number of entries: ", len(json_data))
        for entry in json_data:
            retries = 0
            while retries < max_retries:
                try:
                    generated_qa = model_train.generate_content(entry["text"], generation_config=cfg).text
                    questions_answers_pairs = extract_qa_pairs(generated_qa)
                    # Attach Tags

                    tagged_qa_pairs = [
                        {"question": qa["question"], "answer": qa["answer"], "tags": entry["tags"]}
                        for qa in questions_answers_pairs
                    ]

                    # Split into training and validation sets
                    if len(tagged_qa_pairs) == 4:
                        qa_pairs_training.extend(tagged_qa_pairs[:3])
                        qa_pairs_validation.append(tagged_qa_pairs[3])
                    else:
                        qa_pairs_training.extend(tagged_qa_pairs)


                    print(f"Processed Q&A for entry {t}")
                    t += 1
                    break
                except Exception as e:
                    retries += 1
                    print(f"Error for entry {t} (attempt {retries}): {e}")
                    time.sleep(60)  # Wait before retrying; Retry is usually because of exhausted quota or timeout
                    if retries == max_retries:
                        print(f"Failed to process entry {t} after {max_retries} attempts. Skipping...")
                        t += 1  # Move to next entry after max retries


On Chapter ContextualTags_MIT6390Notes_Chapter1, number of entries:  77
Processed Q&A for entry 0
Processed Q&A for entry 1
Processed Q&A for entry 2
Processed Q&A for entry 3
Processed Q&A for entry 4
Processed Q&A for entry 5
Processed Q&A for entry 6
Processed Q&A for entry 7
Processed Q&A for entry 8
Processed Q&A for entry 9
Processed Q&A for entry 10
Processed Q&A for entry 11
Processed Q&A for entry 12
Processed Q&A for entry 13
Processed Q&A for entry 14
Processed Q&A for entry 15
Processed Q&A for entry 16
Processed Q&A for entry 17
Processed Q&A for entry 18
Processed Q&A for entry 19
Processed Q&A for entry 20
Processed Q&A for entry 21
Error for entry 22 (attempt 1): contents must not be empty
Failed to process entry 22 after 1 attempts. Skipping...
Processed Q&A for entry 23
Processed Q&A for entry 24
Processed Q&A for entry 25
Processed Q&A for entry 26
Processed Q&A for entry 27
Processed Q&A for entry 28
Processed Q&A for entry 29
Processed Q&A for entry 30
Processed Q&



Processed Q&A for entry 39
Error for entry 40 (attempt 1): 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).
Failed to process entry 40 after 1 attempts. Skipping...
Processed Q&A for entry 41
Processed Q&A for entry 42
Processed Q&A for entry 43
Processed Q&A for entry 44
Processed Q&A for entry 45
Processed Q&A for entry 46
Processed Q&A for entry 47
Processed Q&A for entry 48
Processed Q&A for entry 49
Processed Q&A for entry 50
Processed Q&A for entry 51
Processed Q&A for entry 52
Processed Q&A for entry 53
Processed Q&A for entry 54
Processed Q&A for entry 55
Processed Q&A for entry 56
Processed Q&A for entry 57
Processed Q&A for entry 58
Processed Q&A for entry 59
Processed Q&A for entry 60
Processed Q&A for entry 61
Processed Q&A for entry 62
Processed Q&A for entry 63
Error for entry 64 (attempt 1): contents must not be empty
Failed to process entry

ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 229.56ms


Processed Q&A for entry 128
Processed Q&A for entry 129
Processed Q&A for entry 130
Processed Q&A for entry 131
Processed Q&A for entry 132
Processed Q&A for entry 133
Processed Q&A for entry 134
Error for entry 135 (attempt 1): contents must not be empty
Failed to process entry 135 after 1 attempts. Skipping...
Processed Q&A for entry 136
Processed Q&A for entry 137
Processed Q&A for entry 138


ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 130.61ms


Processed Q&A for entry 139
Processed Q&A for entry 140
Processed Q&A for entry 141
Processed Q&A for entry 142
Processed Q&A for entry 143
Processed Q&A for entry 144
Processed Q&A for entry 145
Processed Q&A for entry 146
Processed Q&A for entry 147
Processed Q&A for entry 148
Processed Q&A for entry 149
Processed Q&A for entry 150
On Chapter ContextualTags_MIT6390Notes_Chapter3, number of entries:  129
Processed Q&A for entry 151
Processed Q&A for entry 152
Processed Q&A for entry 153
Processed Q&A for entry 154
Processed Q&A for entry 155
Processed Q&A for entry 156
Processed Q&A for entry 157
Processed Q&A for entry 158




Processed Q&A for entry 159
Error for entry 160 (attempt 1): 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).
Failed to process entry 160 after 1 attempts. Skipping...
Processed Q&A for entry 161
Processed Q&A for entry 162
Processed Q&A for entry 163
Processed Q&A for entry 164
Processed Q&A for entry 165
Processed Q&A for entry 166
Processed Q&A for entry 167
Processed Q&A for entry 168
Processed Q&A for entry 169
Processed Q&A for entry 170
Processed Q&A for entry 171
Processed Q&A for entry 172
Processed Q&A for entry 173
Processed Q&A for entry 174
Processed Q&A for entry 175
Processed Q&A for entry 176
Processed Q&A for entry 177
Processed Q&A for entry 178
Processed Q&A for entry 179
Processed Q&A for entry 180
Processed Q&A for entry 181
Processed Q&A for entry 182
Processed Q&A for entry 183
Processed Q&A for entry 184




Processed Q&A for entry 185
Error for entry 186 (attempt 1): 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).
Failed to process entry 186 after 1 attempts. Skipping...
Processed Q&A for entry 187
Processed Q&A for entry 188
Processed Q&A for entry 189
Processed Q&A for entry 190
Processed Q&A for entry 191
Processed Q&A for entry 192
Processed Q&A for entry 193
Processed Q&A for entry 194
Processed Q&A for entry 195
Processed Q&A for entry 196
Processed Q&A for entry 197
Processed Q&A for entry 198
Error for entry 199 (attempt 1): 'text'
Failed to process entry 199 after 1 attempts. Skipping...
Processed Q&A for entry 200
Processed Q&A for entry 201
Processed Q&A for entry 202
Processed Q&A for entry 203
Processed Q&A for entry 204
Processed Q&A for entry 205
Processed Q&A for entry 206
Processed Q&A for entry 207
Processed Q&A for entry 208
Processed Q&A 



Processed Q&A for entry 234
Error for entry 235 (attempt 1): 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).
Failed to process entry 235 after 1 attempts. Skipping...
Processed Q&A for entry 236
Processed Q&A for entry 237
Processed Q&A for entry 238
Error for entry 239 (attempt 1): contents must not be empty
Failed to process entry 239 after 1 attempts. Skipping...
Error for entry 240 (attempt 1): 'text'
Failed to process entry 240 after 1 attempts. Skipping...
Processed Q&A for entry 241
Processed Q&A for entry 242
Processed Q&A for entry 243
Processed Q&A for entry 244
Processed Q&A for entry 245
Processed Q&A for entry 246
Processed Q&A for entry 247
Processed Q&A for entry 248
Processed Q&A for entry 249
Processed Q&A for entry 250
Processed Q&A for entry 251
Processed Q&A for entry 252
Processed Q&A for entry 253
Processed Q&A for entry 254
Processe



Processed Q&A for entry 256
Error for entry 257 (attempt 1): 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).


ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 180.39ms


Failed to process entry 257 after 1 attempts. Skipping...
Processed Q&A for entry 258
Processed Q&A for entry 259
Processed Q&A for entry 260
Processed Q&A for entry 261
Processed Q&A for entry 262
Processed Q&A for entry 263
Processed Q&A for entry 264
Processed Q&A for entry 265
Processed Q&A for entry 266
Processed Q&A for entry 267
Processed Q&A for entry 268
Processed Q&A for entry 269
Processed Q&A for entry 270
Processed Q&A for entry 271
Processed Q&A for entry 272
Processed Q&A for entry 273
Processed Q&A for entry 274
Processed Q&A for entry 275
Processed Q&A for entry 276
Processed Q&A for entry 277
Processed Q&A for entry 278
Processed Q&A for entry 279
On Chapter ContextualTags_MIT6390Notes_Chapter4, number of entries:  140
Processed Q&A for entry 280
Processed Q&A for entry 281
Processed Q&A for entry 282
Processed Q&A for entry 283




Processed Q&A for entry 284
Error for entry 285 (attempt 1): 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).
Failed to process entry 285 after 1 attempts. Skipping...
Processed Q&A for entry 286
Processed Q&A for entry 287
Processed Q&A for entry 288


ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 130.72ms


Processed Q&A for entry 289
Processed Q&A for entry 290
Processed Q&A for entry 291
Processed Q&A for entry 292
Processed Q&A for entry 293
Processed Q&A for entry 294
Processed Q&A for entry 295
Processed Q&A for entry 296
Processed Q&A for entry 297


ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 129.69ms


Processed Q&A for entry 298
Processed Q&A for entry 299
Processed Q&A for entry 300
Processed Q&A for entry 301
Error for entry 302 (attempt 1): 'text'
Failed to process entry 302 after 1 attempts. Skipping...
Processed Q&A for entry 303
Processed Q&A for entry 304
Processed Q&A for entry 305
Processed Q&A for entry 306
Processed Q&A for entry 307
Processed Q&A for entry 308
Error for entry 309 (attempt 1): 'text'
Failed to process entry 309 after 1 attempts. Skipping...
Processed Q&A for entry 310
Processed Q&A for entry 311
Processed Q&A for entry 312
Processed Q&A for entry 313
Processed Q&A for entry 314
Processed Q&A for entry 315
Processed Q&A for entry 316
Processed Q&A for entry 317
Processed Q&A for entry 318
Processed Q&A for entry 319
Processed Q&A for entry 320
Processed Q&A for entry 321
Processed Q&A for entry 322
Processed Q&A for entry 323
Processed Q&A for entry 324
Processed Q&A for entry 325




Processed Q&A for entry 326
Error for entry 327 (attempt 1): 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).
Failed to process entry 327 after 1 attempts. Skipping...
Error for entry 328 (attempt 1): 'text'
Failed to process entry 328 after 1 attempts. Skipping...
Processed Q&A for entry 329
Processed Q&A for entry 330
Error for entry 331 (attempt 1): 'text'
Failed to process entry 331 after 1 attempts. Skipping...
Processed Q&A for entry 332
Processed Q&A for entry 333
Processed Q&A for entry 334
Processed Q&A for entry 335
Processed Q&A for entry 336
Processed Q&A for entry 337
Error for entry 338 (attempt 1): 'text'
Failed to process entry 338 after 1 attempts. Skipping...
Processed Q&A for entry 339
Processed Q&A for entry 340
Processed Q&A for entry 341


ERROR:tornado.access:503 POST /v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 132.91ms


Processed Q&A for entry 342
Processed Q&A for entry 343
Processed Q&A for entry 344
Processed Q&A for entry 345
Processed Q&A for entry 346
Processed Q&A for entry 347
Processed Q&A for entry 348
Processed Q&A for entry 349
Processed Q&A for entry 350
Processed Q&A for entry 351
Processed Q&A for entry 352
Processed Q&A for entry 353
Processed Q&A for entry 354




Processed Q&A for entry 355
Error for entry 356 (attempt 1): 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).
Failed to process entry 356 after 1 attempts. Skipping...
Processed Q&A for entry 357
Processed Q&A for entry 358
Processed Q&A for entry 359
Processed Q&A for entry 360
Processed Q&A for entry 361
Processed Q&A for entry 362
Processed Q&A for entry 363
Processed Q&A for entry 364
Processed Q&A for entry 365
Processed Q&A for entry 366
Error for entry 367 (attempt 1): 'text'
Failed to process entry 367 after 1 attempts. Skipping...
Processed Q&A for entry 368
Error for entry 369 (attempt 1): 'text'
Failed to process entry 369 after 1 attempts. Skipping...
Processed Q&A for entry 370
Processed Q&A for entry 371
Processed Q&A for entry 372
Processed Q&A for entry 373
Processed Q&A for entry 374
Processed Q&A for entry 375
Processed Q&A for entry 376




Processed Q&A for entry 386
Error for entry 387 (attempt 1): 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).
Failed to process entry 387 after 1 attempts. Skipping...
Processed Q&A for entry 388
Processed Q&A for entry 389
Processed Q&A for entry 390
Processed Q&A for entry 391
Processed Q&A for entry 392
Processed Q&A for entry 393
Processed Q&A for entry 394
Processed Q&A for entry 395
Processed Q&A for entry 396
Processed Q&A for entry 397
Processed Q&A for entry 398
Processed Q&A for entry 399
Processed Q&A for entry 400
Processed Q&A for entry 401
Processed Q&A for entry 402
Processed Q&A for entry 403
Processed Q&A for entry 404
Processed Q&A for entry 405
Processed Q&A for entry 406
Processed Q&A for entry 407
Processed Q&A for entry 408
Processed Q&A for entry 409
Processed Q&A for entry 410
Processed Q&A for entry 411
Processed Q&A for entry 412




Processed Q&A for entry 420
Error for entry 421 (attempt 1): 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).
Failed to process entry 421 after 1 attempts. Skipping...
Processed Q&A for entry 422
Processed Q&A for entry 423
Processed Q&A for entry 424
Processed Q&A for entry 425
Processed Q&A for entry 426
Processed Q&A for entry 427
Processed Q&A for entry 428
Processed Q&A for entry 429
Processed Q&A for entry 430
Processed Q&A for entry 431
Error for entry 432 (attempt 1): 'text'
Failed to process entry 432 after 1 attempts. Skipping...
Processed Q&A for entry 433
Processed Q&A for entry 434
Processed Q&A for entry 435
Processed Q&A for entry 436
Processed Q&A for entry 437
Processed Q&A for entry 438
Processed Q&A for entry 439
Processed Q&A for entry 440
Processed Q&A for entry 441
Processed Q&A for entry 442
Processed Q&A for entry 443
Processed Q&A 



Processed Q&A for entry 512
Error for entry 513 (attempt 1): 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).
Failed to process entry 513 after 1 attempts. Skipping...
Processed Q&A for entry 514
Processed Q&A for entry 515
Processed Q&A for entry 516
Processed Q&A for entry 517
Processed Q&A for entry 518
Processed Q&A for entry 519
Processed Q&A for entry 520
Processed Q&A for entry 521
Processed Q&A for entry 522
Processed Q&A for entry 523
Processed Q&A for entry 524
Processed Q&A for entry 525
Processed Q&A for entry 526
Processed Q&A for entry 527
Processed Q&A for entry 528
Processed Q&A for entry 529
Processed Q&A for entry 530
Error for entry 531 (attempt 1): 'text'
Failed to process entry 531 after 1 attempts. Skipping...
Processed Q&A for entry 532
Processed Q&A for entry 533
Error for entry 534 (attempt 1): contents must not be empty
Failed to 



Processed Q&A for entry 549
Error for entry 550 (attempt 1): 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).
Failed to process entry 550 after 1 attempts. Skipping...
Processed Q&A for entry 551
Processed Q&A for entry 552
Processed Q&A for entry 553
Processed Q&A for entry 554
Processed Q&A for entry 555
Processed Q&A for entry 556
Processed Q&A for entry 557
Processed Q&A for entry 558
Processed Q&A for entry 559
Processed Q&A for entry 560
Processed Q&A for entry 561
Processed Q&A for entry 562
Processed Q&A for entry 563
Processed Q&A for entry 564
Processed Q&A for entry 565
Processed Q&A for entry 566
Processed Q&A for entry 567
Processed Q&A for entry 568
Processed Q&A for entry 569
Processed Q&A for entry 570
Processed Q&A for entry 571
Processed Q&A for entry 572




Processed Q&A for entry 573
Error for entry 574 (attempt 1): 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).
Failed to process entry 574 after 1 attempts. Skipping...
Processed Q&A for entry 575
Processed Q&A for entry 576
Processed Q&A for entry 577
Processed Q&A for entry 578
Processed Q&A for entry 579
Processed Q&A for entry 580
Processed Q&A for entry 581
Processed Q&A for entry 582
Error for entry 583 (attempt 1): contents must not be empty
Failed to process entry 583 after 1 attempts. Skipping...
Processed Q&A for entry 584
Processed Q&A for entry 585
Processed Q&A for entry 586
Processed Q&A for entry 587
Processed Q&A for entry 588
Processed Q&A for entry 589
Processed Q&A for entry 590
Processed Q&A for entry 591
Processed Q&A for entry 592
Processed Q&A for entry 593
On Chapter ContextualTags_MIT6390Notes_Chapter6, number of entries:  68
Proces



Processed Q&A for entry 605
Error for entry 606 (attempt 1): 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).
Failed to process entry 606 after 1 attempts. Skipping...
Processed Q&A for entry 607
Processed Q&A for entry 608
Processed Q&A for entry 609
Processed Q&A for entry 610
Processed Q&A for entry 611
Processed Q&A for entry 612
Processed Q&A for entry 613
Error for entry 614 (attempt 1): 'text'
Failed to process entry 614 after 1 attempts. Skipping...
Processed Q&A for entry 615
Error for entry 616 (attempt 1): 'text'
Failed to process entry 616 after 1 attempts. Skipping...
Processed Q&A for entry 617
Processed Q&A for entry 618
Processed Q&A for entry 619
Processed Q&A for entry 620
Processed Q&A for entry 621
Error for entry 622 (attempt 1): 'text'
Failed to process entry 622 after 1 attempts. Skipping...
Processed Q&A for entry 623
Error for entr

KeyboardInterrupt: 

In [8]:
qa_pairs_training[0:5]

[{'question': 'What is the primary goal of machine learning (ML)?',
  'answer': 'The primary goal of machine learning is to make decisions or predictions based on data.',
  'tags': ['introduction']},
 {'question': 'How does the focus of machine learning differ from fields like economics, psychology, and statistics?',
  'answer': 'While economics, psychology, and statistics aim to discover underlying causal processes or find well-fitting models, machine learning focuses on using models to make good predictions or decisions.  The model is a means to an end, not the end itself.',
  'tags': ['introduction']},
 {'question': 'What is considered the end product in fields like economics and psychology, in contrast to machine learning?',
  'answer': 'In economics and psychology, the end product is a model that explains underlying causal processes.  In machine learning, the model is a tool used to achieve the goal of making predictions or decisions.',
  'tags': ['introduction']},
 {'question': '

In [9]:
len(qa_pairs_training)

1551

In [10]:
len(qa_pairs_validation)

517

In [11]:
qa_pairs_validation[0:5]

[{'question': 'Does machine learning involve fitting models?  If so, what is their purpose?',
  'answer': 'Yes, machine learning often involves fitting models, but these models are used as a means to make accurate predictions or effective decisions, rather than being the primary focus of the study.',
  'tags': ['introduction']},
 {'question': 'What is the main point the text makes about the role of machine learning in modern applications?',
  'answer': 'The text argues that machine learning has become the best approach for many applications due to its improvements in capability and scope, offering speed, efficiency, and robust solutions.',
  'tags': ['introduction']},
 {'question': 'What are some of the human-driven tasks involved in the machine learning process besides selecting an algorithm?',
  'answer': 'Human tasks include problem framing, data acquisition and organization, designing possible solutions, validating results, and assessing the impact on people.',
  'tags': ['introduc

In [12]:
# Save to JSON files
output_folder = "/content/drive/MyDrive/DeepLearning_FinalProject/Final Report Content/GeminiPrompt_QuestionGeneration_and_Training_Validation_Split"  # Ensure this folder exists or change it as needed

with open(output_folder + "question_answer_pair_training_data.json", "w") as train_file:
    json.dump(qa_pairs_training, train_file, indent=4)

with open(output_folder + "question_answer_pair_validation_data.json", "w") as val_file:
    json.dump(qa_pairs_validation, val_file, indent=4)

print(f"Saved {len(qa_pairs_training)} training samples to training_data.json")
print(f"Saved {len(qa_pairs_validation)} validation samples to validation_data.json")


Saved 1551 training samples to training_data.json
Saved 517 validation samples to validation_data.json


In [13]:
print(entry)

{'type': 'text', 'text': 'Consider a max pooling layer where both the strides and $\\mathtt{k}$ are set to be 2. This would map a $64\\times64\\times3$ image to a $32\\times32\\times3$ image. Note that max pooling layers do not have additional bias or offset values. ', 'page_idx': 4, 'tags': ['max_pooling']}
