In [1]:
!pip install google-generativeai
!pip install google-cloud-aiplatform



In [2]:
import google.cloud.aiplatform as aiplatform

In [3]:
import google.generativeai as genai
from google.colab import userdata

gemini_key = userdata.get('gemini_key')

genai.configure(api_key=gemini_key)

In [4]:
# configuration du modèle
generation_config = {
  "temperature": 0.9,
  "top_p": 1,
  "top_k": 5,
  "max_output_tokens": 2048,
}

In [5]:
# appel au modèle generative avec la configuration déjà fixée
model = genai.GenerativeModel(model_name= "gemini-1.0-pro",
                              generation_config = generation_config)


In [6]:
import pandas as pd
df=pd.read_csv("/content/DatasetGen.csv")
df.head()

Unnamed: 0,Question,Option A,Option B,Option C,Option D,GT
0,Which organization developed the MITRE ATT&CK ...,Microsoft,The MITRE Corporation (Correct Answer),Google,IBM,B
1,Which organization is responsible for maintain...,NSA (National Security Agency),FBI (Federal Bureau of Investigation),MITRE Corporation,DHS (Department of Homeland Security),C
2,Which section of the document provides details...,3.4.2 Technique and Sub-Technique Object Struc...,3.5 Groups,3.7 Mitigations,3.8 ATT&CK Object Model Relationships,D
3,Which section of the document discusses the Ad...,3.9.1.5 Deprecation,4.1 Conceptual,4.2 Tactics,4.3 Techniques and Sub-Techniques,B
4,Which section of the document outlines the pro...,4.3.1.4 Adversary Use,4.3.3 Enhancing Existing Techniques,4.3.6 Examples of Applying the Methodology for...,4.3.4 Named Adversary Groups Using Techniques,C


In [7]:
def get_model_response(question, options):
    prompt = (
        f"You are given a multiple-choice question (MCQ) from a Cyber Threat Intelligence (CTI) knowledge benchmark dataset.\n"
        f"Your task is to choose the best option among the four provided.\n\n"
        f"Question: {question}\n"
        f"A. {options[0]}\n"
        f"B. {options[1]}\n"
        f"C. {options[2]}\n"
        f"D. {options[3]}\n\n"
        f"Return your answer as a single uppercase letter: A, B, C, or D."
    )
    max_retries = 5
    backoff_time = 10  # initial backoff time in seconds

    for attempt in range(max_retries):
        try:
            # Generate the response
            response = model.generate_content(prompt)  # Replace with actual API call

            # Log the full response for debugging
            print(f"Response object: {response}")

            # Check if response.parts exists and is not empty
            if not hasattr(response, 'parts') or not response.parts:
                raise ValueError("No valid response parts found.")

            # Extract the text from the response object
            text = response.parts[0].text
            answer = text.strip()[-1].upper()  # Apply strip() to the extracted text
            return answer
        except TooManyRequests:
            if attempt < max_retries - 1:
                sleep_time = backoff_time * (2 ** attempt)  # Exponential backoff
                print(f"Rate limit hit. Retrying in {sleep_time} seconds...")
                time.sleep(sleep_time)
            else:
                raise
        except Exception as e:
            print(f"An error occurred: {e}")
            raise

In [9]:
sampled_df = df.sample(n=10, random_state=42)

# Initialize an empty list to store the output data
output_data = []

# Iterate over the sampled DataFrame rows
for index, row in sampled_df.iterrows():
    question = row['Question']
    options = [
        row['Option A'],
        row['Option B'],
        row['Option C'],
        row['Option D']
    ]

    # Get the model response for the current question
    try:
        response_text = get_model_response(question, options)
        print(f"Question {index + 1}: {response_text}")

        # Append the question and response to the output_data list
        output_data.append({
            'Question': question,
            'Option A': options[0],
            'Option B': options[1],
            'Option C': options[2],
            'Option D': options[3],
            'Geminai': response_text
        })
    except Exception as e:
        print(f"Failed to get a response for question {index + 1}: {e}")
        continue

# Create a new DataFrame from the output_data
output_df = pd.DataFrame(output_data)

# Save the new DataFrame to a CSV file
output_df.to_csv('geminai.csv', index=False)

Response object: response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "C"
              }
            ],
            "role": "model"
          },
          "finish_reason": "STOP",
          "index": 0,
          "safety_ratings": [
            {
              "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
              "probability": "NEGLIGIBLE"
            },
            {
              "category": "HARM_CATEGORY_HATE_SPEECH",
              "probability": "NEGLIGIBLE"
            },
            {
              "category": "HARM_CATEGORY_HARASSMENT",
              "probability": "NEGLIGIBLE"
            },
            {
              "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
              "probability": "NEGLIGIBLE"
            }
          ]
        }
      ],
      "usage_metadata": {
        "prompt_to

In [10]:
answer=pd.read_csv("/content/geminai.csv")
answer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  10 non-null     object
 1   Option A  10 non-null     object
 2   Option B  10 non-null     object
 3   Option C  10 non-null     object
 4   Option D  10 non-null     object
 5   Geminai   10 non-null     object
dtypes: object(6)
memory usage: 608.0+ bytes


In [11]:
df_merged = pd.merge(df, answer, on='Question', how='inner')
df_merged.head()

Unnamed: 0,Question,Option A_x,Option B_x,Option C_x,Option D_x,GT,Option A_y,Option B_y,Option C_y,Option D_y,Geminai
0,Which of the following factors should a contro...,The size of the organization,Whether the data subject has previously given ...,The possible consequences of the intended furt...,The number of data processing operations condu...,C,The size of the organization,Whether the data subject has previously given ...,The possible consequences of the intended furt...,The number of data processing operations condu...,C
1,Which of the following is a challenge to infor...,Establishing Trust,Achieving Interoperability and Automation,Safeguarding Sensitive Information,All of the above,D,Establishing Trust,Achieving Interoperability and Automation,Safeguarding Sensitive Information,All of the above,D
2,Which of the following is a challenge to infor...,Establishing trust,Achieving agility,Safeguarding public information,Building economies of scale,A,Establishing Trust,Achieving Interoperability and Automation,Safeguarding Sensitive Information,All of the above,D
3,What is one of the key benefits of data fusion...,Identifying random and isolated cyber attacks,Revealing coordinated actions and campaigns by...,Preventing all types of malware attacks,Sharing data only within individual organizations,B,Identifying random and isolated cyber attacks,Revealing coordinated actions and campaigns by...,Preventing all types of malware attacks,Sharing data only within individual organizations,B
4,Which factor is NOT considered when including ...,Objective,Actions,Target,Requirements,C,Objective,Actions,Target,Requirements,D


In [12]:
def compute_mcq_accuracy(df):
    correct = 0
    total = 0
    for idx, row in df.iterrows():
        pred = row['Geminai']
        gt = row['GT']
        if pred in ['A', 'B', 'C', 'D', 'X']:
            total += 1
        else:
            print('Invalid response at row {}'.format(idx+1))
        if pred == gt:
            correct += 1
    return correct/total*100

In [13]:
print('Accuracy:', compute_mcq_accuracy(df_merged))

Accuracy: 81.81818181818183


## Génération avec flux
Par défaut, le modèle renvoie une réponse à la fin du processus de génération. Vous pouvez également diffuser la réponse en flux continu au fur et à mesure de sa génération. Le modèle renvoie alors des fragments de la réponse dès qu'ils sont générés.