In [1]:
from helpers.models import Models
from helpers.llm_client import LLMClient
from helpers.parsers import license_parser
import pandas as pd
import os
import nirjas

# Lift pandas' row-display limit so displayed frames are not truncated vertically.
pd.options.display.max_rows = None
# Optional: also disable column-width truncation.
# pd.set_option('display.max_colwidth', None)

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Fail fast if any provider API key is absent from the environment.
# All missing names are collected first so a single error lists every key
# that still has to be exported, instead of stopping at the first one.
REQUIRED_API_KEYS = ('GROQ_API_KEY', 'NVIDIA_API_KEY', 'TOGETHER_API_KEY')
missing_api_keys = [key_name for key_name in REQUIRED_API_KEYS if key_name not in os.environ]
if missing_api_keys:
    # KeyError is kept as the exception type: it is what a direct
    # os.environ[...] lookup raises for an absent key.
    raise KeyError(
        f"Missing required environment variable(s): {', '.join(missing_api_keys)}"
    )
'OK'

'OK'

In [11]:
def prompt_function(text):
    """Build the detailed license-identification prompt.

    The prompt describes the four-element metadata tuples, lists seven
    guidelines and asks for the answer as ``Licenses = [...]`` and
    ``SPDX-IDs = [...]`` lists.

    Args:
        text: Value inserted verbatim (default string formatting) under the
            ``[Text and Metadata]`` heading.

    Returns:
        The complete prompt string.
    """
    template = """
    [Task]
    You are provided with text extracted from a file, along with potential license matches identified by a semantic search tool.
    Your task is to carefully analyze the provided text and metadata to determine the actual software license(s) present in the original file.
    Out of the 10 provided lines, not all matches will be correct or relevant, so focus on the most relevant lines in your analysis.

    [Metadata Explanation]
    The metadata provided for each line is a tuple containing four elements:
        * **Line:** The actual line of text extracted from the file.
        * **Potential License Match:** The name of a license that the semantic search tool believes the line might belong to.
        * **License ID:** The SPDX identifier of the potential license match.
        * **Matched License Text:** The specific text within the potential license that the line was matched to.

    [Guidelines]
    1. **License Identification:** If a license is found, clearly state its name and its corresponding SPDX identifier (e.g., MIT License, SPDX-License-Identifier: MIT). If multiple licenses are found, list them all.
    2. **Evidence and Reasoning (Focus on Relevance and Clarity):** 
        * For each identified license, extract the specific text snippet(s) from the provided text that confirm its presence. Include surrounding context if it helps clarify the license's applicability. Prioritize the most relevant lines of text.
        * Explain why the identified license is the most likely match, taking into account the potential license matches and the matched license text provided in the metadata.
        * Only consider matches that are clear and obviously correct. The semantic search tool will always attempt to match lines to licenses, but these matches are not always accurate.
    3. **Override Semantic Search:** If the semantic search tool's suggested match seems incorrect, feel free to disregard it and rely on your own knowledge and analysis to determine the correct license. Provide a clear explanation of why you chose a different license.
    4. **Exclude Irrelevant Information:**
        * Disregard copyright notices and statements and lines of code as they do not indicate the software license.
        * Focus only on text that is found in licenses or clearly identifies licenses.
    5. **No License Scenario:** If no license is detected in the text, explicitly state "No software license found."
    6. **Ambiguity:** If the license cannot be confidently determined due to ambiguity or conflicting information, clearly state this and provide an explanation.
    7. **Response Format:** Provide the results in the following format:
        *   **Licenses = [list of identified licenses]** 
        *   **SPDX-IDs = [list of corresponding SPDX identifiers]**

        If no licenses are found, both lists should be empty:

        *   **Licenses = []**
        *   **SPDX-IDs = []** 

    [Text and Metadata]
    {text}
    """
    # The template has exactly one replacement field; braces inside the
    # supplied value are not re-interpreted by str.format.
    return template.format(text=text)


In [12]:
def prompt_function_1(text):
    """Build the short license-identification prompt.

    A condensed variant: one-line task, a brief note that suggested matches
    may be wrong, six terse guidelines and the ``Licenses`` / ``SPDX-IDs``
    response format.

    Args:
        text: Value inserted verbatim (default string formatting) under the
            ``[Text and Metadata]`` heading.

    Returns:
        The complete prompt string.
    """
    template = """
    [Task]
    Identify the software license(s) present in the following text.

    [Metadata]
    Each line includes potential license matches. These are not always accurate.

    [Guidelines]
    1.  **Focus on Relevance:** Prioritize the most relevant lines and matches.
    2.  **Override Matches:** Disregard incorrect suggestions.
    3.  **Ignore Irrelevant Information:** Disregard copyright notices and code.
    4.  **No License:** If no license is found, state "No software license found."
    5.  **Ambiguity:** If uncertain, explain the ambiguity.
    6.  **Response Format:**
        *   **Licenses = [list of identified licenses]**
        *   **SPDX-IDs = [list of corresponding SPDX identifiers]**

    [Text and Metadata]
    {text}
    """
    # Single replacement field; the supplied value is inserted as-is.
    return template.format(text=text)

In [13]:
def prompt_function_2(text):
    """Build the license-identification prompt that stresses license-specific text.

    Compared with the short variant, the guidelines spell out which lines
    count as license-specific and which categories of text to ignore.

    Args:
        text: Value inserted verbatim (default string formatting) under the
            ``[Text and Metadata]`` heading.

    Returns:
        The complete prompt string.
    """
    template = """
    [Task]
    Identify the software license(s) present in the following text.

    [Metadata]
    Each line includes potential license matches. These are not always accurate.

    [Guidelines]
    1.  **Focus on License-Specific Text:** Consider only lines that:
        *   Explicitly mention license terms (e.g., "SPDX-License-Identifier," "License," "copyright," "distribution," "modification").
        *   Directly quote from known license texts (e.g., "Permission is hereby granted...").

    2.  **Ignore Irrelevant Information:** 
        *   Disregard copyright notices, code snippets, comments, and general documentation.
        *   If a match seems unrelated to licensing, ignore it.
    
    3.  **No License:** If no license is found, state "No software license found."
    4.  **Ambiguity:** If uncertain, explain the ambiguity.
    5.  **Response Format:**
        *   **Licenses = [list of identified licenses]**
        *   **SPDX-IDs = [list of corresponding SPDX identifiers]**

    [Text and Metadata]
    {text}
    """
    # Single replacement field; the supplied value is inserted as-is.
    return template.format(text=text)


In [14]:
def prompt_function_3(text):
    """Build the license-identification prompt with worked examples.

    Same structure as the license-specific variant, plus example lines that
    should be treated as relevant and example lines that should be ignored.

    Args:
        text: Value inserted verbatim (default string formatting) under the
            ``[Text and Metadata]`` heading.

    Returns:
        The complete prompt string.
    """
    template = """
    [Task]
    Identify the software license(s) present in the following text.

    [Metadata]
    Each line includes potential license matches. These are not always accurate.

    [Guidelines]
    1. **Focus on License-Specific Text:** Consider only lines that:
        * Explicitly mention license terms (e.g., "SPDX-License-Identifier," "License," "distribution," "modification").
        * Directly quote from known license texts (e.g., "Permission is hereby granted...").
        * Include specific license references or titles.

        **Examples of relevant lines:**
        * "SPDX-License-Identifier: GPL-2.0-only"
        * "This software is licensed under the MIT License."
        * "Permission is hereby granted, free of charge, to any person obtaining a copy of this software..."

    2. **Ignore Irrelevant Information:** 
        * Disregard copyright notices, code snippets, comments, and general documentation.
        * If a match seems unrelated to licensing, ignore it.

        **Examples of irrelevant lines:**
        * "Setup of a region of guest memory for the MSR bitmap."
        * "Allocate memory regions for nested VMX tests."
        * "tools/testing/selftests/kvm/lib/x86_64/vmx.c** Copyright (C) 2018, Google LLC."

    3. **No License:** If no license is found, state "No software license found."
    4. **Ambiguity:** If uncertain, explain the ambiguity.
    5. **Response Format:**
        * **Licenses = [list of identified licenses]**
        * **SPDX-IDs = [list of corresponding SPDX identifiers]**

    [Text and Metadata]
    {text}
    """
    # Single replacement field; the supplied value is inserted as-is.
    return template.format(text=text)


In [15]:
def prompt_function_4(text):
    """Build the line-selection prompt (four-element metadata tuples).

    Asks the model to first pick the lines relevant to software licensing and
    then report ``Relevant Lines``, ``Licenses`` and ``SPDX-IDs``; includes
    example relevant and irrelevant lines.

    Args:
        text: Value inserted verbatim (default string formatting) under the
            ``[Text and Metadata]`` heading.

    Returns:
        The complete prompt string.
    """
    template = """
    [Task]
    From the following lines, select those that are relevant to software licensing and ignore the rest.

    [Metadata]
    The metadata provided for each line is a tuple containing four elements:
        * **Line:** The actual line of text extracted from the file.
        * **Potential License Match:** The name of a license that the semantic search tool believes the line might belong to.
        * **License ID:** The SPDX identifier of the potential license match.
        * **Matched License Text:** The specific text within the potential license that the line was matched to.

    [Guidelines]
    1. **Select License-Specific Lines:** Choose only lines that:
        * Explicitly mention license terms (e.g., "SPDX-License-Identifier," "License," "distribution," "modification").
        * Directly quote from known license texts (e.g., "Permission is hereby granted...").
        * Include specific license references or titles.

        **Examples of relevant lines:**
        * "SPDX-License-Identifier: GPL-2.0-only"
        * "This software is licensed under the MIT License."
        * "Permission is hereby granted, free of charge, to any person obtaining a copy of this software..."

    2. **Ignore Irrelevant Lines:** 
        * Disregard copyright notices, code snippets, comments, and general documentation.
        * Ignore lines that do not specifically reference or include license terms.

        **Examples of irrelevant lines:**
        * "Setup of a region of guest memory for the MSR bitmap."
        * "Allocate memory regions for nested VMX tests."
        * "tools/testing/selftests/kvm/lib/x86_64/vmx.c** Copyright (C) 2018, Google LLC."

    3. **No License:** If no license is found, state "No software license found."
    4. **Ambiguity:** If uncertain, explain the ambiguity.
    5. **Response Format:**
        * **Relevant Lines = [list of relevant lines]**
        * **Licenses = [list of identified licenses from relevant lines]**
        * **SPDX-IDs = [list of corresponding SPDX identifiers from relevant lines]**

    [Text and Metadata]
    {text}
    """
    # Single replacement field; the supplied value is inserted as-is.
    return template.format(text=text)


In [22]:
def prompt_function_5(text):
    """Build the tuple-selection prompt (three-element tuples).

    Asks the model to pick the tuples whose line is relevant to software
    licensing and to report ``Relevant Lines``, ``Licenses`` and ``SPDX-IDs``.
    Unlike the four-element variant, no example lines are included.

    Args:
        text: Value inserted verbatim (default string formatting) under the
            ``[Text and Metadata]`` heading.

    Returns:
        The complete prompt string.
    """
    template = """
    [Task]
    From the following tuples, select those that are relevant to software licensing and ignore the rest.
    A relevant tuple is a tuple that contains a line of text that is relevant and can be used to identify a license.

    [Tuples]
    Each tuple consists of three elements:
        1. **Line:** The actual line of text extracted from the file. This is the element you need to evaluate for relevance to software licensing.
        2. **Potential License Match:** The name of a license that the semantic search tool suggests the line might belong to (provided for reference).
        3. **License ID:** The SPDX identifier of the potential license match (provided for reference).
        
    [Guidelines]
    1. **Select License-Specific Lines:** Choose only lines that:
        * Explicitly mention license terms 
        * Directly quote from known license texts
        * Include specific license references or titles.

    2. **Ignore Irrelevant Lines:** 
        * Disregard lines that do not explicitly mention license terms.
        * Ignore copyright notices, code snippets, comments, and general documentation.
        * Ignore code documentation lines that seem to be documenting code or just general instructions or information.
        * Do not select lines that are general descriptions, code, or comments unrelated to license terms.

    3. **No License:** If no license is found, state "No software license found."
    4. **Ambiguity:** If uncertain, explain the ambiguity.
    5. **Response Format:**
        * **Relevant Lines = [list of relevant lines]**
        * **Licenses = [list of identified licenses from relevant lines]**
        * **SPDX-IDs = [list of corresponding SPDX identifiers from relevant lines]**

    [Text and Metadata]
    {text}
    """
    # Single replacement field; the supplied value is inserted as-is.
    return template.format(text=text)


In [17]:
# Load the labeled dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('extras/dataset-labeled-1-75.csv')
# Project LLM client whose process_dataset method the model-run cells call.
client = LLMClient()

In [None]:
# Mistral 7B run with prompt_function_1 on rows labelled 0 through 19
# (.loc label slices include both endpoints). The keyword arguments are
# passed straight to the project helper client.process_dataset.
sampled_data_mistral_7b = client.process_dataset(
    df.loc[0:19],
    df_path='dataset-labeled-1-75.csv',
    model=Models.MISTRAL_7b,
    prompt_function=prompt_function_1,
    parser=license_parser,
    extra_file_path='extras',
    log_every=5,
)

In [32]:
# Gemma 2 9B run with prompt_function_5 on rows labelled 0 through 4
# (.loc label slices include both endpoints). The keyword arguments are
# passed straight to the project helper client.process_dataset.
sampled_data_gemma_2_9b = client.process_dataset(
    df.loc[0:4],
    df_path='dataset-labeled-1-75.csv',
    model=Models.GEMMA_2_9b,
    prompt_function=prompt_function_5,
    parser=license_parser,
    extra_file_path='extras',
    log_every=5,
    output_name='test',
)

Loading pre-embedded licenses from: /home/jimbo/Desktop/GSoC24/repo/GSoC24/extras/license_information/license_embeddings/768_all-mpnet-base-v2-license-embedding.pkl


[32m2024-07-04 13:21:19.608[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m226[0m - [1mProcessing index: 0[0m
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [34]:
# Print the 'response' value of the row labelled 1 from the Gemma run;
# print() keeps the text's line breaks readable instead of showing a repr.
print(sampled_data_gemma_2_9b.loc[1, 'response'])

**Relevant Lines = [('SPDX-License-Identifier: GPL-2.0-only',), ('Setup of a region of guest memory for the vmxon region.',), ('Setup of a region of guest memory for the shadow VMCS.',), ('Allocate memory regions for nested VMX tests.',), ('Load a VMCS.',)]**

**Licenses = ['GNU General Public License v2.0 only', 'PolyForm Noncommercial License 1.0.0', 'Affero General Public License v1.0 or later', 'mailprio License']**

**SPDX-IDs = ['GPL-2.0-only', 'PolyForm-Noncommercial-1.0.0', 'AGPL-1.0-or-later', 'mailprio']** 


Let me explain the selections:

* **('SPDX-License-Identifier: GPL-2.0-only',)**: This line explicitly states the license identifier, making it highly relevant.
* **('Setup of a region of guest memory for the vmxon region.',)**:  While not directly mentioning a license, this line appears within a context where license information is likely to be found (e.g., within a file dedicated to license details).
* **('Setup of a region of guest memory for the shadow VMCS.',)**: Si

In [None]:
# Phi-3 mini run with prompt_function_1 on rows labelled 0 through 19
# (.loc label slices include both endpoints). The keyword arguments are
# passed straight to the project helper client.process_dataset.
sampled_data_phi_3_mini = client.process_dataset(
    df.loc[0:19],
    df_path='dataset-labeled-1-75.csv',
    model=Models.PHI_3_MINI,
    prompt_function=prompt_function_1,
    parser=license_parser,
    extra_file_path='extras',
    log_every=5,
)

In [None]:
# Phi-3 small run with prompt_function_1 on rows labelled 0 through 19
# (.loc label slices include both endpoints). The keyword arguments are
# passed straight to the project helper client.process_dataset.
sampled_data_phi_3_small = client.process_dataset(
    df.loc[0:19],
    df_path='dataset-labeled-1-75.csv',
    model=Models.PHI_3_SMALL,
    prompt_function=prompt_function_1,
    parser=license_parser,
    extra_file_path='extras',
    log_every=5,
)

In [28]:
# Llama 3 8B run with prompt_function_5 on rows labelled 0 through 4
# (.loc label slices include both endpoints). The keyword arguments are
# passed straight to the project helper client.process_dataset.
sampled_data_llama_3_8b = client.process_dataset(
    df.loc[0:4],
    df_path='dataset-labeled-1-75.csv',
    model=Models.LLAMA_3_8b,
    prompt_function=prompt_function_5,
    parser=license_parser,
    extra_file_path='extras',
    log_every=5,
    output_name='test',
)

Loading pre-embedded licenses from: /home/jimbo/Desktop/GSoC24/repo/GSoC24/extras/license_information/license_embeddings/768_all-mpnet-base-v2-license-embedding.pkl


[32m2024-07-04 13:18:23.762[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m226[0m - [1mProcessing index: 0[0m


In [35]:
# Print the 'response' value of the row labelled 1 from the Llama run;
# print() keeps the text's line breaks readable instead of showing a repr.
print(sampled_data_llama_3_8b.loc[1, 'response'])

**Relevant Lines = [('SPDX-License-Identifier: GPL-2.0-only',), ('Setup of a region of guest memory for the vmxon region.',), ('Setup of a region of guest memory for the shadow VMCS.',), ('Allocate memory regions for nested VMX tests.',), ('Load a VMCS.',)]**

**Licenses = ['GNU General Public License v2.0 only', 'PolyForm Noncommercial License 1.0.0', 'Affero General Public License v1.0 or later', 'mailprio License']**

**SPDX-IDs = ['GPL-2.0-only', 'PolyForm-Noncommercial-1.0.0', 'AGPL-1.0-or-later', 'mailprio']** 


Let me explain the selections:

* **('SPDX-License-Identifier: GPL-2.0-only',)**: This line explicitly states the license identifier, making it highly relevant.
* **('Setup of a region of guest memory for the vmxon region.',)**:  While not directly mentioning a license, this line appears within a context where license information is likely to be found (e.g., within a file dedicated to license details).
* **('Setup of a region of guest memory for the shadow VMCS.',)**: Si

In [27]:
# Print the 'file_comments' value of the same row (label 1) from the Llama
# result frame, for comparison with the response printed above.
print(sampled_data_llama_3_8b.loc[1, 'file_comments'])

SPDX-License-Identifier: GPL-2.0-only
tools/testing/selftests/kvm/lib/x86_64/vmx.c** Copyright (C) 2018, Google LLC.
KVM should return supported EVMCS version range
Allocate memory regions for nested VMX tests.** Input Args:*   vm - The VM to allocate guest-virtual addresses in.** Output Args:*   p_vmx_gva - The guest virtual address for the struct vmx_pages.** Return:*   Pointer to structure with the addresses of the VMX areas.
Setup of a region of guest memory for the vmxon region.
Setup of a region of guest memory for a vmcs.
Setup of a region of guest memory for the MSR bitmap.
Setup of a region of guest memory for the shadow VMCS.
Setup of a region of guest memory for the VMREAD and VMWRITE bitmaps.
Ensure bits in CR0 and CR4 are valid in VMX operation:* - Bit X is 1 in _FIXED0: bit X is fixed to 1 in CRx.* - Bit X is 0 in _FIXED1: bit X is fixed to 0 in CRx.
Enable VMX operation
Configure IA32_FEATURE_CONTROL MSR to allow VMXON:*  Bit 0: Lock bit. If clear, VMXON causes a #GP.*  

In [None]:
# Display the 'file_comments' value of the row labelled 1 from the source
# dataset frame (shown as the cell's output, i.e. as a repr).
df.loc[1, 'file_comments']

In [None]:
# Print the 'response' value of the row labelled 1 from the Mistral run;
# print() keeps the text's line breaks readable instead of showing a repr.
print(sampled_data_mistral_7b.loc[1, 'response'])

In [None]:
# Give each model's 'response' column a model-specific name so the three
# result frames can sit side by side after merging.
# rename() ignores missing labels by default, so re-running this cell is harmless.
sampled_data_mistral_7b = sampled_data_mistral_7b.rename(columns={'response': 'mistral_response'})
sampled_data_llama_3_8b = sampled_data_llama_3_8b.rename(columns={'response': 'llama_response'})
# The Gemma results in this notebook are stored in sampled_data_gemma_2_9b;
# the previously referenced sampled_data_gemma_1_7b is never defined and
# raised NameError on a fresh kernel.
sampled_data_gemma_2_9b = sampled_data_gemma_2_9b.rename(columns={'response': 'gemma_response'})

# Default inner joins on 'file path': only files present in all three runs are kept.
# NOTE(review): the Mistral run uses a different row slice and prompt function
# than the Llama/Gemma runs — confirm the comparison is intended.
merged_df = (
    pd.merge(sampled_data_llama_3_8b, sampled_data_mistral_7b, on='file path')
    .merge(sampled_data_gemma_2_9b, on='file path')
)
# 'scan results_x' is the Llama-side copy: pandas appends _x/_y to column
# names that clash in the first merge.
merged_df[['scan results_x', 'llama_response', 'mistral_response', 'gemma_response']]