In [1]:
from helpers.models import Models
from helpers.llm_client import LLMClient
from helpers.parsers import license_parser
import pandas as pd
import os
import nirjas

# Notebook-wide pandas display configuration.
pd.set_option('display.max_rows', None)    # None lifts the row limit, so DataFrames are rendered with all rows
# pd.set_option('display.max_colwidth', None)  # Uncomment to stop long cell values (e.g. model responses) being truncated

  from tqdm.autonotebook import tqdm, trange


### API Key Validation

In [2]:
# Check that every API key this notebook expects is present before any request is made.
# All missing (or empty) variables are reported together, instead of stopping at the
# first failing os.environ[...] lookup with no hint about the remaining keys.
REQUIRED_API_KEYS = ('GROQ_API_KEY', 'NVIDIA_API_KEY', 'TOGETHER_API_KEY')
missing_api_keys = [name for name in REQUIRED_API_KEYS if not os.environ.get(name)]
if missing_api_keys:
    # KeyError keeps the exception type raised by the original bare lookups.
    raise KeyError(
        "Missing or empty environment variable(s): " + ", ".join(missing_api_keys)
        + ". Export them in your shell (e.g. in ~/.bashrc) and restart Jupyter."
    )
'OK'

'OK'

In [3]:
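# If any key is missing, add the lines below to ~/.bashrc (e.g. `gedit ~/.bashrc`),
# then run `source ~/.bashrc` and restart Jupyter so the kernel picks up the new variables.
# ! Add your own API keys here; all three providers currently offer some free access, with Groq being the most generous.
# Note: bash does not allow spaces around `=` in an export statement.
# export GROQ_API_KEY=""
# export NVIDIA_API_KEY=""
# export TOGETHER_API_KEY=""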

In [5]:
def prompt_function(text):
    """
    Build the license-detection prompt for one piece of text.

    Args:
        text: The text to analyse; it is interpolated verbatim under the
            ``[Text]`` section at the end of the prompt.

    Returns:
        The prompt string. It asks the model to report its findings as
        ``Licenses = [...]`` and ``SPDX-IDs = [...]`` lists, with both lists
        empty when no license is found.
    """
    return f"""
    [Task]
    Carefully analyze the provided text to determine if it contains any software licenses.

    [Guidelines]
    1.  **License Identification:** If a license is found, clearly state its name and its corresponding SPDX identifier (e.g., MIT License, SPDX-License-Identifier: MIT).
    2.  **Evidence Extraction:** For each identified license, extract the specific text snippet(s) from the provided text that confirm its presence. Include surrounding context if it helps clarify the license's applicability.
    3.  **No License Scenario:** If no license is detected in the text, explicitly state "No software license found."
    4.  **Response Format:** Provide the results in the following format:
        *   **Licenses = [list of identified licenses]** 
        *   **SPDX-IDs = [list of corresponding SPDX identifiers]**

        If no licenses are found, both lists should be empty:

        *   **Licenses = []**
        *   **SPDX-IDs = []** 

    [Text]
    {text}
    """

In [6]:
# Single client instance reused by every process_dataset call in this notebook.
client = LLMClient()

In [8]:
# Load the scan table; later cells rely on its 'file path' and 'scan results' columns.
df = pd.read_csv('extras/pytorch-main.csv')
# Disabled loader: reads every file listed in 'file path' into a 'file text' column,
# silently skips files that cannot be opened, then drops the rows left without text.
# df['file text'] = None
# for index, row in df.iterrows():
#     file_path = row['file path']
#     try:
#         with open(file_path, 'r') as file:
#             content = file.read()
#             df.loc[index, 'file text'] = content
#     except:
#         pass
# df = df[df['file text'].notna()]
def sample_per_label(df, label_column, max_samples=5, random_state=None):
    """
    Draw up to ``max_samples`` random rows for every distinct value of ``label_column``.

    Args:
        df: DataFrame to sample from; it is not modified.
        label_column: Name of the column whose values define the groups.
        max_samples: Upper bound on the rows kept per group; groups with fewer
            rows are kept whole.
        random_state: Optional seed (or NumPy random state) forwarded to
            ``DataFrame.sample`` for each group. Pass an int to make the sample
            reproducible; the default ``None`` keeps the draw random.

    Returns:
        A new DataFrame with the same columns as ``df``, groups ordered by label
        value, and a fresh 0..n-1 index. Rows whose label is NaN are left out
        because ``groupby`` drops NaN keys by default.
    """
    # Iterating over the groups (rather than groupby().apply) keeps the label
    # column in the result on every pandas version: newer releases exclude the
    # grouping columns inside apply(), and reset_index(drop=True) would then
    # discard the label entirely.
    sampled_groups = [
        group.sample(n=min(max_samples, len(group)), random_state=random_state)
        for _, group in df.groupby(label_column)
    ]
    if not sampled_groups:
        # Empty input (or only NaN labels): return an empty frame with the same columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(sampled_groups).reset_index(drop=True)

# Take up to five files per 'scan results' label, then keep index labels 0 through 32.
# .loc slicing is label-based and inclusive, so at most 33 rows remain.
sampled_data = sample_per_label(df, 'scan results').loc[0:32]

In [9]:
# Peek at the first sampled row to sanity-check the available columns.
sampled_data.loc[0]

file path            pytorch-main/torch/csrc/utils/pythoncapi_compat.h
scan results                                                  0BSD BSD
concluded results                                                  NaN
Name: 0, dtype: object

In [9]:
# Earlier experiment, kept for reference: restrict the run to four 'BSD-style' files.
# df = df[df['scan results'] == 'BSD-style'].sample(n=4, random_state=42)
# df.reset_index(inplace=True)
# Process the sampled files with the Mistral 7B model, using prompt_function and license_parser.
# NOTE(review): this is the only process_dataset call that passes extra_file_path='extras';
# the Llama and Gemma calls omit it — confirm whether the difference is intentional.
sampled_data_mistral_7b = client.process_dataset(sampled_data, df_path='pytorch-main.csv',
                                    model=Models.MISTRAL_7b,
                                    prompt_function=prompt_function,
                                    parser=license_parser,
                                    extra_file_path='extras',
                                    log_every=5,
                                )

[32m2024-06-06 12:37:58.034[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m173[0m - [1mProcessing index: 0[0m
[32m2024-06-06 12:38:07.307[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m173[0m - [1mProcessing index: 5[0m
[32m2024-06-06 12:38:14.613[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m173[0m - [1mProcessing index: 10[0m
[32m2024-06-06 12:38:42.489[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m173[0m - [1mProcessing index: 15[0m
[32m2024-06-06 12:38:50.538[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m173[0m - [1mProcessing index: 20[0m
[32m2024-06-06 12:39:13.782[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m173[0m - [1mProcessing index: 25[0m
[32m2024-06-06 12:39:23.271[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_d

In [10]:
# Same run as the Mistral cell, with the Llama 3 8B model.
# NOTE(review): extra_file_path is not passed here, unlike the Mistral call — confirm the default is what is wanted.
sampled_data_llama_3_8b = client.process_dataset(sampled_data, df_path='pytorch-main.csv',
                                    model=Models.LLAMA_3_8b,
                                    prompt_function=prompt_function,
                                    parser=license_parser,
                                    log_every=5)

[32m2024-06-06 12:39:33.681[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m173[0m - [1mProcessing index: 0[0m
[32m2024-06-06 12:39:49.158[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m173[0m - [1mProcessing index: 5[0m
[32m2024-06-06 12:39:54.633[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m173[0m - [1mProcessing index: 10[0m
[32m2024-06-06 12:40:02.085[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m173[0m - [1mProcessing index: 15[0m
[32m2024-06-06 12:40:13.356[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m173[0m - [1mProcessing index: 20[0m
[32m2024-06-06 12:40:30.372[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_dataset[0m:[36m173[0m - [1mProcessing index: 25[0m
[32m2024-06-06 12:40:38.052[0m | [1mINFO    [0m | [36mhelpers.llm_client[0m:[36mprocess_d

In [None]:
# Same run with the Gemma 7B model (the variable name says "gemma_1_7b", the enum member is GEMMA_7b).
# NOTE(review): this cell has no execution count and the next cell loads Gemma results from a CSV,
# so the responses used downstream presumably come from an earlier run — confirm.
sampled_data_gemma_1_7b = client.process_dataset(sampled_data, df_path='pytorch-main.csv',
                                    model=Models.GEMMA_7b,
                                    prompt_function=prompt_function,
                                    parser=license_parser,
                                    log_every=5)

In [46]:
# Load the Gemma responses from a saved CSV; this replaces any in-memory sampled_data_gemma_1_7b.
# NOTE(review): presumably this file was written by a previous process_dataset run — confirm its origin.
sampled_data_gemma_1_7b = pd.read_csv('results/pytorch-main-GEMMA_7b.csv')

In [48]:
# Give each model's 'response' column a model-specific name so the three frames can be joined.
sampled_data_llama_3_8b = sampled_data_llama_3_8b.rename(columns={'response': 'llama_response'})
sampled_data_mistral_7b = sampled_data_mistral_7b.rename(columns={'response': 'mistral_response'})
sampled_data_gemma_1_7b = sampled_data_gemma_1_7b.rename(columns={'response': 'gemma_response'})

# Inner-join the three result sets on 'file path'; other columns shared between
# the frames receive pandas' default _x/_y suffixes.
merged_df = (
    sampled_data_llama_3_8b
    .merge(sampled_data_mistral_7b, on='file path')
    .merge(sampled_data_gemma_1_7b, on='file path')
)

# Side-by-side view of the scanner label and each model's raw answer.
merged_df[['scan results_x', 'llama_response', 'mistral_response', 'gemma_response']]

Unnamed: 0,scan results_x,llama_response,mistral_response,gemma_response
0,0BSD BSD,**Licenses = [Zero Clause BSD (0BSD)]**\n**SPD...,**Licenses = [Zero Clause BSD (0BSD)]**\n**SPD...,**Licenses = [ 'Zero Clause BSD (0BSD)' ]**\n*...
1,Apache-2.0,"**Licenses = [Apache License, Version 2.0]**\n...","**Licenses = [Apache License, Version 2.0]**\n...","**Licenses = [Apache License, Version 2.0]**\n..."
2,Apache-2.0,**Licenses = [Apache-2.0]**\n**SPDX-IDs = [Apa...,**Licenses = [Apache-2.0]**\n**SPDX-IDs = [Apa...,**Licenses = [Apache-2.0]**\n**SPDX-IDs = [Apa...
3,Apache-2.0 BSD-3-Clause BSL-1.0,"**Licenses = [BSD License, Apache License, Boo...","**Licenses = [BSD License, Apache License, Boo...",**Licenses:**\n\n- Licenses = [Early BSD Licen...
4,Apache-possibility,**Licenses = [Apache License]**\n**SPDX-IDs = ...,**Licenses = [Apache License]**\n**SPDX-IDs = ...,**Licenses = [Apache License]**\n**SPDX-IDs = ...
5,BSD,**Licenses = [BSD License]**\n**SPDX-IDs = [BS...,**Licenses = [BSD License]**\n**SPDX-IDs = [BS...,**Licenses = [BSD License]**\n\n**SPDX-IDs = [...
6,BSD,**Licenses = [BSD License]**\n**SPDX-IDs = [BS...,**Licenses = [BSD License]**\n**SPDX-IDs = [BS...,**Licenses = [BSD License]**\n\n**SPDX-IDs = [...
7,BSD,**Licenses = [BSD License]**\n**SPDX-IDs = [BS...,**Licenses = [BSD License]**\n**SPDX-IDs = [BS...,**Licenses = [BSD License]**\n**SPDX-IDs = [BS...
8,BSD,**Licenses = []**\n**SPDX-IDs = []**\n\nNo sof...,**Licenses = []**\n**SPDX-IDs = []**\n\nNo sof...,**Licenses = []**\n**SPDX-IDs = []**\n\n**Expl...
9,BSD,**Licenses = [BSD License]**\n**SPDX-IDs = [BS...,**Licenses = [BSD License]**\n**SPDX-IDs = [BS...,**Licenses = [BSD License]**\n\n**SPDX-IDs = [...


In [None]:
# The Mistral frame's answer column was renamed to 'mistral_response' by the rename cell,
# and the '_x' suffix exists only on merged_df, so select the unsuffixed 'scan results' here.
sampled_data_mistral_7b[['scan results', 'mistral_response']]

In [16]:
# response = client._infer(Models.LLAMA_3_8b, prompt_function(client._extract_comments(nirjas.extract(df.loc[1374, 'file path']))))

In [None]:
import re

def parse_license_info(response_text):
    """
    Extracts license names and SPDX IDs from the structured response of the prompt.

    The response is expected to contain ``Licenses = [...]`` and
    ``SPDX-IDs = [...]``; only the first occurrence of each is used.

    Args:
        response_text: The text output from the prompt. Non-string values
            (e.g. ``None`` or NaN) are treated as an empty response.

    Returns:
        A dictionary with keys "Licenses" and "SPDX-IDs", each holding a list
        of strings. Empty brackets (``[]``) or a missing section give an empty
        list.

    Note:
        Items are separated on commas, so a name that itself contains a comma
        (e.g. "Apache License, Version 2.0") is still split into two entries.
    """
    if not isinstance(response_text, str):
        return {"Licenses": [], "SPDX-IDs": []}

    return {
        "Licenses": _first_bracketed_list("Licenses", response_text),
        "SPDX-IDs": _first_bracketed_list("SPDX-IDs", response_text),
    }


def _first_bracketed_list(label, text):
    """Return the cleaned items of the first ``<label> = [...]`` occurrence in ``text``."""
    match = re.search(re.escape(label) + r"\s*=\s*\[(.*?)\]", text)
    if match is None:
        return []
    # Split on bare commas, then trim whitespace and the quotes/backticks some
    # models wrap around items; empty fragments (e.g. from "[]") are dropped.
    cleaned = (item.strip().strip("'\"`").strip() for item in match.group(1).split(","))
    return [item for item in cleaned if item]


In [None]:
# Keep only the rows that received a model response, then parse each response.
# Building the new column with assign() on the filtered frame avoids writing into
# a slice of the original DataFrame (the cause of pandas' SettingWithCopyWarning).
# NOTE(review): no visible cell creates a 'response' column on sampled_data —
# presumably process_dataset adds it in place; confirm.
sampled_data = (
    sampled_data[sampled_data['response'].notna()]
    .assign(response_parsed=lambda answered: answered['response'].apply(parse_license_info))
)

In [None]:
# Print the raw text stored for the row labelled 0.
# NOTE(review): the only visible code that creates 'file text' is the commented-out loader,
# and label 0 may have been removed by the notna() filter — confirm both before relying on this cell.
print(sampled_data.loc[0, 'file text'])

In [None]:
# Count how many of the remaining rows fall under each 'scan results' label.
sampled_data['scan results'].value_counts()

In [None]:
# Show each file next to the dict produced by parse_license_info for its response.
sampled_data[['file path', 'response_parsed']]