In [None]:
import os
import pickle

import nirjas
import pandas as pd

from helpers.llm_client import LLMClient
from helpers.models import Models
from helpers.parsers import license_parser
from helpers.functions import *
# Lift pandas' display limits so frames render every row and the full text of each cell.
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# Read the two per-repository scan CSVs and stack them into one frame.
# NOTE(review): only the Linux file is read with index_col=0 — confirm the PyTorch
# CSV really has no saved index column, and that duplicate index labels after the
# concat are handled by sample_by_label_limit.
df0 = pd.read_csv('extras/linux-master.csv', index_col=0)
df1 = pd.read_csv('extras/pytorch-main.csv')
df2 = pd.concat([df0, df1])

# Sample with the project helper keyed on 'scan results', then drop the
# 'concluded results' column from the sampled frame.
# (10 and 42 are presumably the per-label limit and the random seed — TODO confirm.)
df = sample_by_label_limit(df2, 'scan results', 10, 42).drop(columns=['concluded results'])

# Number of sampled rows per distinct 'scan results' value.
df['scan results'].value_counts()

In [None]:
# Preview the first five rows of df.
df.head(5)

In [None]:
# Give every row its own fresh empty list (one distinct list object per row,
# so filling one row later cannot affect another).
df['license_relevant_lines'] = [list() for _ in df.index]

In [None]:
# Preview the first five rows of df again, now including 'license_relevant_lines'.
df.head(5)

In [None]:
# Call the project helper on the license details directory.
# (presumably it writes extras/license_information/license_dataset.txt, which later cells read — TODO confirm)
create_license_dataset('extras/license_information/details')
client = LLMClient()
# NOTE(review): df is replaced by the helper's return value, so re-running this cell
# feeds the already-processed frame back in — confirm temp_function is safe to repeat.
df = client.temp_function(df)
# Preview the first five rows of the returned frame.
df.head(5)

In [None]:
# Write the current df to extras/dataset.csv.
df.to_csv('extras/dataset.csv')

In [None]:
# Look up the 10 most similar license lines for the comments of the row labelled 13.
# NOTE(review): a later cell in this notebook calls the same helper with two extra
# positional arguments (pre-computed embeddings and a model) — confirm this shorter
# call form is still supported by get_top_similar_license_lines.
file_idx = 13
results = get_top_similar_license_lines(
    df.loc[file_idx, 'file_comments'],
    'extras/license_information/license_dataset.txt',
    top_k=10,
    embedding_approach='license-embedding',
)
results

In [None]:
# Display the row of df whose index label is 119.
df.loc[119]

In [None]:
def prompt_function(text):
    """Build the LLM prompt asking for the license-relevant lines found in ``text``.

    Args:
        text: Value interpolated verbatim under the ``[Text]`` heading. The prompt
            describes it as a list of ``(line_number, score, text)`` tuples.

    Returns:
        str: The complete prompt with ``text`` embedded at the end.
    """
    # The worked example keeps the copyright line in its output so that it agrees
    # with guideline 1, which names "Copyright" as a license-relevant keyword;
    # a contradictory example would teach the model to drop such lines.
    return f"""
    [Task]
        - Given a list of tuples representing lines of text (with some metadata), identify and return only the lines that are relevant to software licenses.
        - Each tuple has the form `(line_number, score, text)`.

    [Guidelines]
        1. **License Relevance:** A line is considered relevant if it contains keywords typically found in software licenses (e.g., "Copyright," "License," "SPDX-License-Identifier," common license names like "MIT," "GPL," "Apache"). 
        2. **Output Format:** Return a list containing only the license-relevant lines of text. If multiple relevant lines exist, include them all in the list. If no relevant lines are found, return an empty list: `[]`.
        3. DO NOT provide an explanation. Return the output only
        4. DO NOT make up licenses or assume any information. Return the lines as they are in the input
        
    [Example Input]
        [(1, 0.51916325, 'Copyright 2015, Anton Blanchard, IBM Corp.'),
        (0, 0.50058347, 'SPDX-License-Identifier: GPL-2.0-only')]
    
    [Example Output]
        ['Copyright 2015, Anton Blanchard, IBM Corp.', 'SPDX-License-Identifier: GPL-2.0-only']

    [Additional Notes]
        - The `line_number` and `score` are provided for context, but you should focus solely on the `text` to determine license relevance.
        - If unsure about a line's relevance, err on the side of including it in the output. 

    [Text]
    {text}
    """

In [None]:
# Send the rows of df from index label 10 onward through client.process_dataset,
# using the Llama 3 8B model, the prompt builder defined in this notebook and the
# project's license_parser.
llama_3_results = client.process_dataset(
    df.loc[10:],
    df_path='pytorch-main.csv',
    model=Models.LLAMA_3_8b,
    prompt_function=prompt_function,
    parser=license_parser,
    extra_file_path='extras',
    log_every=5,
)

In [None]:
# Show the 'response' values for index labels 71 through 80 (.loc slices include both ends).
llama_3_results.loc[71:80, 'response']

In [None]:
# Print the raw file comments of row 75 as plain text.
print(llama_3_results.loc[75, 'file_comments'])

In [None]:
# Manual correction: overwrite the stored response for row 75 with a hand-written
# value (a string holding a one-element list literal with the copyright and permission text).
llama_3_results.loc[75, 'response'] = "['Copyright 2015-2016 Collabora Ltd.**  Based on the implementation from the Android Open Source Project,**  Copyright 2012 Google, Inc**  Permission is hereby granted, free of charge, to any person obtaining a*  copy of this software and associated documentation files (the \"Software\"),*  to deal in the Software without restriction, including without limitation*  the rights to use, copy, modify, merge, publish, distribute, sublicense,*  and/or sell copies of the Software, and to permit persons to whom the*  Software is furnished to do so, subject to the following conditions:**  The above copyright notice and this permission notice shall be included in*  all copies or substantial portions of the Software.**  THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR*  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,*  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL*  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR*  OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,*  ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR*  OTHER DEALINGS IN THE SOFTWARE.']"

In [None]:
# Write the labelled results to extras/dataset-labeled-1-75.csv.
# NOTE(review): the manual fix to row 20 is made in a later cell, after this save —
# re-run this cell afterwards if that correction should be in the file.
llama_3_results.to_csv('extras/dataset-labeled-1-75.csv')

In [None]:
# Print the stored response for row 21.
# NOTE(review): the next cell overwrites row 20, not 21 — confirm which row was meant.
print(llama_3_results.loc[21, 'response'])

In [None]:
# Manual correction: overwrite the stored response for row 20 with a hand-written value.
llama_3_results.loc[20, 'response'] = "['Released under the terms of the GNU GPL']"

In [None]:
# Pre-computed license embeddings, addressed relative to the working directory
# like the other 'extras/...' paths in this notebook (no user-specific absolute path).
embeddings_file_path = os.path.join(
    'extras',
    'license_information',
    'license_embeddings',
    '768_SentenceTransformer-license-embedding.pkl',
)
if not os.path.exists(embeddings_file_path):
    # Fail at the cause with a clear message; otherwise `license_embeddings` would be
    # undefined and the lookup below would die with an unrelated-looking NameError.
    raise FileNotFoundError(f"Pre-embedded licenses not found at: {embeddings_file_path}")

print(f"Loading pre-embedded licenses from: {embeddings_file_path}")
# NOTE(security): pickle.load can execute arbitrary code — only load a file you produced yourself.
with open(embeddings_file_path, "rb") as fIn:
    stored_data = pickle.load(fIn)
license_embeddings = stored_data['embeddings']

embedding_model = SentenceTransformer("all-mpnet-base-v2")

# Look up the 10 most similar license lines for the comments of the row labelled 76,
# passing the loaded embeddings and the model to the project helper.
file_idx = 76
results = get_top_similar_license_lines(
    df.loc[file_idx, 'file_comments'],
    'extras/license_information/license_dataset.txt',
    license_embeddings,
    embedding_model,
    top_k=10,
    embedding_approach='license-embedding',
)
results

In [None]:
# Run nirjas on the source file of the row selected by `file_idx`; the row's
# 'file path' value is resolved under the 'extras' directory.
# (nirjas is imported in the notebook's import cell rather than mid-notebook.)
comments = nirjas.extract(os.path.join('extras', df.loc[file_idx, 'file path']))
comments