## Dependences

In [2]:
!pip install --upgrade google-api-python-client
!pip install --upgrade bitsandbytes
!pip install git+https://github.com/huggingface/transformers.git
!pip install git+https://github.com/huggingface/accelerate.git
!pip install tiktoken
!pip install torch torchvision torchaudio
!pip install scipy
!pip install statistics

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-a7mev45y
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-a7mev45y
  Resolved https://github.com/huggingface/transformers.git to commit 35c04596f8938370dd5a2930fb724781f8ea35b0
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/huggingface/accelerate.git
  Cloning https://github.com/huggingface/accelerate.git to /tmp/pip-r

In [3]:
!nvidia-smi

Thu Jul 20 15:43:44 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3070        Off | 00000000:01:00.0  On |                  N/A |
|  0%   40C    P8              21W / 240W |    610MiB /  8192MiB |     37%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
!free

               total       utilisé      libre     partagé tamp/cache   disponible
Mem:        32688392     3854096    16985840       88832    11848456    28283276
Partition d'échange:    2097148           0     2097148


In [5]:
import torch
torch.cuda.is_available()

True

## Imports

In [6]:
import os
import torch
import time
import datetime
import traceback
import gc
import statistics

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

  from .autonotebook import tqdm as notebook_tqdm


## Loading the model

In [8]:
model_name = 'legendhasit/xgen-7b-8k-inst-8bit'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config
)

Downloading (…)okenizer_config.json: 100%|██████████| 488/488 [00:00<00:00, 4.75MB/s]
Downloading (…)tokenization_xgen.py: 100%|██████████| 8.40k/8.40k [00:00<00:00, 24.4MB/s]
A new version of the following files was downloaded from https://huggingface.co/legendhasit/xgen-7b-8k-inst-8bit:
- tokenization_xgen.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Downloading (…)cial_tokens_map.json: 100%|██████████| 148/148 [00:00<00:00, 2.04MB/s]
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Downloading (…)lve/main/config.json: 100%|██████████| 549/549 [00:00<00:00, 5.92MB/s]
Downloading (…)model.bin.index.json: 100%|██████████| 26.8k/26.8k [00:00<00:00, 89.0MB/s]
Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


## Inference

In [30]:
# Parameters:
# - DATASET_NAME
# - HEADER
# - PROMPT_TEMPLATE
# - samples in dataset 'input/<dataset_name>'
# - instructions in 'instructions.txt'

def mkdir(folder_path):
    try:
        os.mkdir(folder_path)
    except FileExistsError:
        pass

mkdir('input')

# Define prompt template
# ==========================================================================================
HEADER = (
    "A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's questions."
)

PROMPT_TEMPLATE = HEADER + """

### Human: {instruction}

{article}

### Assistant:"""

# Load samples from dataset
# ==========================================================================================
DATASET_NAME = 'ami'
samples = os.listdir('input/' + DATASET_NAME + '/texts')
n_samples = len(samples)
print('Found', n_samples, 'samples')

# Load instructions
# ==========================================================================================
instruction_file = open('instructions.txt', 'r', encoding='utf-8')
instructions = instruction_file.readlines()
instruction_file.close()
n_instructions = len(instructions)
for i in range(n_instructions):
    instructions[i] = instructions[i].replace('\n', '')
print('Found', n_instructions, 'instructions')

# Determines the average CHARACTERS/TOKENS ratio to split big prompts
# This is because AMI and FredSum do not have the same average.
# ==========================================================================================
token_lengths = []
characters_to_tokens_ratios = []
for sample_n in range(n_samples):
    sample_file = open('input/' + DATASET_NAME + '/texts/' + samples[sample_n], 'r', encoding='utf-8')
    sample = sample_file.read()
    sample_file.close()
    sample_len = len(sample)

    tokens = tokenizer(sample, return_tensors="pt").to('cuda')
    tokens_len = len(tokens['input_ids'][0])
    print(sample_file, sample_len)
    token_lengths.append(tokens_len)
    ratio = round(sample_len / tokens_len, 2)
    characters_to_tokens_ratios.append(ratio)


print(token_lengths)

CHAR_TOKEN_RATIO = statistics.mean(characters_to_tokens_ratios)
MAX_SAMPLE_LENGTH = max(token_lengths)

del characters_to_tokens_ratios
del token_lengths

print('Average chars/tokens ratio:', CHAR_TOKEN_RATIO)
print('Average chars/tokens ratio:', MAX_SAMPLE_LENGTH)

Found 7 samples
Found 6 instructions
<_io.TextIOWrapper name='input/ami/texts/sample_4.txt' mode='r' encoding='utf-8'> 4392
<_io.TextIOWrapper name='input/ami/texts/sample_6.txt' mode='r' encoding='utf-8'> 36043
<_io.TextIOWrapper name='input/ami/texts/sample_3.txt' mode='r' encoding='utf-8'> 43275
<_io.TextIOWrapper name='input/ami/texts/sample_2.txt' mode='r' encoding='utf-8'> 38458
<_io.TextIOWrapper name='input/ami/texts/sample_7.txt' mode='r' encoding='utf-8'> 36043
<_io.TextIOWrapper name='input/ami/texts/sample_5.txt' mode='r' encoding='utf-8'> 38458
<_io.TextIOWrapper name='input/ami/texts/sample_1.txt' mode='r' encoding='utf-8'> 34013
[1271, 11302, 12870, 11683, 11302, 11683, 9243]
Average chars/tokens ratio: 3.3514285714285714
Average chars/tokens ratio: 12870


In [26]:
# This is taken to prevent 
MAX_TOKENS_IN_ONE_BATCH = 5000

def get_input_tokens(transcript):
    """This takes as input the transcript and splits it if two long into separate prompts.
    The number of 52 is chosen arbitrarily
    It is based on the average chars/tokens ratio of the selected dataset samples."""

    tokenized_length = len(tokenizer(transcript, return_tensors="pt").to('cuda')['input_ids'][0])
    print(tokenized_length)

sample_file = open('input/fredsum/texts/sample_1.txt', 'r', encoding='utf-8')
sample = sample_file.read()
sample_file.close()

print(get_input_tokens(sample))


6662
None


In [7]:
# Inference
# ==========================================================================================
initial_time = time.time()
skipped_samples = 0

mkdir('intermediate')
mkdir('intermediate/' + DATASET_NAME)

print('Starting computation...')

# For each instruction
for instruction_n in range(n_instructions):

    # Read instruction and create prompt
    instruction = instructions[instruction_n]
    
    # For each sample in dataset
    for sample_n in range(n_samples):

        # Estimate completion and time.
        cur_samples = instruction_n * n_samples + sample_n - skipped_samples
        tot_samples = n_instructions * n_samples - skipped_samples
        progress = cur_samples / tot_samples
        pct = round(progress * 100, 1)
        print('Prompting instruction N' + str(instruction_n + 1) + '/' + str(n_instructions) + ' on sample N' + str(sample_n + 1) + '/' + str(n_samples))
        print('-- Completion: ' + str(pct) + '%')
        if cur_samples > 0:
            approx_total = (time.time() - initial_time) / cur_samples * tot_samples
            approx_remaining = approx_total * (1 - progress)
            print('-- Estimated Remaining Time: ' + str(datetime.timedelta(seconds=int(approx_remaining))) + ' (total ' + str(datetime.timedelta(seconds=int(approx_total))) + ')')
        
        # Read sample and generate prompt
        sample_file_path = 'input/' + DATASET_NAME + '/texts/' + samples[sample_n]
        sample_file = open(sample_file_path, 'r', encoding='utf-8')
        sample = sample_file.read()
        sample_file.close()
        prompt = PROMPT_TEMPLATE.format(instruction=instruction, article=sample)
        
        # Find target file
        target_file_path = 'intermediate/' + DATASET_NAME + '/' + str(instruction_n + 1) + '_' + str(sample_n + 1) + '.txt'
        if os.path.isfile(target_file_path):
            print('-- Found intermediate result file \'' + target_file_path + '\', skipped.')
            skipped_samples += 1
            continue
            
        try:
        
            # Sample one answer
            input_ids = tokenizer(prompt, return_tensors="pt").to('cuda')
            sample = model.generate(**input_ids, do_sample=True, max_new_tokens=2048, top_k=100, eos_token_id=50256, temperature=0.3)
            output = tokenizer.decode(sample[0]).strip()

            # Save answer in file
            target_file = open(target_file_path, 'w', encoding='utf-8')
            target_file.write(output)
            target_file.close()

            del input_ids
            del sample
        
        except:
            print('Could not compute prompt:')
            print(prompt)
            traceback.print_exc()

        gc.collect()

delta = time.time() - initial_time
print('Done! Took', datetime.timedelta(seconds=int(delta)), 'seconds')

Starting computation...
Prompting instruction N1/6 on sample N1/10
-- Completion: 0.0%
-- Found intermediate result file 'intermediate/1_1.txt', skipped.
Prompting instruction N1/6 on sample N2/10
-- Completion: 0.0%
-- Found intermediate result file 'intermediate/1_2.txt', skipped.
Prompting instruction N1/6 on sample N3/10
-- Completion: 0.0%
-- Found intermediate result file 'intermediate/1_3.txt', skipped.
Prompting instruction N1/6 on sample N4/10
-- Completion: 0.0%
-- Found intermediate result file 'intermediate/1_4.txt', skipped.
Prompting instruction N1/6 on sample N5/10
-- Completion: 0.0%
-- Found intermediate result file 'intermediate/1_5.txt', skipped.
Prompting instruction N1/6 on sample N6/10
-- Completion: 0.0%
-- Found intermediate result file 'intermediate/1_6.txt', skipped.
Prompting instruction N1/6 on sample N7/10
-- Completion: 0.0%
-- Found intermediate result file 'intermediate/1_7.txt', skipped.
Prompting instruction N1/6 on sample N8/10
-- Completion: 0.0%
-- 

In [9]:
del input_ids
del sample
del output

## Calculs de scores

In [10]:
!pip install rouge_score rouge
!pip install evaluate
!pip install bert-score
!pip install sacrebleu
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting absl-py
  Downloading absl_py-1.4.0-py3-none-any.whl (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.5/126.5 KB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting joblib
  Downloading joblib-1.3.1-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 KB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel fo

In [2]:
import evaluate
import os


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/linagora/.local/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so
CUDA SETUP: CUDA runtime path found: /home/linagora/anaconda3/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /home/linagora/.local/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [4]:
# Method and variables
# ==========================================================================================
print('Starting scores computation...')
bleu = evaluate.load("bleu")
rouge = evaluate.load('rouge')
bertscore = evaluate.load('bertscore')

STORAGE_FILE_NAME = 'scores'
PREPROCESS_SUMMARIES = True

# Script itself
# ==========================================================================================

# Find output file for CSV scores
mkdir('output')
mkdir('output/' + DATASET_NAME)
storage_file = open('output/' + DATASET_NAME + '/' + STORAGE_FILE_NAME + '.csv', 'w', encoding='utf-8')
storage_file.write('path;rouge2;rougel;bertscore\n')

target_file_paths = []
references = []
predictions = []

# For each instruction
for instruction_n in range(n_instructions):

    # Read instruction and create prompt
    instruction = instructions[instruction_n]
    
    # For each sample in dataset
    for sample_n in range(n_samples):
        
        # Find target file
        target_file_path = 'intermediate/' + DATASET_NAME + '/' + str(instruction_n + 1) + '_' + str(sample_n + 1) + '.txt'
        if not os.path.isfile(target_file_path): # A MODIFIER : SI UN RESUME N'A PAS ETE GENERE
            print('-- Found no intermediate result file \'' + target_file_path + '\', skipped.')
            continue
        
        # Read sample and generate prompt -> Keep summary
        summary_file_path = 'input/' + DATASET_NAME + '/summaries/sample_' + str(sample_n + 1) + '.txt'
        summary_file = open(summary_file_path, 'r', encoding='utf-8')
        references.append(summary_file.read())
        summary_file.close()

        # Access generated summary
        target_file = open(target_file_path, 'r', encoding='utf-8')
        prediction = target_file.read()
        target_file.close()

        # Process answer
        if PREPROCESS_SUMMARIES:
            separator = "### Assistant:"
            prediction = prediction[prediction.index(separator) + len(separator):]
            if prediction[0] == " ": # Enlever l'espace devant
                prediction = prediction[1:]
            prediction = prediction[:-len("<|endoftext|>") - 2]
            #print(prediction)
            #print('---------------------------------------')
        
        # Add prediction
        target_file_paths.append(target_file_path)
        predictions.append(prediction)

# Calculate metrics
result_rouge = rouge.compute(predictions=predictions, references=references, use_aggregator=False)
#result_bleu = bleu.compute(predictions=predictions, references=references)
result_bertscore = bertscore.compute(predictions=predictions, references=references, lang='fr', rescale_with_baseline=True, verbose=True)

# Write to csv
# Forget about BLEU...
# Format: PATH | ROUGE2 | ROUGEL | BLEU | BERTScore
for i in range(len(target_file_paths)):
    ligne = target_file_paths[i]
    ligne += ';' + str(result_rouge['rouge2'][i]) + ";" + str(result_rouge['rougeL'][i])
    #ligne += ";" + str(result_bleu['bleu'])
    ligne += ";" + str(result_bertscore['f1'][i])

    storage_file.write(ligne + '\n')

storage_file.close()
print('Done!')

Starting scores computation...
-- Found no intermediate result file 'intermediate/ami/1_1.txt', skipped.
-- Found no intermediate result file 'intermediate/ami/1_4.txt', skipped.
-- Found no intermediate result file 'intermediate/ami/1_5.txt', skipped.
-- Found no intermediate result file 'intermediate/ami/1_6.txt', skipped.
-- Found no intermediate result file 'intermediate/ami/1_7.txt', skipped.
-- Found no intermediate result file 'intermediate/ami/1_8.txt', skipped.
-- Found no intermediate result file 'intermediate/ami/1_9.txt', skipped.
-- Found no intermediate result file 'intermediate/ami/2_1.txt', skipped.
-- Found no intermediate result file 'intermediate/ami/2_4.txt', skipped.
-- Found no intermediate result file 'intermediate/ami/2_5.txt', skipped.
-- Found no intermediate result file 'intermediate/ami/2_6.txt', skipped.
-- Found no intermediate result file 'intermediate/ami/2_7.txt', skipped.
-- Found no intermediate result file 'intermediate/ami/2_8.txt', skipped.
-- Foun

  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 16421.56 seconds, 0.00 sentences/sec
Done!
