In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install datasets bitsandbytes trl

In [None]:
!pip install datasets bitsandbytes trl transformers peft huggingface-hub accelerate safetensors pandas matplotlib

In [None]:
!pip install -U bitsandbytes

In [None]:
!pip install --upgrade transformers accelerate bitsandbytes

In [34]:
import os
import torch
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset

In [3]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_4bit=True, 
                                         bnb_4bit_quant_type="nf4",  # "nf4" quantization type
                                         bnb_4bit_use_double_quant=True,  # Use double quantization
                                         bnb_4bit_compute_dtype=torch.float32  # Set compute dtype
                                        )

In [4]:
model_4bit = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-3B-Instruct",
    device_map="auto",
    quantization_config=quantization_config, 
    dtype="auto"
)


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [8]:
print(model_4bit.get_memory_footprint()/1e6)

2010.079488


In [10]:
model_4bit

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-35): 36 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=True)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=True)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm

In [11]:
model_4bit = prepare_model_for_kbit_training(model_4bit)

In [12]:
config = LoraConfig(
    # the rank of the adapter, the lower the fewer parameters you'll need to train
    r=8,
    lora_alpha=16, # multiplier, usually 2*r
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    # Newer models, such as Phi-3 at time of writing, may require
    # manually setting target modules
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj' 'gate_proj', 'up_proj', 'down_proj'],
    )

In [14]:
model_4bit = get_peft_model(model_4bit, config)

In [15]:
model_4bit

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 2048)
        (layers): ModuleList(
          (0-35): 36 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.L

In [17]:
print(model_4bit.get_memory_footprint()/1e6)

2673.000704


In [18]:
train_p, tot_p = model_4bit.get_nb_trainable_parameters()
print(f'Trainable parameters: {train_p/1e6:.2f}M')
print(f'Total parameters: {tot_p/1e6:.2f}M')
print(f'% of trainable parameters: {100*train_p/tot_p:.2f}%')

Trainable parameters: 10.03M
Total parameters: 3095.97M
% of trainable parameters: 0.32%


In [50]:
data_set = pd.read_parquet("/kaggle/input/telelogs-exp/telelogs_CoT.parquet")

In [51]:
data = data_set['q']
label = data_set['c']

In [52]:
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=1)

In [53]:
train_data = pd.concat([X_train, y_train], axis = 1)
test_data = pd.concat([X_test, y_test], axis = 1)

In [54]:
train_data = train_data.reset_index()
test_data = test_data.reset_index()

In [55]:
train_data = train_data.drop("index", axis = 1)
test_data = test_data.drop("index", axis = 1)

In [56]:
train_data, test_data

(                                                      q   c
 0     Analyze the 5G wireless network drive-test use...  C6
 1     Analyze the 5G wireless network drive-test use...  C5
 2     Analyze the 5G wireless network drive-test use...  C2
 3     Analyze the 5G wireless network drive-test use...  C7
 4     Analyze the 5G wireless network drive-test use...  C3
 ...                                                 ...  ..
 1915  Analyze the 5G wireless network drive-test use...  C1
 1916  Analyze the 5G wireless network drive-test use...  C3
 1917  Analyze the 5G wireless network drive-test use...  C2
 1918  Analyze the 5G wireless network drive-test use...  C7
 1919  Analyze the 5G wireless network drive-test use...  C1
 
 [1920 rows x 2 columns],
                                                      q   c
 0    Analyze the 5G wireless network drive-test use...  C5
 1    Analyze the 5G wireless network drive-test use...  C3
 2    Analyze the 5G wireless network drive-test use...  C3


In [68]:
train_data.head(2), test_data.head(2)

(                                                   q   c
 0  Analyze the 5G wireless network drive-test use...  C6
 1  Analyze the 5G wireless network drive-test use...  C5,
                                                    q   c
 0  Analyze the 5G wireless network drive-test use...  C5
 1  Analyze the 5G wireless network drive-test use...  C3)

In [70]:
train_dataset = 

q 0       Analyze the 5G wireless network drive-test use...
1       Analyze the 5G wireless network drive-test use...
2       Analyze the 5G wireless network drive-test use...
3       Analyze the 5G wireless network drive-test use...
4       Analyze the 5G wireless network drive-test use...
                              ...                        
1915    Analyze the 5G wireless network drive-test use...
1916    Analyze the 5G wireless network drive-test use...
1917    Analyze the 5G wireless network drive-test use...
1918    Analyze the 5G wireless network drive-test use...
1919    Analyze the 5G wireless network drive-test use...
Name: q, Length: 1920, dtype: object
c 0       C6
1       C5
2       C2
3       C7
4       C3
        ..
1915    C1
1916    C3
1917    C2
1918    C7
1919    C1
Name: c, Length: 1920, dtype: object
