In [1]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import numpy as np
from scipy.stats import entropy
import json
import pandas as pd
import ast
import math

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    quantization_config=bnb_config,
    device_map="auto"
)




In [3]:
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print("✅ Model loaded successfully!")
print(f"Model device: {next(model.parameters()).device}")

PyTorch version: 2.9.0.dev20250801+cu129
CUDA available: True
✅ Model loaded successfully!
Model device: cuda:0


In [4]:
import pandas as pd
from transformers import AutoTokenizer

# 1. Load the dataset
csv_path = "C:\\Users\\s\\Desktop\\Dev\\SamsungProject\\extract\\code_dataset_10k.csv"
df = pd.read_csv(csv_path)

# 2. Initialize the tokenizer (use the same one your model uses)
#    Since you're using Qwen, we'll use its tokenizer
model_name = "unsloth/Qwen2.5-Coder-14B-Instruct-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 3. Function to count tokens
def count_tokens(text):
    return len(tokenizer.encode(str(text)))

# 4. Add token count column for the 'code' field
df['code_token_length'] = df['code'].apply(count_tokens)

# 5. Show results
print("Token lengths for 'code':")
print(df[['seq_id', 'entry_point', 'code_token_length']].sort_values(by='code_token_length', ascending=False))

# 6. Get maximum token length
max_tokens = df['code_token_length'].max()
print(f"\n🎯 Maximum token length in 'code': {max_tokens}")

# Optional: Get some statistics
print(f"\n📊 Statistics:")
print(df['code_token_length'].describe())

Token lengths for 'code':
           seq_id              entry_point  code_token_length
3250  38901396701  next_closest_palindrome                681
8674  30074294089          transform_words                504
8373  14120506416             ideal_arrays                496
5809   1185920464     is_perfect_rectangle                487
2788   7804968646          is_valid_number                479
...           ...                      ...                ...
6914   1516364756          smallest_number                 10
8847  31357937629                 list_sum                 10
2173  24802847166                 find_max                 10
8978  14699557128          smallest_number                 10
3932  41065408432          smallest_number                 10

[10000 rows x 3 columns]

🎯 Maximum token length in 'code': 681

📊 Statistics:
count    10000.000000
mean        80.683000
std         56.665416
min         10.000000
25%         42.000000
50%         67.000000
75%        102.000

In [5]:
import pandas as pd
from transformers import AutoTokenizer

# 1. Load the dataset
csv_path = "C:\\Users\\s\\Desktop\\Dev\\SamsungProject\\extract\\code_dataset_10k.csv"
df = pd.read_csv(csv_path)

# 2. Initialize the Qwen tokenizer
model_name = "unsloth/Qwen2.5-Coder-14B-Instruct-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 3. Function to count tokens
def count_tokens(text):
    return len(tokenizer.encode(str(text)))

# 4. Add token count column for the 'code' field
df['code_token_length'] = df['code'].apply(count_tokens)

# 5. Sort by token length in descending order and show top 30
top_30_by_tokens = df[['seq_id', 'entry_point', 'code', 'code_token_length']] \
    .sort_values(by='code_token_length', ascending=False) \
    .reset_index(drop=True)

# 6. Display results
print("Top 30 entries by code token length (sorted descending):")
print(top_30_by_tokens)

# 7. Show the maximum token count
max_tokens = top_30_by_tokens['code_token_length'].max()
print(f"\n🎯 Maximum token length in 'code': {max_tokens}")

# Optional: Save this ranked list to a new CSV
top_30_by_tokens.to_csv("top_30_by_code_token_length.csv", index=False)
print(f"\n✅ Saved sorted results to 'top_30_by_code_token_length.csv'")

Top 30 entries by code token length (sorted descending):
           seq_id              entry_point  \
0     38901396701  next_closest_palindrome   
1     30074294089          transform_words   
2     14120506416             ideal_arrays   
3      1185920464     is_perfect_rectangle   
4      7804968646          is_valid_number   
...           ...                      ...   
9995   1516364756          smallest_number   
9996  31357937629                 list_sum   
9997  24802847166                 find_max   
9998  14699557128          smallest_number   
9999  41065408432          smallest_number   

                                                   code  code_token_length  
0     from functools import reduce\n\ndef next_close...                681  
1     from collections import defaultdict\n\ndef tra...                504  
2     from collections import Counter\nfrom functool...                496  
3     from typing import List\n\nclass Solution:\n  ...                487  
4    