In [1]:
# Install package
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [3]:
from google.colab import files
files.upload()

Saving instruction_template.txt to instruction_template (1).txt
Saving json_fix_instruction.txt to json_fix_instruction.txt
Saving prompt_template.txt to prompt_template.txt


{'instruction_template (1).txt': b'You will be given a short description from an Airbnb listing in Edinburgh. It may describe either:\n- the property itself, or\n- the surrounding neighbourhood.\n\nYour task is to identify **places or locations described as being near** the property \xe2\x80\x94 for example, anything mentioned as a short walk away, close by, or nearby.\n\nYou should categorise all such places into three groups:\n\n1. `specific_locations` \xe2\x80\x94 locations that are **named and mappable**, such as landmarks, parks, venues, or neighbourhood features (e.g. "The Meadows", "Ocean Terminal").\n2. `general_locations` \xe2\x80\x94 vague or generic references to places that **are not named or are too broad to geocode**, such as "train station", "shops", or "city centre".\n3. `parent_locations` \xe2\x80\x94 the neighbourhood or area where the property is located (e.g. "Marchmont", "Leith"). Do not include these in the other two categories unless the location is explicitly de

In [4]:
def read_txt(path):
    with open(path, "r", encoding="utf-8") as f:
        return f.read().strip()

instruction_template = read_txt("/content/instruction_template.txt")
prompt_template      = read_txt("/content/prompt_template.txt")
json_fix_instruction = read_txt("/content/json_fix_instruction.txt")

print("Loaded chars:",
      len(instruction_template),
      len(prompt_template),
      len(json_fix_instruction))

Loaded chars: 3853 217 848


In [9]:
from google.colab import files
files.upload()

Saving 50Sample.csv to 50Sample.csv


{'50Sample.csv': b'id,latitude,longitude,last_scraped,description\r\n2822076,55.95595,-3.17179,2024/2/19,"Recently renovated 3 bedroom flat in a great location.<br />5 mins walk to Scottish Parliament, Holyrood Palace and the Royal Mile, 10 mins to Princes Street."\r\n3841834,55.94818,-3.20859,2024/2/19,Double bedroom with ensuite bathroom and smaller double bedroom  with large bathroom available to book in this luxury flat in the heart of Edinburgh. The flat also shares a large living room with an open plan kitchen.\r\n3976674,55.94692,-3.19896,2020/12/19,"A quiet and spacious Old Town apartment in the heart of the city.  The property has great views of the Castle from the bedroom, and is within walking distance of the touristic Grassmarket, the Castle and the Conference Centre (EICC). A wonderful home for a relaxing holiday break.<br /><br />- Conveniently located.<br />- Restaurants, shops and sights nearby.<br />- Fully-equipped kitchen.<br /><br /><b>The space</b><br />The apartme

In [13]:
import pandas as pd
df = pd.read_csv("/content/50Sample.csv", encoding="gbk")
df.head()

Unnamed: 0,id,latitude,longitude,last_scraped,description
0,2822076.0,55.95595,-3.17179,2024/2/19,Recently renovated 3 bedroom flat in a great l...
1,3841834.0,55.94818,-3.20859,2024/2/19,Double bedroom with ensuite bathroom and small...
2,3976674.0,55.94692,-3.19896,2020/12/19,A quiet and spacious Old Town apartment in the...
3,4703594.0,55.95908,-3.17312,2024/2/19,Lovely light and spacious Victorian flat in qu...
4,4789840.0,55.959831,-3.20931,2024/6/19,Saxe Coburg Place is renowned as being one of ...


In [14]:
df.columns
df.head(2)

Unnamed: 0,id,latitude,longitude,last_scraped,description
0,2822076.0,55.95595,-3.17179,2024/2/19,Recently renovated 3 bedroom flat in a great l...
1,3841834.0,55.94818,-3.20859,2024/2/19,Double bedroom with ensuite bathroom and small...


In [15]:
# Only retrieve the description column and convert it to a string to prevent null values
TEST_INPUTS = (
    df["description"]
    .dropna()
    .astype(str)
    .tolist()
)

print("Loaded samples:", len(TEST_INPUTS))
print("First sample:")
print(TEST_INPUTS[0])

Loaded samples: 50
First sample:
Recently renovated 3 bedroom flat in a great location.<br />5 mins walk to Scottish Parliament, Holyrood Palace and the Royal Mile, 10 mins to Princes Street.


In [16]:
import re

def clean_desc(s: str) -> str:
    s = re.sub(r"<br\s*/?>", "\n", s, flags=re.I)
    s = re.sub(r"\s+", " ", s).strip()
    return s

TEST_INPUTS = [clean_desc(x) for x in TEST_INPUTS]
print(TEST_INPUTS[0][:200])

Recently renovated 3 bedroom flat in a great location. 5 mins walk to Scottish Parliament, Holyrood Palace and the Royal Mile, 10 mins to Princes Street.


In [17]:
A_SCHEMA_REMINDER = """
You MUST output ONLY valid JSON.
Return a JSON list with exactly ONE object:
[{
  "specific_locations": [],
  "general_locations": [],
  "parent_locations": []
}]
All three fields must be JSON arrays (lists). If none, use empty lists.
No extra text outside the JSON.
"""

def build_main_prompt(description: str) -> str:
    instruction = instruction_template + "\n" + A_SCHEMA_REMINDER + "\nNow try with the following property description:\n" + description
    user_input = description
    response = ""
    return prompt_template.format(instruction, user_input, response)

print(build_main_prompt(TEST_INPUTS[0])[:600])

"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You will be given a short description from an Airbnb listing in Edinburgh. It may describe either:
- the property itself, or
- the surrounding neighbourhood.

Your task is to identify **places or locations described as being near** the property — for example, anything mentioned as a short walk away, close by, or nearby.

You should categorise all such places into three groups:

1. `specific_locations` — locations that


In [18]:
student_models = [
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
]

In [29]:
import unsloth
import json, torch
from unsloth import FastLanguageModel

REQUIRED_KEYS_A = {"specific_locations", "general_locations", "parent_locations"}

def try_extract_json(text: str):
    s_candidates = [i for i in (text.find("["), text.find("{")) if i != -1]
    if not s_candidates:
        return None
    s = min(s_candidates)
    e = max(text.rfind("]"), text.rfind("}"))
    if e <= s:
        return None
    cand = text[s:e+1]
    try:
        return json.loads(cand)
    except Exception:
        return None

def is_valid_A(obj):
    if not (isinstance(obj, list) and len(obj) == 1 and isinstance(obj[0], dict)):
        return False
    d = obj[0]
    if not REQUIRED_KEYS_A.issubset(d.keys()):
        return False
    return all(isinstance(d.get(k), list) for k in REQUIRED_KEYS_A)

@torch.no_grad()
def generate(model, tokenizer, prompt, max_new_tokens=256):
    toks = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2048
    ).to("cuda")

    out = model.generate(
        **toks,
        max_new_tokens=max_new_tokens,
        do_sample=False
    )

    gen_ids = out[0, toks["input_ids"].shape[-1]:]


    if gen_ids.numel() == 0:
        return ""

    return tokenizer.decode(gen_ids, skip_special_tokens=True)

@torch.no_grad()
def generate_chat(model, tokenizer, messages, max_new_tokens=256):
    enc = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_attention_mask=True
    ).to("cuda")

    if isinstance(enc, dict):
        input_ids = enc["input_ids"]
        attention_mask = enc.get("attention_mask", None)
    else:
        input_ids = enc
        attention_mask = None

    out = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

    gen_ids = out[0, input_ids.shape[-1]:]
    return "" if gen_ids.numel() == 0 else tokenizer.decode(gen_ids, skip_special_tokens=True)

A_FIX_INSTRUCTION = """
You are given a text that may contain location extraction results.
Convert it into VALID JSON with EXACTLY this structure:

Return a JSON list with ONE object:
[{
  "specific_locations": [],
  "general_locations": [],
  "parent_locations": []
}]

Rules:
- Output ONLY JSON, no explanation text.
- All three fields must exist.
- All values must be JSON arrays (lists).
- If a field is missing, use an empty list.
"""

def eval_model(model_name, descriptions, max_items=None):
    if max_items is not None:
        descriptions = descriptions[:max_items]

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=2048,
        dtype=None,
        load_in_4bit=True,
    )
    FastLanguageModel.for_inference(model)

    ok = 0
    rows = []

    for desc in descriptions:
        # -------------------------
        # Stage 1: Original Extraction
        # -------------------------
        messages_1 = [
          {"role": "system", "content": "You are a precise information extraction assistant. Follow the user's formatting requirements strictly."}
,
          {"role": "user", "content": build_main_prompt(desc)}
        ]
        raw1 = generate_chat(model, tokenizer, messages_1, max_new_tokens=256)

        # -------------------------
        # Stage 2: A-structure Fix
        # -------------------------
        messages_2 = [
          {"role": "system", "content": "You convert model outputs into strict JSON."},
          {"role": "user", "content": A_FIX_INSTRUCTION + "\n\nINPUT TEXT:\n" + raw1}
        ]
        raw2 = generate_chat(model, tokenizer, messages_2, max_new_tokens=256)

        obj = try_extract_json(raw2)
        valid = (obj is not None) and is_valid_A(obj)
        ok += int(valid)

        rows.append({
            "description": desc,
            "valid_A": valid,
            "stage1_raw": raw1,
            "stage2_fixed": raw2,   #Final output of structure A
            "parsed": obj,
        })

    del model, tokenizer
    torch.cuda.empty_cache()

    rate = ok / len(descriptions)
    return rate, rows


In [30]:
all_results = []
for m in student_models:
    print("\n" + "="*90)
    print("MODEL:", m)
    rate, rows = eval_model(m, TEST_INPUTS)
    print("A-structure valid rate:", rate)
    all_results.append({"model": m, "rate": rate, "rows": rows})

all_results


MODEL: unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit
==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
A-structure valid rate: 1.0

MODEL: unsloth/Qwen2.5-7B-Instruct-bnb-4bit
==((====))==  Unsloth 2026.1.4: Fast Qwen2 patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading b

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

A-structure valid rate: 1.0


[{'model': 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit',
  'rate': 1.0,
  'rows': [{'description': 'Recently renovated 3 bedroom flat in a great location. 5 mins walk to Scottish Parliament, Holyrood Palace and the Royal Mile, 10 mins to Princes Street.',
    'valid_A': True,
    'stage1_raw': '{\n  "specific_locations": ["Scottish Parliament", "Holyrood Palace", "The Royal Mile", "Princes Street"],\n  "general_locations": [],\n  "parent_locations": ["a great location"]\n}',
    'stage2_fixed': '[{\n  "specific_locations": ["Scottish Parliament", "Holyrood Palace", "The Royal Mile", "Princes Street"],\n  "general_locations": [],\n  "parent_locations": ["a great location"]\n}]',
    'parsed': [{'specific_locations': ['Scottish Parliament',
       'Holyrood Palace',
       'The Royal Mile',
       'Princes Street'],
      'general_locations': [],
      'parent_locations': ['a great location']}]},
   {'description': 'Double bedroom with ensuite bathroom and smaller double bedroom with la

In [31]:
import pandas as pd

summary_df = pd.DataFrame([
    {"model": r["model"], "A_valid_rate_after_fix": r["rate"]}
    for r in all_results
]).sort_values("A_valid_rate_after_fix", ascending=False)

summary_df

Unnamed: 0,model,A_valid_rate_after_fix
0,unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit,1.0
2,unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit,1.0
1,unsloth/Qwen2.5-7B-Instruct-bnb-4bit,0.98


In [32]:
def avg_counts(rows):
    spec, gen, parent = [], [], []
    for row in rows:
        obj = row["parsed"]
        if not obj:
            continue
        d = obj[0]
        spec.append(len(d.get("specific_locations", [])))
        gen.append(len(d.get("general_locations", [])))
        parent.append(len(d.get("parent_locations", [])))
    return (
        sum(spec)/len(spec),
        sum(gen)/len(gen),
        sum(parent)/len(parent),
    )

quality_rows = []
for r in all_results:
    s, g, p = avg_counts(r["rows"])
    quality_rows.append({
        "model": r["model"],
        "avg_specific": s,
        "avg_general": g,
        "avg_parent": p,
    })

pd.DataFrame(quality_rows)

Unnamed: 0,model,avg_specific,avg_general,avg_parent
0,unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit,2.18,2.2,0.48
1,unsloth/Qwen2.5-7B-Instruct-bnb-4bit,2.081633,1.714286,0.591837
2,unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit,1.3,1.3,0.64


I further try to analyze the content quality of extracted locations by measuring the average number of locations per category. Llama-3.1-8B extracts the highest number of specific (2.18) and general (2.20) locations per listing, indicating stronger recall in fine-grained and regional location identification. Qwen-2.5-7B shows comparable performance but extracts fewer general locations on average (1.71). In contrast, Mistral-Nemo-12B is more conservative, producing significantly fewer specific and general locations, while assigning parent locations more frequently. These results suggest a trade-off between extraction coverage and conservativeness across student models.

Based on the 50 samples' test, Llama-3.1-8B provides the best balance between structural reliability and extraction coverage for our project.