<a href="https://colab.research.google.com/github/KeerSG/Airbnb-Performace-Pridiction-using-LLM-derived-Spatial-Semantics/blob/main/SGKE_Test2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Environment setup for Colab. The %%capture cell magic above must stay on the
# first line of the cell; it hides the (long) pip output.
# Install Unsloth with its "colab-new" extra directly from the GitHub repository.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Install the remaining training libraries; --no-deps stops pip from pulling in
# their own dependencies on top of what the previous line installed.
# NOTE(review): the recorded run reports "Xformers = 0.0.34", so the
# "xformers<0.0.27" pin does not seem to have taken effect — confirm the pins.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [4]:
import json
import os

import torch
import pandas as pd
# Keep the unsloth import ahead of trl/transformers, as in the original order.
from unsloth import FastLanguageModel
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from tqdm.auto import tqdm

In [5]:
# Loader settings: context window, dtype and 4-bit quantisation flag.
# NOTE(review): dtype=None presumably lets Unsloth pick the precision for the
# GPU — confirm against the Unsloth documentation.
max_seq_length, dtype, load_in_4bit = 2048, None, True

# Alternative checkpoints to swap in when comparing models:
#   "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit"
#   "unsloth/Qwen2.5-7B-Instruct-bnb-4bit"

# Download (if needed) and load the pre-quantised base model and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.34. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [6]:
# Wrap the base model with LoRA adapters (rank 16, alpha 16, no dropout, no
# bias terms) on the attention projections and the MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=(
        ["q_proj", "k_proj", "v_proj", "o_proj"]      # attention projections
        + ["gate_proj", "up_proj", "down_proj"]       # MLP projections
    ),
    use_gradient_checkpointing="unsloth",
    random_state=3407,  # fixed seed for the adapter setup
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2026.1.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [8]:
# Define Instruction Template (from Instruction_template.txt)
# The instruction tells the model which three location categories to extract.
# It previously ended with "...in the following JSON format ...:" without ever
# showing that format, so an explicit skeleton (using the three category names
# defined in the text) is appended. The braces are safe: this string is only
# ever passed as a *value* to str.format, never used as the template itself.
system_instruction = """You will be given a short description from an Airbnb listing in Edinburgh.
It may describe either:
- the property itself, or
- the surrounding neighbourhood.
Your task is to identify **places or locations described as being near** the property — for example, anything mentioned as a short walk away, close by, or nearby.
You should categorise all such places into three groups:

1. `specific_locations` — locations that are **named and mappable**, such as landmarks, parks, venues, or neighbourhood features (e.g. "The Meadows", "Ocean Terminal").
2. `general_locations` — vague or generic references to places that **are not named or are too broad to geocode**, such as "train station", "shops", or "city centre".
3. `parent_locations` — the neighbourhood or area where the property is located (e.g. "Marchmont", "Leith").
Do not include these in the other two categories unless the location is explicitly described as a separate nearby place.
Correct minor spelling mistakes for named places (e.g. "Murayfield" -> "Murrayfield Stadium") so they match real map locations.
For `general_locations`, strip out descriptive words and articles (e.g. "the lively bars" -> "bars", "local shops" -> "shops").

If a vague reference clearly refers to a specific place, correct it (e.g. "the castle" -> "Edinburgh Castle").
But only do this when the reference is **unambiguous**. For example, "the station" may refer to multiple locations and should remain general unless disambiguated in the text.
If a location is mentioned more than once, list it only once in the appropriate category.
Return your output in the following JSON format, marked "<|startofjson|>" and "<|endofjson|>" before and after the JSON block:

<|startofjson|>
{
  "specific_locations": ["..."],
  "general_locations": ["..."],
  "parent_locations": ["..."]
}
<|endofjson|>"""

In [37]:
# Define Prompt format (from prompt_template.txt)
# Alpaca-style template with two *named* placeholders, {instruction} and
# {input_text}; fill it with keyword arguments to str.format. The template
# stops right after the "### Response:" header, so whatever follows that
# header in a decoded sequence is the model's answer.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input_text}

### Response:
"""

# End-of-sequence token string taken from the loaded tokenizer; appended to
# training texts by formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    """Turn a batch of instruction/input/output columns into training texts.

    Args:
        examples: Mapping with parallel sequences under the keys
            "instruction", "input" and "output" (one entry per example).

    Returns:
        A dict ``{"text": [...]}`` with one string per example: the filled
        ``alpaca_prompt``, followed by the expected output and ``EOS_TOKEN``.
    """
    texts = []
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    for instruction, input_text, output in rows:
        # The template uses named fields, so it must be filled by keyword;
        # positional arguments raise KeyError. It also has no slot for the
        # answer, so the output is appended after the "### Response:" header.
        prompt = alpaca_prompt.format(instruction=instruction, input_text=input_text)
        # EOS marks where the answer ends.
        texts.append(prompt + output + EOS_TOKEN)
    return {"text": texts}

In [25]:
# Read and process data
data = []
csv_filename = '50Sample.csv'

# Report up front whether the input file is present, so a missing upload is
# obvious here instead of surfacing as an exception when the file is read.
if os.path.exists(csv_filename):
    print(f"✅ Found file: {csv_filename}. Reading...")
else:
    print(f"❌ File not found: {csv_filename}. Please upload it before continuing.")

✅ Found file: 50Sample.csv. Reading...


In [28]:
# --- Smart Encoding Attempt ---
# Start from an empty frame so `df` is always defined, whatever goes wrong below.
df = pd.DataFrame()
try:
    # First, try standard UTF-8
    df = pd.read_csv(csv_filename, encoding='utf-8')
    print("-> Successfully read with UTF-8 encoding.")
except FileNotFoundError:
    # Previously uncaught: the cell crashed and left `df` undefined.
    print(f"-> ❌ File not found: {csv_filename}. Please upload the file.")
except UnicodeDecodeError:
    print("-> ⚠️ UTF-8 failed. Trying 'latin1' (ISO-8859-1) encoding...")
    try:
        # If UTF-8 fails, try Latin1 (common for Excel/Windows files).
        # NOTE(review): latin1 maps every byte to a character, so this read
        # cannot fail on decoding; Windows-1252 punctuation (curly quotes,
        # dashes) would come through as control characters — consider cp1252
        # if the file was exported from Excel.
        df = pd.read_csv(csv_filename, encoding='latin1')
        print("-> Successfully read with Latin1 encoding.")
    except Exception as e:
        # df keeps its empty default so later cells can test df.empty.
        print(f"-> ❌ All encoding attempts failed. Error: {e}")

-> ⚠️ UTF-8 failed. Trying 'latin1' (ISO-8859-1) encoding...
-> Successfully read with Latin1 encoding.


In [30]:
# Defaults, so both names exist for later cells even when loading failed.
input_col = 'description'
data_to_process = []

# First check that df exists and actually holds rows
if 'df' in locals() and not df.empty:
    # Limit to 50 rows (as requested)
    df = df.head(50)

    # Check for the correct column name.
    has_input_col = input_col in df.columns
    if not has_input_col:
        # Fallback: try to find a column that looks like input.
        # str(col) guards against non-string column labels.
        possible_cols = [
            col for col in df.columns
            if 'desc' in str(col).lower() or 'input' in str(col).lower()
        ]
        if possible_cols:
            input_col = possible_cols[0]
            has_input_col = True
            print(f"Warning: 'description' column not found. Using '{input_col}' instead.")

    if has_input_col:
        # Convert DataFrame to list of dicts for processing
        data_to_process = df.to_dict('records')
        print(f"Loaded {len(data_to_process)} rows for processing.")
    else:
        # Without a text column every prompt would be empty, so stop here
        # instead of running the model on nothing.
        print(f"ERROR: no description-like column in {csv_filename}. Columns found: {list(df.columns)}")

else:
    print(f"ERROR: {csv_filename} is missing or contains no rows! Please upload the file.")

Loaded 50 rows for processing.


In [None]:
# RUN INFERENCE (EXTRACTION)
# For every loaded row: build the prompt, generate a completion, and keep only
# the text the model produced after the prompt.
results = []

# Put the Unsloth-patched model into inference mode before generating.
FastLanguageModel.for_inference(model)

print("Starting extraction... this may take a few minutes.")

for i, row in enumerate(tqdm(data_to_process)):

    # Get the description text from the column resolved when the CSV was
    # loaded (not a hard-coded 'description'); missing cells (NaN) become an
    # empty string instead of the literal text "nan".
    raw_text = row.get(input_col, '')
    input_text = '' if pd.isna(raw_text) else str(raw_text)

    # 1. Format the prompt
    formatted_prompt = alpaca_prompt.format(
        instruction=system_instruction,
        input_text=input_text
    )

    # 2. Tokenize
    inputs = tokenizer([formatted_prompt], return_tensors = "pt").to("cuda")

    # 3. Generate Response (greedy decoding, so re-runs give the same output)
    outputs = model.generate(
        **inputs,
        max_new_tokens = 512,
        use_cache = True,
        do_sample = False,
    )

    # 4. Extract the "Response" part: the returned sequence starts with the
    #    prompt tokens, so slice them off rather than splitting on the
    #    "### Response:" text, and drop special tokens such as the EOS marker.
    prompt_length = inputs["input_ids"].shape[1]
    full_response = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens = True
    ).strip()

    # 5. Save the result
    results.append({
        "row_id": i,
        "input_description": input_text,
        "raw_model_output": full_response
    })

In [None]:
# SAVE RESULTS
# ------------------------------------------
# Write every extraction record to disk, then show the first one as a sanity check.
output_filename = "extraction_results.json"

# ensure_ascii=False keeps non-ASCII characters readable in the file.
with open(output_filename, mode='w', encoding='utf-8') as json_file:
    json.dump(results, json_file, ensure_ascii=False, indent=2)

print(
    f"\nProcessing Complete! Results saved to '{output_filename}'.",
    "-" * 30,
    "PREVIEW OF THE FIRST RESULT:",
    sep="\n",
)
if results:
    print(results[0]['raw_model_output'])