In [2]:
!pip -q install openpyxl pandas
import pandas as pd

excel_path = "/content/python_veri_bilimi_dataset_500_clean.xlsx"
df = pd.read_excel(excel_path)

df.head(), df.shape


(                                         instruction  input  \
 0     Why do we split data into train and test sets?    NaN   
 1  What is data leakage and how does it relate to...    NaN   
 2                              What is underfitting?    NaN   
 3               What are common data cleaning steps?    NaN   
 4           What is overfitting in machine learning?    NaN   
 
                                               output  
 0  We split data to evaluate how well a model gen...  
 1  We split data to evaluate how well a model gen...  
 2  Underfitting occurs when a model is too simple...  
 3  Common cleaning steps include handling missing...  
 4  Overfitting happens when a model learns the tr...  ,
 (500, 3))

In [3]:
csv_path = "/content/train.csv"
df.to_csv(csv_path, index=False, encoding="utf-8")
print("Kaydedildi:", csv_path)


Kaydedildi: /content/train.csv


In [4]:
import json

jsonl_path = "/content/train.jsonl"

with open(jsonl_path, "w", encoding="utf-8") as f:
    for _, row in df.iterrows():
        rec = {
            "instruction": str(row["instruction"]),
            "input": "" if pd.isna(row["input"]) else str(row["input"]),
            "output": str(row["output"])
        }
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

print("JSONL hazır:", jsonl_path)


JSONL hazır: /content/train.jsonl


In [5]:
from datasets import load_dataset

dataset = load_dataset(
    "json",
    data_files="/content/train.jsonl"
)["train"]

dataset


Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 500
})

In [6]:
def format_example(ex):
    return {
        "text": f"### Instruction:\n{ex['instruction']}\n\n### Response:\n{ex['output']}"
    }

dataset = dataset.map(format_example)
dataset[0]["text"]


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

'### Instruction:\nWhy do we split data into train and test sets?\n\n### Response:\nWe split data to evaluate how well a model generalizes to unseen examples. Common ratios are 80/20 or 70/30, but it depends on dataset size. Avoid data leakage by fitting preprocessing (like scaling) only on the training set and applying it to the test set. A good next step is to visualize the metric across different hyperparameters.'

In [7]:
!pip -q install -U transformers peft accelerate bitsandbytes datasets

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model

BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
OUT_DIR = "/content/finetuned_lora"

# 1) Tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 2) Base model
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.float16
)
base_model.config.use_cache = False

# 3) LoRA config
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)

model = get_peft_model(base_model, lora_config)
print("✅ LoRA eklendi")

# ✅ 4) Dataset'i tokenize et (dataset'te 'text' olmalı)
def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=512,
        padding="max_length"
    )

tokenized_dataset = dataset.map(tokenize_fn, batched=True, remove_columns=dataset.column_names)

# labels = input_ids (CausalLM)
tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]}, batched=True)

# 5) Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 6) Training args
training_args = TrainingArguments(
    output_dir="/content/out",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=2,
    fp16=True,
    logging_steps=10,
    save_strategy="no",
    report_to="none"
)

# 7) Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

trainer.train()

# 8) Kaydet (LoRA adapter dosyaları)
model.save_pretrained(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)

print("✅ Kaydedildi:", OUT_DIR)

# 9) Kontrol
import os
print("📁 OUT_DIR exists:", os.path.exists(OUT_DIR))
print("📄 Files:", os.listdir(OUT_DIR) if os.path.exists(OUT_DIR) else "NOT FOUND")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.6/47.6 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]



tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

✅ LoRA eklendi


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Step,Training Loss
10,2.436953
20,2.093279
30,1.720792
40,1.496414
50,1.371725
60,1.218081
70,1.094703
80,0.978465
90,0.917741
100,0.796331


✅ Kaydedildi: /content/finetuned_lora
📁 OUT_DIR exists: True
📄 Files: ['tokenizer_config.json', 'adapter_config.json', 'README.md', 'tokenizer.json', 'chat_template.jinja', 'adapter_model.safetensors']


In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
LORA_DIR = "/content/finetuned_lora"

tokenizer = AutoTokenizer.from_pretrained(LORA_DIR, use_fast=True)

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.float16
)

model = PeftModel.from_pretrained(base_model, LORA_DIR)
model = model.merge_and_unload()
model.eval()

prompt = "### Instruction:\nWhat is overfitting?\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

out = model.generate(**inputs, max_new_tokens=80)
print(tokenizer.decode(out[0], skip_special_tokens=True))


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

### Instruction:
What is overfitting?

### Response:
Overfitting occurs when a model learns the training data too well, making predictions that are similar to the training labels. It can lead to inaccurate predictions and worse generalization. Underfitting occurs when a model learns too few features, making predictions that are too simple. It can lead to misinterpretations and worse accuracy. A good rule of thumb is to check the


In [9]:
!pip -q install gradio


In [10]:
import re
import torch

SYSTEM = """You are a Python Data Science Assistant.
Rules:
- Answer ONLY in English.
- Keep it short and clear: 3–6 sentences.
- If helpful, add 1 mini example.
- Interpret abbreviations in data-science context (CNN = Convolutional Neural Network).
- Avoid unrelated meanings (e.g., CNN the news channel).
"""

def build_prompt(question: str) -> str:
    return f"""### System:
{SYSTEM}

### Instruction:
{question.strip()}

### Response:
"""

def clean_output(text: str) -> str:
    # Keep only the Response part
    if "### Response:" in text:
        text = text.split("### Response:", 1)[1]

    # If the model starts a new section, cut it
    for stop in ["### Instruction:", "### System:"]:
        if stop in text:
            text = text.split(stop, 1)[0]

    # Remove markdown images and URLs
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
    text = re.sub(r'https?://\S+', '', text)

    # Clean extra whitespace
    text = re.sub(r"\n{3,}", "\n\n", text).strip()
    return text

def cevapla(soru, max_new_tokens=180, temperature=0.2, top_p=0.95):
    prompt = build_prompt(soru)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            do_sample=False,  # ✅ most stable outputs
            temperature=float(temperature),
            top_p=float(top_p),
            repetition_penalty=1.15,
            no_repeat_ngram_size=3,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    return clean_output(decoded)


In [12]:
import gradio as gr

css = """
/* overall */
.gradio-container{
  max-width: 980px !important;
  margin: 0 auto !important;
  padding: 28px 18px !important;
  font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Arial !important;
}

/* HERO HEADER */
#hero{
  text-align: center;
  padding: 28px 12px 30px 12px;
  margin-bottom: 22px;
}
#hero-title{
  font-size: 34px;
  font-weight: 800;
  letter-spacing: -0.6px;
  margin-bottom: 10px;
  background: linear-gradient(90deg, #8b5cf6, #22d3ee);
  -webkit-background-clip: text;
  -webkit-text-fill-color: transparent;
}
#hero-subtitle{
  font-size: 15px;
  color: #b8c1cc;
  max-width: 680px;
  margin: 0 auto;
  line-height: 1.6;
}

/* cards */
.block, .form{
  border-radius: 14px !important;
}
.gr-box{
  border-radius: 14px !important;
  border: 1px solid rgba(128,128,128,0.18) !important;
  box-shadow: 0 6px 24px rgba(0,0,0,0.06) !important;
}

/* buttons */
button.primary{
  border-radius: 12px !important;
  font-weight: 600 !important;
}
button.secondary{
  border-radius: 12px !important;
  font-weight: 600 !important;
}

/* textbox sizing */
textarea{
  border-radius: 12px !important;
}
"""

with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    # ✅ gr.Markdown yerine gr.HTML kullan (sürüm uyumlu)
    gr.HTML(
        """
        <div id="hero">
            <div id="hero-title">Python Data Science Assistant</div>
            <div id="hero-subtitle">
                Ask a question and get a short, clear answer (3–6 sentences) focused on data science & machine learning concepts.
            </div>
        </div>
        """
    )

    with gr.Row(equal_height=True):
        with gr.Column(scale=5):
            question = gr.Textbox(
                label="Question",
                placeholder="Example: What is overfitting in machine learning?",
                lines=3
            )

            with gr.Accordion("Advanced generation settings", open=False):
                max_new_tokens = gr.Slider(32, 300, value=180, step=8, label="max_new_tokens")
                temperature   = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="temperature")
                top_p         = gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="top_p")

            with gr.Row():
                submit = gr.Button("Generate answer", variant="primary")
                clear  = gr.Button("Clear", variant="secondary")

            gr.Examples(
                examples=[
                    ["What is RMSE and when is it used?"],
                    ["Explain the difference between precision and recall."],
                    ["What is overfitting? How can I reduce it?"],
                    ["What is a Convolutional Neural Network (CNN) in ML?"],
                ],
                inputs=[question],
                label="Quick examples"
            )

        with gr.Column(scale=7):
            answer = gr.Textbox(
                label="Model Answer",
                lines=12,
                show_copy_button=True
            )

    submit.click(
        fn=cevapla,
        inputs=[question, max_new_tokens, temperature, top_p],
        outputs=[answer]
    )
    clear.click(
        fn=lambda: ("", 180, 0.2, 0.95, ""),
        inputs=None,
        outputs=[question, max_new_tokens, temperature, top_p, answer]
    )

demo.launch(share=True)


  with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
  with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ac5cdd73bebc0d6faa.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


