<a href="https://colab.research.google.com/github/MARC27-Internet-Private-Limited/MXene-LLM/blob/main/MXene.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<div style="text-align: center;">
  <img src="https://research.marc27.com/_app/immutable/assets/logo_marc27.B__kGcan.svg"
       alt="Our logo" width="50">
  <h1>P.R.I.S.M.</h1>
  <h3><u>P</u>latform for <u>R</u>esearch in <u>I</u>ntelligent <u>S</u>ynthesis of <u>M</u>Xenes</h3>
</div>




---


Welcome to the **P.R.I.S.M.** notebook for our MXene research project. This notebook sets up a data pipeline that:

- **Scrapes and Aggregates Research Data:** Automatically fetches relevant academic articles and patents related to MXene synthesis.
- **Trains a Small LLM:** Uses supervised fine-tuning (SFT) and reinforcement learning from human feedback (RLHF) to develop a model that learns to identify the best data. Here we're using: ```deepseek-ai/DeepSeek-R1-Distill-Qwen-7B```


- **Populates a Structured Database:** Organizes the curated data for downstream use in training advanced AI models for material discovery.

Running on ```Google Colab```, this notebook bridges raw research with AI-driven insights in a smarter way.


---



---

To keep the platform hardware-agnostic, we'll be using our GitHub repo for the project:
```https://github.com/MARC27-Internet-Private-Limited/MXene-LLM.git```

Please generate a GitHub Personal Access Token that grants write access to the repository's files.

---

In [None]:
import os
import sys
import subprocess
from getpass import getpass

# Setup logging: every cell reports progress through this module-level logger,
# using timestamped records at INFO level and above.
import logging

LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

# Install Git if not present
if 'kaggle' in sys.modules:
    !pip install gitpython --quiet
else:
    try:
        import git
    except ImportError:
        !pip install gitpython --quiet

# Network check
def check_network(timeout=5.0):
    """Return True when github.com accepts a TCP connection on port 443.

    The clone that follows talks HTTPS to GitHub, so an HTTPS-port connection
    is the relevant reachability test. It also avoids depending on an external
    `ping` binary: a missing binary raises FileNotFoundError from
    subprocess.run, which a CalledProcessError handler does not catch.

    Args:
        timeout: Seconds to wait for the connection before giving up.

    Returns:
        True if the connection was established, False on any network error
        (DNS failure, refused connection, timeout).
    """
    import socket

    try:
        # The socket is closed as soon as the connection has been proven.
        with socket.create_connection(("github.com", 443), timeout=timeout):
            pass
    except OSError:
        # socket.gaierror and socket.timeout are both OSError subclasses.
        logger.error("Network check: GitHub not reachable")
        return False
    logger.info("Network check: GitHub reachable")
    return True

# GitHub setup: clone the project repository (once) and make it the working directory.
from urllib.parse import quote

repo_url = "https://github.com/MARC27-Internet-Private-Limited/MXene-LLM.git"

# This cell ends with os.chdir(repo_dir). On a re-run the current directory is
# therefore already the clone; reuse it instead of nesting a second clone in it.
_cwd = os.getcwd()
if os.path.basename(_cwd) == "mxene_project" and os.path.isdir(os.path.join(_cwd, ".git")):
    repo_dir = _cwd
else:
    repo_dir = os.path.join(_cwd, "mxene_project")

# Require the .git directory: a bare folder is not evidence of a usable clone.
if not os.path.isdir(os.path.join(repo_dir, ".git")):
    # Token lookup order: environment variable, Kaggle secret, interactive prompt.
    token = os.environ.get('GITHUB_TOKEN')
    if not token:
        try:
            from kaggle_secrets import UserSecretsClient
            token = UserSecretsClient().get_secret("GITHUB_TOKEN")
        except Exception:
            # Best effort: kaggle_secrets is absent outside Kaggle, and the
            # lookup presumably fails when the secret is not configured.
            token = None
    if not token:
        token = getpass("Enter GitHub token: ")
    token = (token or "").strip()

    logger.info("GitHub token provided" if token else "No GitHub token")
    if token:
        # Percent-encode the token so it cannot break the URL syntax.
        encoded_token = quote(token, safe="")
        auth_url = repo_url.replace("https://", f"https://{encoded_token}:x-oauth-basic@", 1)
    else:
        # No credentials: fall back to the plain URL rather than a malformed one.
        auth_url = repo_url

    if check_network():
        # Import the exception class directly; the name `git` is not
        # guaranteed to be bound at this point.
        from git import GitCommandError, Repo
        try:
            Repo.clone_from(auth_url, repo_dir, progress=None, verbose=True)
            logger.info(f"Cloned repo to {repo_dir}")
        except GitCommandError as e:
            # Keep the secret out of the notebook output.
            clone_error = str(e).replace(token, "***") if token else str(e)
            logger.error(f"Clone failed: {clone_error}")
            raise
    else:
        logger.error("Skipping clone—network unavailable")
        raise ConnectionError("Network failure—retry later")
else:
    logger.info(f"Repo already exists at {repo_dir}")

os.chdir(repo_dir)
print(f"Working in {repo_dir}")

# GPU check: report whether CUDA is usable and, if so, which device is at index 0.
import torch

cuda_ready = torch.cuda.is_available()
print("GPU available:", cuda_ready)
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")

KeyboardInterrupt: Interrupted by user

In [None]:
packages = "transformers datasets accelerate trl bitsandbytes pandas peft gradio gitpython tensorboard"
try:
    import transformers, datasets, trl, bitsandbytes, pandas, peft, gradio, git, torch
except ImportError:
    !pip install {packages} --quiet

Scraping arXiv...
arXiv scraped: 30 abstracts
Scraping Materials Project...


Retrieving SummaryDoc documents:   0%|          | 0/33 [00:00<?, ?it/s]

Materials Project scraped: 30 abstracts
Scraping PubMed...
PubMed scraped: 22 abstracts
Scraped abstracts - MXene/MAX dataset ready


In [None]:
import logging
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, PPOTrainer, PPOConfig
import pandas as pd
import gradio as gr
from git import Repo
import torch

# Same logging configuration as the setup cell, repeated so this cell can be
# run on its own: timestamped records at INFO level and above.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
# Load the base model and its tokenizer. With CUDA the weights are loaded
# 4-bit quantized and placed automatically; without it the library defaults apply.
logger.info("Loading model...")
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

if torch.cuda.is_available():
    device = "cuda"
    bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype="float16")
    placement = "auto"
else:
    device = "cpu"
    bnb_config = None
    placement = None

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map=placement,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
logger.info(f"Model loaded on {device}")

In [None]:
# Read the abstracts table from the repository working directory.
logger.info("Loading abstracts...")
abstracts_path = 'abstracts.csv'
abstracts_df = pd.read_csv(abstracts_path)
abstract_count = len(abstracts_df)
logger.info(f"Loaded {abstract_count} abstracts from abstracts.csv")

In [None]:
logger.info("Preparing initial training data...")
# Hand-written seed examples: free-text abstract -> JSON-formatted label.
data = [
    {"input": "Ti₃AlC₂ synthesized with HF etching, a=3.07Å, c=18.5Å.", "output": '{"category": "properties", "composition": "Ti3AlC2", "lattice_a": "3.07Å", "lattice_c": "18.5Å", "etchant": "HF"}'},
    {"input": "Cr₂GaN manufacturing via sputtering.", "output": '{"category": "manufacturing", "composition": "Cr2GaN"}'},
    {"input": "Nb₄C₃Tₓ tested for supercapacitors.", "output": '{"category": "testing", "mxene": "Nb4C3Tx"}'},
    {"input": "Ti₃C₂Tₓ tested for supercapacitors.", "output": '{"category": "testing", "mxene": "Ti3C2Tx"}'},
    {"input": "Ti₂AlC - lattice a=3.057Å", "output": '{"category": "properties", "composition": "Ti2AlC", "lattice_a": "3.057Å"}'}
]


def merge_seed_examples(csv_path, seed_rows):
    """Combine the rows already stored in ``csv_path`` with the seed examples.

    The feedback UI appends rows to the same CSV and pushes it to GitHub, so
    the file found in a fresh clone can hold more than the seeds. Rewriting it
    from the seeds alone would discard that feedback on every run.

    Args:
        csv_path: Path of the training CSV (columns ``input`` and ``output``).
        seed_rows: List of ``{"input": ..., "output": ...}`` dicts.

    Returns:
        A DataFrame holding the existing rows first, followed by any seed row
        that is not already present. Just the seeds when the file is missing
        or empty. Nothing is written to disk.
    """
    seed_df = pd.DataFrame(seed_rows, columns=["input", "output"])
    if not os.path.exists(csv_path):
        return seed_df
    try:
        existing_df = pd.read_csv(csv_path)
    except pd.errors.EmptyDataError:
        # A zero-byte file carries no rows to preserve.
        return seed_df
    merged_df = pd.concat([existing_df, seed_df], ignore_index=True)
    # keep="first" retains the stored copy of a row that is also a seed.
    return merged_df.drop_duplicates(subset=["input", "output"], keep="first").reset_index(drop=True)


df = merge_seed_examples('finetunedata.csv', data)
df.to_csv('finetunedata.csv', index=False)
logger.info(f"Training data saved - {len(df)} entries")

In [None]:
# Wrap the base model with LoRA adapters on the attention query/value projections.
logger.info("Setting up LoRA adapters...")
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
# NOTE: rebinding `model` means re-running this cell wraps an already wrapped model.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
logger.info("LoRA adapters added")

In [None]:
# Load the fine-tuning examples from the CSV in the working directory.
logger.info("Starting SFT...")
sft_csv = 'finetunedata.csv'
dataset = load_dataset(path='csv', data_files=sft_csv)
def tokenize(examples, max_prompt_length=512, max_completion_length=128):
    """Turn a batch of input/output pairs into causal-LM training features.

    A causal LM is trained on one sequence whose labels line up
    position-by-position with ``input_ids``. Each example therefore becomes
    ``prompt + completion + EOS``, right-padded to a fixed length. Prompt and
    padding positions carry the label -100, the index that the cross-entropy
    loss skips by default, so only the completion is scored.

    Args:
        examples: Batch mapping with ``"input"`` and ``"output"`` lists of str.
        max_prompt_length: Token budget for the prompt (longer ones are truncated).
        max_completion_length: Token budget for the completion, including the
            trailing EOS token.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``; every row
        has length ``max_prompt_length + max_completion_length``.
    """
    logger.info("Tokenizing batch...")
    eos_id = tokenizer.eos_token_id
    # Some tokenizers define no pad token; EOS is a safe filler because padded
    # positions are masked out of both attention and loss.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id
    total_length = max_prompt_length + max_completion_length

    prompt_ids = tokenizer(
        examples["input"], add_special_tokens=True, truncation=True, max_length=max_prompt_length
    )["input_ids"]
    # No special tokens here: the completion continues the prompt sequence.
    # One slot is reserved for the EOS appended below.
    completion_ids = tokenizer(
        examples["output"], add_special_tokens=False, truncation=True, max_length=max_completion_length - 1
    )["input_ids"]

    features = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt, completion in zip(prompt_ids, completion_ids):
        # Slice defensively in case the tokenizer ignored the truncation request.
        prompt = list(prompt)[:max_prompt_length]
        completion = list(completion)[:max_completion_length - 1] + [eos_id]
        sequence = prompt + completion
        padding = total_length - len(sequence)
        features["input_ids"].append(sequence + [pad_id] * padding)
        features["attention_mask"].append([1] * len(sequence) + [0] * padding)
        features["labels"].append([-100] * len(prompt) + completion + [-100] * padding)
    return features
# Tokenize the whole dataset in batches, then run supervised fine-tuning.
tokenized_dataset = dataset.map(tokenize, batched=True)

# Mixed precision is only requested when a CUDA device is present.
use_fp16 = torch.cuda.is_available()
sft_settings = dict(
    output_dir='model_sft',
    per_device_train_batch_size=1,
    num_train_epochs=3,
    save_steps=50,
    logging_steps=5,
    fp16=use_fp16,
    report_to="tensorboard",
    logging_dir='runs',
)
training_args = TrainingArguments(**sft_settings)

train_split = tokenized_dataset["train"]
trainer = SFTTrainer(model=model, args=training_args, train_dataset=train_split)
trainer.train()
logger.info("Initial SFT complete")

In [None]:
# Persist the fine-tuned adapter and tokenizer, then publish the working tree.
logger.info("Saving model and logs...")
model.save_pretrained('model_sft_final')
tokenizer.save_pretrained('model_sft_final')

repo = Repo('.')

# `git commit` aborts when no author identity is configured, which is the
# usual state of a fresh hosted runtime. Fill in repository-local placeholders
# only for the options that are missing; an existing identity is left alone.
_identity_fallback = {
    "name": "MXene-LLM notebook",
    "email": "mxene-llm-notebook@users.noreply.github.com",
}
_git_config = repo.config_reader()
for _option, _fallback in _identity_fallback.items():
    if not _git_config.has_option("user", _option):
        with repo.config_writer() as _writer:
            _writer.set_value("user", _option, _fallback)

repo.git.add(all=True)
# `git commit` exits non-zero when nothing is staged (e.g. on a re-run),
# so commit only if the index differs from HEAD.
if repo.is_dirty(index=True, working_tree=False, untracked_files=False):
    repo.git.commit(m="Saved initial SFT model and TensorBoard logs")
else:
    logger.info("Nothing new to commit")
# Push regardless: it is a no-op when the remote is already up to date.
repo.git.push()
logger.info("Model and logs pushed to GitHub")

In [None]:
def process_and_refine(input_text, user_output):
    """Generate a prediction for an abstract and optionally record feedback.

    Args:
        input_text: Abstract text used as the generation prompt.
        user_output: Corrected label supplied by the user. Empty,
            whitespace-only or None means "prediction only".

    Returns:
        Tuple ``(predicted, status)``. ``predicted`` is the generated
        continuation, or ``"Error: ..."`` when generation failed. ``status``
        says whether feedback was stored.

    Side effects:
        With feedback, appends a row to ``finetunedata.csv`` and commits and
        pushes that file through the module-level ``repo``.
    """
    logger.info(f"Input received: {input_text}")
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    logger.info("Tokenization complete")
    try:
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            pad_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
            # `max_time` is generate()'s wall-clock budget in seconds.
            # An unknown keyword such as `timeout` is rejected, which sent
            # every request into the error path below.
            max_time=5
        )
        # generate() returns prompt + continuation; decode only the new tokens
        # so the prediction does not echo the input.
        prompt_length = inputs["input_ids"].shape[1]
        predicted = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        logger.info(f"Prediction generated: {predicted}")
    except Exception as e:
        predicted = f"Error: {str(e)}"
        logger.error(f"Generation failed: {e}")

    # Treat None and whitespace-only text as "no feedback".
    feedback = (user_output or "").strip()
    if not feedback:
        return predicted, "Prediction only"

    logger.info(f"Adding feedback: {feedback}")
    new_data = pd.DataFrame([{"input": input_text, "output": feedback}])
    existing_data = pd.read_csv('finetunedata.csv')
    updated_data = pd.concat([existing_data, new_data], ignore_index=True)
    updated_data.to_csv('finetunedata.csv', index=False)
    try:
        repo.git.add('finetunedata.csv')
        repo.git.commit(m=f"Added feedback - {len(updated_data)} entries")
        repo.git.push()
    except Exception as e:
        # The row is already on disk; report the sync failure without losing
        # the prediction. Details stay in the log only, because git error
        # text can contain the remote URL.
        logger.error(f"Feedback saved locally but git sync failed: {e}")
        return predicted, f"Feedback saved locally - {len(updated_data)} entries (git sync failed, see log)"
    logger.info(f"Feedback saved and pushed: {len(updated_data)} entries")
    return predicted, f"Feedback added - {len(updated_data)} entries"

# Build the two-field feedback UI around process_and_refine and start it.
logger.info("Launching Gradio UI...")

abstract_box = gr.Textbox(lines=2, placeholder="Enter abstract (e.g., Ti₃AlC₂ synthesized, a=3.07Å)...")
feedback_box = gr.Textbox(lines=2, placeholder="Correct output (e.g., {'category': 'properties', ...}) or blank")
prediction_box = gr.Textbox(label="Prediction")
status_box = gr.Textbox(label="Status")

interface = gr.Interface(
    fn=process_and_refine,
    inputs=[abstract_box, feedback_box],
    outputs=[prediction_box, status_box],
    title="MXene/MAX Initial Classifier",
    description="Test predictions, add feedback—grows finetunedata.csv in GitHub."
)

# Bind to all interfaces unless the `kaggle` module has been imported in this session.
if 'kaggle' in sys.modules:
    bind_address = None
else:
    bind_address = "0.0.0.0"
interface.launch(share=True, server_name=bind_address)

In [None]:
# logger.info("Starting RL (stub - run later)...")
# ppo_config = PPOConfig(model_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", learning_rate=1e-5, batch_size=1)
# ppo_trainer = PPOTrainer(model=model, config=ppo_config, tokenizer=tokenizer, dataset=tokenized_dataset["train"])
#
# def reward_func(pred, true):
#     return 1.0 if pred == true else -0.5
#
# for epoch in range(5):
#     for batch in tokenized_dataset["train"]:
#         inputs = tokenizer(batch["input"], return_tensors="pt").to(device)
#         outputs = model.generate(**inputs, max_new_tokens=50)
#         pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
#         reward = reward_func(pred, batch["output"])
#         ppo_trainer.step([inputs["input_ids"][0]], [outputs[0]], [reward])
# logger.info("RL stub complete - expand in Notebook 3")

In [None]:
# logger.info("Merging with MXene-DB (stub - run in Notebook 2)...")
# mxene_db_url = "https://raw.githubusercontent.com/diegonti/MXene-DB/main/mxene_db.csv"
# mxene_db = pd.read_csv(mxene_db_url)
# mxene_subset = mxene_db[['full_name', 'a', 'Eg_PBE']].rename(columns={'full_name': 'composition', 'a': 'lattice_a', 'Eg_PBE': 'bandgap'})
# abstracts_df = pd.read_csv('abstracts.csv')
# combined_df = pd.concat([abstracts_df, mxene_subset], ignore_index=True)
# combined_df.to_csv('combined_data.csv', index=False)
# repo.git.add('combined_data.csv')
# repo.git.commit(m="Merged with MXene-DB")
# repo.git.push()
# logger.info("MXene-DB merged - stub complete")