In [None]:
# Install every third-party package used by the notebook and the Streamlit app.
# A single %pip call replaces the overlapping !pip calls (sentence-transformers
# and faiss-cpu were listed twice); %pip targets the running kernel's environment.
%pip install -q pandas tqdm sentence-transformers faiss-cpu streamlit transformers pyngrok

In [None]:
# Write the name and version of every package currently installed in this
# runtime to requirements.txt.
!pip freeze > requirements.txt

In [None]:
# Colab helper module for moving files between the runtime and the browser.
from google.colab import files
# Send requirements.txt from the Colab runtime to the local machine.
files.download("requirements.txt")

In [None]:
# Install the Kaggle CLI that the next cell uses to download the dataset.
# pandas and tqdm are already installed by the first cell; listing them again is harmless.
!pip install kaggle pandas tqdm

In [None]:
import pandas as pd
import json
from tqdm import tqdm
from google.colab import files

# Upload your Kaggle API key
print("üìÅ Please upload your kaggle.json API key file:")
files.upload()  # Opens the Colab upload dialog; the copy below expects a file named kaggle.json

# Set up Kaggle API: place the key under ~/.kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the key file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

# Download arXiv dataset from Kaggle
!kaggle datasets download -d Cornell-University/arxiv
# Extract the archive, overwriting files left over from an earlier run (-o).
!unzip -o arxiv.zip

# Define path to the JSON file
# (presumably the file extracted from arxiv.zip above — verify the name after unzipping)
json_path = "arxiv-metadata-oai-snapshot.json"

# Parse JSON line-by-line for performance
def load_json_lines(path, max_lines=None):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Path to a text file holding one JSON document per line.
        max_lines: Maximum number of lines to read from the start of the
            file. ``None`` (the default) reads the whole file; ``0`` reads
            nothing.

    Returns:
        A list with one parsed object per non-blank line that was read.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Decode explicitly as UTF-8 instead of relying on the platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            # Compare against None so an explicit limit of 0 is honoured
            # (a plain truthiness check would treat 0 as "no limit").
            if max_lines is not None and i >= max_lines:
                break
            # Skip blank lines instead of crashing in json.loads.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data

# Load the snapshot. No max_lines limit is passed, so every record in the file
# is parsed into memory; pass max_lines=N to load_json_lines for a quick prototype.
print("‚è≥ Loading JSON file (this may take 2‚Äì5 minutes)...")
data = load_json_lines(json_path)
# Build a DataFrame from the list of parsed records.
df = pd.DataFrame(data)

# Filter for Computer Science papers.
# The Kaggle arXiv metadata stores `categories` as a space-separated string
# (e.g. "cs.LG stat.ML"). Match "cs." only at the start of a category token:
# the bare pattern 'cs.' treats "." as a regex wildcard and also matches the
# tail of "physics.*" categories. na=False makes missing values non-matches.
is_cs = df['categories'].str.contains(r'(?:^|\s)cs\.', regex=True, na=False)
df_cs = df.loc[is_cs, ['id', 'title', 'abstract', 'categories', 'update_date']].copy()

# Clean the abstract and title text
def clean_text(text):
    """Replace line breaks in ``text`` with spaces and trim surrounding whitespace.

    Any value that is not a ``str`` (for example ``None`` or a float) yields "".
    """
    if not isinstance(text, str):
        return ""
    flattened = text.replace('\n', ' ').replace('\r', ' ')
    return flattened.strip()

# Normalise whitespace in the text columns; clean_text maps non-string values to "".
df_cs['title'] = df_cs['title'].apply(clean_text)
df_cs['abstract'] = df_cs['abstract'].apply(clean_text)

# Drop papers with empty abstracts.
# After clean_text a missing abstract is "" rather than NaN, so dropna() would
# never remove anything; filter on the empty string instead.
df_cs = df_cs[df_cs['abstract'] != ""].copy()

# Save cleaned dataset as CSV without the DataFrame index column,
# then report how many rows were written.
csv_path = "cs_arxiv_cleaned.csv"
df_cs.to_csv(csv_path, index=False)
print(f" Done! Saved {len(df_cs)} Computer Science papers to `{csv_path}`.")

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle

# Load data.
# Read every column as text: with dtype inference, arXiv ids such as
# "0704.0001" are parsed as floats and lose their leading/trailing zeros.
# keep_default_na=False keeps empty cells as "" so the concatenation below
# never produces NaN entries in `texts`.
df = pd.read_csv("cs_arxiv_cleaned.csv", dtype=str, keep_default_na=False)
# One "title: abstract" string per paper; ids is kept in the same row order.
texts = (df['title'] + ": " + df['abstract']).tolist()
ids = df['id'].tolist()

# Embed with SBERT: encode every entry of `texts` and return the vectors as a
# NumPy array (convert_to_numpy=True), showing a progress bar while encoding.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

# Create FAISS index: a flat L2 index whose dimensionality is the embedding
# width (embeddings.shape[1]). Vectors are added in the order of `texts`, so
# index row i corresponds to texts[i] and ids[i].
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Persist the FAISS index, then pickle the id and text lists that share its row order.
faiss.write_index(index, "cs_arxiv_index.faiss")
for pkl_name, payload in (("cs_arxiv_ids.pkl", ids), ("cs_arxiv_texts.pkl", texts)):
    with open(pkl_name, "wb") as pkl_file:
        pickle.dump(payload, pkl_file)

print("Saved FAISS index and associated metadata")

In [None]:
%%writefile main.py
# Streamlit app: semantic search over the arXiv CS index with on-demand summaries.
# This cell is written to main.py instead of being executed in the kernel,
# because the launch cell at the end of the notebook runs `streamlit run main.py`.
import pickle

import faiss
import numpy as np
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import pipeline

TOP_K = 5                 # number of nearest papers shown per query
MAX_SUMMARY_CHARS = 1024  # cap on the text handed to the summarizer

# Issue the page config before any other Streamlit call.
st.set_page_config(page_title="🧠 arXiv CS Chatbot", layout="wide")


@st.cache_resource
def load_store():
    """Load the FAISS index and the row-aligned text/id lists.

    Cached so the files are read once per server process rather than on
    every script rerun (Streamlit reruns the script on each interaction).
    """
    index = faiss.read_index("cs_arxiv_index.faiss")
    # NOTE: pickle.load can execute arbitrary code; only load files produced
    # by this notebook.
    with open("cs_arxiv_texts.pkl", "rb") as f:
        texts = pickle.load(f)
    with open("cs_arxiv_ids.pkl", "rb") as f:
        ids = pickle.load(f)
    return index, texts, ids


@st.cache_resource
def load_models():
    """Load the sentence embedder and the summarization pipeline once per server process."""
    embedder = SentenceTransformer("all-MiniLM-L6-v2")
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    return embedder, summarizer


# Load vector store and metadata
index, texts, ids = load_store()

# Load embedder and summarizer
embedder, summarizer = load_models()

st.title("🧠 arXiv Computer Science Chatbot")

query = st.text_input("Enter your research question or paper topic")

if query:
    # FAISS expects a 2-D float32 array of query vectors.
    q_emb = embedder.encode([query], convert_to_numpy=True)
    D, I = index.search(np.asarray(q_emb, dtype="float32"), TOP_K)

    for idx in I[0]:
        # FAISS pads the result with -1 when the index holds fewer than TOP_K
        # vectors; a negative value would silently index from the end of `texts`.
        if idx < 0:
            continue
        idx = int(idx)

        st.subheader("🔍 Related Paper")
        st.write(texts[idx])

        # The row number is the widget key, so each result gets a distinct button.
        if st.button(f"Summarize Paper {ids[idx]}", key=str(idx)):
            summary = summarizer(texts[idx][:MAX_SUMMARY_CHARS])[0]['summary_text']
            st.success("📄 Summary:")
            st.write(summary)

In [None]:
# Stop any Streamlit server and ngrok process left over from a previous run.
!pkill -f streamlit || echo "No Streamlit process to kill"
import os  # imported here: this cell runs before the later cell that imports os
from getpass import getpass

from pyngrok import ngrok
ngrok.kill()

# Read the token without echoing it, so the secret is not stored in the cell output.
NGROK_TOKEN = getpass("Enter your ngrok authtoken: ").strip()
os.environ["NGROK_AUTHTOKEN"] = NGROK_TOKEN

# Register the token through pyngrok's API rather than interpolating the
# secret into a shell command line.
ngrok.set_auth_token(NGROK_TOKEN)

In [None]:
from pyngrok import ngrok
import threading
import os

# Open an ngrok tunnel to local port 8501 and show the value returned by
# ngrok.connect. Note the tunnel is opened before the Streamlit thread is
# started further down in this cell.
public_url = ngrok.connect(8501)
print(f"üåê Streamlit app running at: {public_url}")

def run():
    """Start the Streamlit server for main.py and block until it exits.

    Intended as the target of a background thread. The port is pinned to 8501
    because that is the local port the ngrok tunnel in this cell forwards to.
    """
    import subprocess  # local import: not among the imports at the top of this cell

    # Without an explicit port Streamlit moves to the next free port when 8501
    # is taken, which would leave the tunnel pointing at nothing. Headless mode
    # avoids the interactive first-run prompt and the browser auto-open.
    exit_code = subprocess.call([
        "streamlit", "run", "main.py",
        "--server.port", "8501",
        "--server.headless", "true",
    ])
    # Surface failures (e.g. main.py missing) instead of discarding the status.
    if exit_code != 0:
        print(f"streamlit exited with code {exit_code}")

# run() blocks for as long as Streamlit is alive, so execute it on a separate
# thread and let the cell return.
thread = threading.Thread(target=run)
thread.start()
