In [25]:
!pip install -q langchain langchain-community
!pip install -q torch transformers accelerate bitsandbytes sentence-transformers unstructured[all-docs] langchain chromadb langchain_community --use-deprecated=legacy-resolver
# citations https://huggingface.co/learn/cookbook/en/rag_zephyr_langchain, https://huggingface.co/learn/cookbook/en/rag_with_unstructured_data

Dataset Values Meaning: 


    kepid: Kepler ID of the star, a unique identifier assigned by the Kepler mission.

    kepoi_name: Kepler Object of Interest (KOI) name. This is the identifier for an exoplanet candidate observed by Kepler.

    kepler_name: Name of the confirmed exoplanet, if the candidate has been confirmed. If not confirmed, this may be blank.

    koi_disposition: Final disposition of the KOI. Possible values:
        CONFIRMED: Confirmed exoplanet.
        CANDIDATE: Potential exoplanet, not yet confirmed.
        FALSE POSITIVE: Identified as a false positive and not an exoplanet.

    koi_pdisposition: Pipeline disposition, which is the preliminary disposition assigned by the Kepler pipeline.

    koi_score: Score representing the probability that the KOI is a real planet candidate, ranging from 0 to 1.

    koi_fpflag_nt: Flag for not-transit-like false positive. If 1, the object is likely a non-transit false positive.

    koi_fpflag_ss: Flag for stellar eclipse false positive. If 1, the signal is likely caused by an eclipsing binary star rather than by a planet.

    koi_fpflag_co: Flag for centroid offset false positive. If 1, the detected signal is offset from the target star, suggesting it comes from a nearby source.
    
    koi_fpflag_ec: Flag for ephemeris match with a known eclipsing binary, indicating the KOI signal may match that of a known binary.

    koi_steff_err2: Lower error bound for the stellar effective temperature (koi_steff), indicating uncertainty in the temperature value.
    
    koi_slogg: Base-10 logarithm of the stellar surface gravity (log g, with g in cm/s²), describing the gravitational acceleration at the star’s surface.
    
    koi_slogg_err1: Upper error bound for stellar surface gravity.
    koi_slogg_err2: Lower error bound for stellar surface gravity.
    koi_srad: Stellar radius, the radius of the host star in solar radii.
    koi_srad_err1: Upper error bound for stellar radius.
    koi_srad_err2: Lower error bound for stellar radius.
    ra: Right Ascension of the star in degrees, indicating its position on the celestial sphere.

    dec: Declination of the star in degrees, representing the position on the celestial sphere relative to the celestial equator.

    koi_kepmag: Kepler-band magnitude of the star, which is the brightness of the star as observed by the Kepler telescope.

Using this sample dataset, we attempt to build a retrieval-augmented generation (RAG) pipeline that layers this data on top of a Hugging Face model used through LangChain (other LLMs could be swapped in; so far only the Hugging Face one has been tried), so that it can answer a user's questions about these exoplanets.

In [26]:
# Knowledge source for the RAG: the NASA exoplanet dataset, downloaded from
# https://www.kaggle.com/datasets/arashnic/exoplanets?resource=download
import pandas as pd
import numpy as np

# Read the CSV into a DataFrame (exoplanets are planets outside of the solar system)
# and display it as the cell output.
df = pd.read_csv("exoplanets.csv")
df

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.000,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.000,0,0,0,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.000,0,1,0,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.000,0,0,0,0,...,-211.0,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,10090151,K07985.01,,FALSE POSITIVE,FALSE POSITIVE,0.000,0,1,1,0,...,-166.0,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
9560,10128825,K07986.01,,CANDIDATE,CANDIDATE,0.497,0,0,0,0,...,-220.0,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
9561,10147276,K07987.01,,FALSE POSITIVE,FALSE POSITIVE,0.021,0,0,1,0,...,-236.0,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385
9562,10155286,K07988.01,,CANDIDATE,CANDIDATE,0.092,0,0,0,0,...,-128.0,2.992,0.030,-0.027,7.824,0.223,-1.896,296.76288,47.145142,10.998


In [27]:
# Count the missing values in each column.
df.isna().sum()

kepid                   0
kepoi_name              0
kepler_name          7205
koi_disposition         0
koi_pdisposition        0
koi_score            1510
koi_fpflag_nt           0
koi_fpflag_ss           0
koi_fpflag_co           0
koi_fpflag_ec           0
koi_period              0
koi_period_err1       454
koi_period_err2       454
koi_time0bk             0
koi_time0bk_err1      454
koi_time0bk_err2      454
koi_impact            363
koi_impact_err1       454
koi_impact_err2       454
koi_duration            0
koi_duration_err1     454
koi_duration_err2     454
koi_depth             363
koi_depth_err1        454
koi_depth_err2        454
koi_prad              363
koi_prad_err1         363
koi_prad_err2         363
koi_teq               363
koi_teq_err1         9564
koi_teq_err2         9564
koi_insol             321
koi_insol_err1        321
koi_insol_err2        321
koi_model_snr         363
koi_tce_plnt_num      346
koi_tce_delivname     346
koi_steff             363
koi_steff_er

In [44]:
# Drop the equilibrium-temperature error columns: the null counts show 9564 missing
# values for each of them, so they carry no usable information.
# errors="ignore" makes this cell safe to re-run once the columns are already gone,
# so the drop no longer has to be commented out after its first execution.
df = df.drop(columns=["koi_teq_err1", "koi_teq_err2"], errors="ignore")
list(df.columns)

['kepid',
 'kepoi_name',
 'kepler_name',
 'koi_disposition',
 'koi_pdisposition',
 'koi_score',
 'koi_fpflag_nt',
 'koi_fpflag_ss',
 'koi_fpflag_co',
 'koi_fpflag_ec',
 'koi_period',
 'koi_period_err1',
 'koi_period_err2',
 'koi_time0bk',
 'koi_time0bk_err1',
 'koi_time0bk_err2',
 'koi_impact',
 'koi_impact_err1',
 'koi_impact_err2',
 'koi_duration',
 'koi_duration_err1',
 'koi_duration_err2',
 'koi_depth',
 'koi_depth_err1',
 'koi_depth_err2',
 'koi_prad',
 'koi_prad_err1',
 'koi_prad_err2',
 'koi_teq',
 'koi_insol',
 'koi_insol_err1',
 'koi_insol_err2',
 'koi_model_snr',
 'koi_tce_plnt_num',
 'koi_tce_delivname',
 'koi_steff',
 'koi_steff_err1',
 'koi_steff_err2',
 'koi_slogg',
 'koi_slogg_err1',
 'koi_slogg_err2',
 'koi_srad',
 'koi_srad_err1',
 'koi_srad_err2',
 'ra',
 'dec',
 'koi_kepmag']

In [34]:
!pip install datasets
from tqdm.notebook import tqdm
from typing import Optional, List, Tuple
import datasets
from datasets import Dataset
import matplotlib.pyplot as plt



In [46]:
#create the raw knowledge base
from langchain.docstore.document import Document as LangchainDocument

# first format the data properly
MISSING_VALUE_TEXT = "N/A"  # shown instead of NaN/None so the documents never contain a literal "nan"


def _field(row, column):
    """Return ``row[column]`` as text, or MISSING_VALUE_TEXT when the value is missing."""
    value = row[column]
    return MISSING_VALUE_TEXT if pd.isna(value) else f"{value}"


def _with_errors(row, column):
    """Render ``column`` as 'value ± err1/err2' using its ``_err1``/``_err2`` companion columns."""
    return (
        f"{_field(row, column)} ± "
        f"{_field(row, column + '_err1')}/{_field(row, column + '_err2')}"
    )


def format_exoplanet_data(row):
    """Format one KOI record as a multi-line, human-readable text block.

    Args:
        row: Mapping-like object indexable by column name (for example a row
            Series produced by ``df.iterrows()`` or a plain dict) that holds
            the KOI columns referenced below.

    Returns:
        A string with one "Label: value" entry per line, each line terminated
        by a newline. Missing values (NaN/None) are rendered as
        MISSING_VALUE_TEXT instead of "nan".
    """
    lines = [
        # kepid identifies the host star (several KOIs can share it), not the planet.
        f"Kepler Star ID: {_field(row, 'kepid')}",
        f"KOI Name: {_field(row, 'kepoi_name')}",
        f"Kepler Name: {_field(row, 'kepler_name')}",
        f"Disposition: {_field(row, 'koi_disposition')} (Predicted: {_field(row, 'koi_pdisposition')})",
        f"Score: {_field(row, 'koi_score')}",
        f"Not Transit-Like Flag: {_field(row, 'koi_fpflag_nt')}",
        f"Stellar Eclipse Flag: {_field(row, 'koi_fpflag_ss')}",
        f"Centroid Offset Flag: {_field(row, 'koi_fpflag_co')}",
        f"Ephemeris Match Flag: {_field(row, 'koi_fpflag_ec')}",
        f"Orbital Period (days): {_with_errors(row, 'koi_period')}",
        f"Time of Transit Epoch: {_with_errors(row, 'koi_time0bk')}",
        f"Impact Parameter: {_with_errors(row, 'koi_impact')}",
        f"Transit Duration (hours): {_with_errors(row, 'koi_duration')}",
        f"Transit Depth (ppm): {_with_errors(row, 'koi_depth')}",
        f"Planet Radius (Earth radii): {_with_errors(row, 'koi_prad')}",
        f"Equilibrium Temperature (K): {_field(row, 'koi_teq')}",
        f"Insolation Flux (Earth flux): {_with_errors(row, 'koi_insol')}",
        f"Signal-to-Noise Ratio: {_field(row, 'koi_model_snr')}",
        f"Stellar Effective Temperature (K): {_with_errors(row, 'koi_steff')}",
        f"Stellar Surface Gravity: {_with_errors(row, 'koi_slogg')}",
        f"Stellar Radius (solar radii): {_with_errors(row, 'koi_srad')}",
        f"RA: {_field(row, 'ra')}, Dec: {_field(row, 'dec')}",
        f"Kepler Magnitude: {_field(row, 'koi_kepmag')}",
    ]
    # Every entry, including the last one, ends with a newline.
    return "\n".join(lines) + "\n"

def _to_document(koi_row):
    """Wrap one KOI row as a LangChain document whose source tag is the KOI name."""
    return LangchainDocument(
        page_content=format_exoplanet_data(koi_row),
        metadata={"source": koi_row["kepoi_name"]},
    )


# One document per DataFrame row, in row order.
RAW_KNOWLEDGE_BASE = [_to_document(koi_row) for _index, koi_row in df.iterrows()]


In [None]:
# Authenticate with Hugging Face, run a chat prompt through the Llama 3 8B Instruct
# pipeline, then load the Llama 3.2 1B tokenizer and model directly.
import os
from getpass import getpass

from huggingface_hub import HfFolder, whoami
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline

# Never hardcode the access token in the notebook: take it from the HF_TOKEN
# environment variable, or prompt for it without echoing it into the output.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=token)

# NOTE(review): the pipeline uses Llama 3 8B Instruct while the tokenizer/model loaded
# afterwards are Llama 3.2 1B -- confirm which of the two the RAG is meant to use.
PIPELINE_MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
BASE_MODEL_ID = "meta-llama/Llama-3.2-1B"

#pipeline
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model=PIPELINE_MODEL_ID)
# NOTE(review): this call is not the last expression of the cell, so its result is not displayed.
pipe(messages)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]