In [1]:
pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.3.17-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.34 (from langchain_community)
  Downloading langchain_core-0.3.34-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.18 (from langchain_community)
  Downloading langchain-0.3.18-py3-none-any.whl.metadata (7.8 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain_community)
  Downloading SQLAlchemy-2.0.38-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting requests<3,>=2 (from langchain_community)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting PyYAML>=5.3 (from langchain_community)
  Downloading PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain_community)
  Downloading aiohttp-3.11.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting tenacity!

In [None]:
# runnable
import torch
from langchain_community.llms import HuggingFacePipeline
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Set model path
model_path = "/kaggle/input/deepseek-r1/transformers/deepseek-r1-distill-qwen-14b/1"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Create a Hugging Face pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=2084,
    temperature=0.7,
    device_map="auto"
)

# Wrap the pipeline in LangChain's HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=pipe)

# Define the system prompt for math and AI/ML/DL
system_prompt = """You are an expert in mathematics and AI/ML/DL. You assist with:
1. Mathematical concepts, proofs, and problem-solving.
2. AI/ML/DL theory, algorithms, and applications.
3. Code implementation for AI/ML/DL projects.
4. Debugging and optimizing AI/ML/DL workflows.

Provide clear, concise, and accurate responses. If the question is unclear, ask for clarification."""

# Set up the chat prompt template
prompt_template = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(system_prompt),
    HumanMessagePromptTemplate.from_template("{input}")
])

# Use latest LangChain API (RunnableSequence)
chat_chain = prompt_template | llm

# Chat loop
def start_chat():
    print("Math & AI/ML/DL Assistant - Type 'exit' to quit")
    while True:
        user_input = input("You: ")
        if user_input.lower() == "exit":
            print("Goodbye!")
            break
        response = chat_chain.invoke({"input": user_input})
        print(f"AI: {response}")

# Start chat session
start_chat()


In [2]:
# pip install --target=/kaggle/working langchain_community
import os
import torch
from langchain_community.llms import VLLM
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from vllm import SamplingParams


# Set environment variables (optional but recommended)
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # Adjust based on available GPUs
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid tokenizer parallelism conflicts


model_path = "/kaggle/input/deepseek-r1/transformers/deepseek-aideepseek-r1-distill-qwen-14b-awq-neody/1"


# Sampling Parameters for vLLM
sampling_params = SamplingParams(
    temperature=1.0,              # Control randomness
    min_p=0.01,                   # Minimum cumulative probability for nucleus sampling
    skip_special_tokens=True,      # Remove special tokens from output
    max_tokens=8192,               # Maximum output length
)

# ✅ Use LangChain’s VLLM wrapper
llm = VLLM(
    model=model_path,
    dtype="half",                 # Use FP16 for efficiency
    tensor_parallel_size=2,        # Use multiple GPUs (adjust as needed)
    trust_remote_code=True,        # Trust remote execution for model and tokenizer
    max_num_seqs=16,               # Maximum batch size per iteration
    max_model_len=8192,            # Context length
    gpu_memory_utilization=0.95,   # Use 95% of GPU memory
    sampling_params=sampling_params # Pass the sampling params
)

# Define system prompt for AI/ML/DL assistance
system_prompt = """You are an expert in mathematics and AI/ML/DL. You assist with:
1. Mathematical concepts, proofs, and problem-solving.
2. AI/ML/DL theory, algorithms, and applications.
3. Code implementation for AI/ML/DL projects.
4. Debugging and optimizing AI/ML/DL workflows.

Provide clear, concise, and accurate responses. If the question is unclear, ask for clarification."""

# Define chat prompt template
prompt_template = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(system_prompt),
    HumanMessagePromptTemplate.from_template("{input}")
])

# ✅ Create LangChain Chat Pipeline with vLLM
chat_chain = prompt_template | llm

# Interactive Chat Loop
def start_chat():
    print("Math & AI/ML/DL Assistant - Type 'exit' to quit")
    while True:
        user_input = input("You: ")
        if user_input.lower() == "exit":
            print("Goodbye!")
            break
        response = chat_chain.invoke({"input": user_input})
        print(f"AI: {response}")

# Start Chat
start_chat()


ValidationError: 1 validation error for VLLM
  Value error, Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/kaggle/input/deepseek-r1/transformers/deepseek-aideepseek-r1-distill-qwen-14b-awq-neody/1'. Use `repo_type` argument if needed. [type=value_error, input_value={'model': '/kaggle/input/...gs': {}, 'client': None}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/value_error

In [None]:
from vllm import LLM, SamplingParams
import os

# Set up environment
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # Ensure GPUs are visible

# Define model path (local)
model_path = "/kaggle/input/deepseek-r1/transformers/deepseek-aideepseek-r1-distill-qwen-14b-awq-neody/1"

# Initialize vLLM model
llm = LLM(
    model=model_path,  # ✅ Use vLLM directly (NOT LangChain's VLLM)
    dtype="half",
    tensor_parallel_size=2,  # Use multiple GPUs
    trust_remote_code=True,
)

# Define sampling parameters
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    max_tokens=1024
)

# Generate text using vLLM
outputs = llm.generate(["Explain LangChain in simple terms"], sampling_params)
print(outputs[0].outputs[0].text)


In [None]:
import torch
from vllm import LLM, SamplingParams
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_core.language_models.llms import LLM as LangChainLLM
import os
import warnings

warnings.simplefilter('ignore')

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Set model path
model_path = "/kaggle/input/m/huikang/deepseek-r1/transformers/deepseek-aideepseek-r1-distill-qwen-14b-awq-neody/1"

MAX_NUM_SEQS = 16
MAX_MODEL_LEN = 8192
print('LLM')
llm = LLM(
    model_path,
    dtype="half",                # The data type for the model weights and activations
    max_num_seqs=MAX_NUM_SEQS,   # Maximum number of sequences per iteration. Default is 256
    max_model_len=MAX_MODEL_LEN, # Model context length
    trust_remote_code=True,      # Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer
    tensor_parallel_size=2,      # The number of GPUs to use for distributed execution with tensor parallelism
    gpu_memory_utilization=0.95, # The ratio (between 0 and 1) of GPU memory to reserve for the model
    seed=2024,
)
sampling_params = SamplingParams(
    temperature=1.0,              # randomness of the sampling
    min_p=0.01,
    skip_special_tokens=True,     # Whether to skip special tokens in the output
    max_tokens=MAX_MODEL_LEN,
)

# Custom LangChain wrapper for vLLM
class CustomVLLM(LangChainLLM):
    def _call(self, prompt: str, stop=None) -> str:
        """Generate response using vLLM."""
        outputs = llm.generate([prompt], sampling_params)
        return outputs[0].outputs[0].text.strip()  # Extract generated text

    @property
    def _llm_type(self) -> str:
        return "vllm"

# Instantiate LangChain LLM
llm = CustomVLLM()

# Define the system prompt for math and AI/ML/DL
system_prompt = """You are an expert in mathematics and AI/ML/DL. You assist with:
1. Mathematical concepts, proofs, and problem-solving.
2. AI/ML/DL theory, algorithms, and applications.
3. Code implementation for AI/ML/DL projects.
4. Debugging and optimizing AI/ML/DL workflows.

Provide clear, concise, and accurate responses. If the question is unclear, ask for clarification."""

# Set up the chat prompt template
prompt_template = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(system_prompt),
    HumanMessagePromptTemplate.from_template("{input}")
])

# Use latest LangChain API (RunnableSequence)
chat_chain = prompt_template | llm

# Chat loop
def start_chat():
    print("Math & AI/ML/DL Assistant - Type 'exit' to quit")
    while True:
        user_input = input("You: ")
        if user_input.lower() == "exit":
            print("Goodbye!")
            break
        response = chat_chain.invoke({"input": user_input})
        print(f"AI: {response}")

# Start chat session
start_chat()


In [None]:
start_chat()

In [None]:
import torch
from vllm import LLM, SamplingParams
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.llms import VLLM
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3" # "0,1,2,3"

# Set model path
model_path = "/kaggle/input/deepseek-r1/transformers/deepseek-r1-distill-qwen-14b/1"

print("CUDA available:", torch.cuda.is_available())
print("Available GPUs:", torch.cuda.device_count())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")

# Load vLLM model (automatically uses GPU if available)
llm = VLLM(model=model_path, dtype="half", trust_remote_code=True, tensor_parallel_size=2, device="cuda")


# Define chat prompt
chat_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an AI assistant. Answer user questions concisely and clearly."),
        ("human", "{input}"),
    ]
)

# Define chat pipeline using LangChain
chat_chain = chat_prompt | llm

# Chat loop
def start_chat():
    print("Math & AI/ML/DL Assistant - Type 'exit' to quit")
    while True:
        user_input = input("You: ")
        if user_input.lower() == "exit":
            print("Goodbye!")
            break
        response = chat_chain.invoke({"input": user_input})
        print(f"AI: {response}")

# Start chat session
start_chat()
