In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

def load_llama_model_on_gpu(model_name):
    """Load a causal language model and its tokenizer, preferring the GPU.

    The tokenizer and the model are both fetched with ``from_pretrained``.
    The model is moved to CUDA when ``torch.cuda.is_available()`` is true and
    to the CPU otherwise.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer, device)`` tuple, or ``(None, None, None)`` when
        downloading or loading fails for any reason.
    """
    try:
        print(f"Downloading and loading the model: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Fall back to the CPU when no CUDA device is available.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {device}")

        model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        # Report the device that was actually used; the message used to claim
        # "GPU" even after falling back to the CPU.
        print(f"Model downloaded and loaded successfully on {device}.")
        return model, tokenizer, device
    except Exception as e:
        # Best-effort loader: report the problem and let the caller test for None.
        print(f"Failed to download or load the model: {e}")
        return None, None, None

def _first_sentence(text):
    """Return the first sentence of a raw model completion, on a single line.

    Anything from a follow-up turn the model invented (a later line starting
    with "Person:" or "You:") is dropped, whitespace is collapsed, and the
    text is cut right after the first '.', '!' or '?' that ends a word. A
    period is appended only when the text has no sentence-ending punctuation.
    Returns "" when nothing is left.
    """
    for speaker_tag in ("\nPerson:", "\nYou:"):
        text = text.split(speaker_tag, 1)[0]
    text = " ".join(text.split())
    for index, char in enumerate(text):
        # The slice is "" at the end of the text, so no bounds check is needed.
        if char in ".!?" and text[index + 1:index + 2] in ("", " "):
            return text[:index + 1]
    return f"{text}." if text else ""


def generate_response(user_input, model, tokenizer, device, max_new_tokens=30):
    """Generate a concise, one-sentence reply to *user_input*.

    Args:
        user_input: The latest message typed by the user.
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching *model*.
        device: Device the tokenized prompt is moved to before generation.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first sentence of the model's reply, or a fixed apology string
        when generation fails or yields no text.
    """
    fallback = "Sorry, I couldn't understand that."
    try:
        # Persona instructions placed in front of every prompt.
        personality_context = (
            "You are a friendly and curious stranger who enjoys having casual conversations about life, "
            "hobbies, dreams, and random trivia. You respond in one concise sentence."
        )

        prompt = f"{personality_context}\nPerson: {user_input}\nYou:"

        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Sampling only; no gradients are needed.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                top_k=50,
                temperature=0.9,
                pad_token_id=tokenizer.eos_token_id,
            )

        # The output sequence starts with the prompt tokens. Decode only what
        # follows them, so a "You:" typed by the user or invented by the model
        # in a later turn cannot be mistaken for the start of the reply.
        prompt_length = inputs["input_ids"].shape[-1]
        completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        return _first_sentence(completion) or fallback
    except Exception as e:
        print(f"Error generating response: {e}")
        return fallback

def chat_with_stranger(model, tokenizer, device):
    """Run an interactive console chat with the 'stranger' persona.

    Prints the scene description and an opening line, then reads user
    messages until the user types "exit", "quit" or "bye" (case-insensitive,
    surrounding whitespace ignored) or input ends.

    Args:
        model: Language model forwarded to ``generate_response``.
        tokenizer: Tokenizer forwarded to ``generate_response``.
        device: Device forwarded to ``generate_response``.
    """
    # Shown once so the user knows the setting of the conversation.
    personality_context = (
        "You are now chatting with a friendly and curious stranger at a quiet café. The stranger enjoys "
        "having casual conversations about life, hobbies, dreams, and random trivia. They respond in one concise sentence."
    )

    print("Context:", personality_context)
    print("\nStranger: Hey there! What brings you here today?")

    farewell = "Stranger: It was nice chatting with you. Take care!"
    exit_words = {"exit", "quit", "bye"}

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt ends the chat instead of a traceback.
            print()
            print(farewell)
            break

        if not user_input:
            # Nothing typed: ask again rather than sending an empty prompt.
            continue

        if user_input.lower() in exit_words:
            print(farewell)
            break

        response = generate_response(user_input, model, tokenizer, device)
        print(f"Stranger: {response}")

if __name__ == "__main__":
    # Identifier of the model to download.
    model_name = "meta-llama/Llama-3.2-1B"

    # The loader hands back (None, None, None) when loading fails.
    model, tokenizer, device = load_llama_model_on_gpu(model_name)

    # Check against the None sentinel explicitly instead of relying on the
    # truthiness of the model and tokenizer objects.
    if model is not None and tokenizer is not None:
        print("Ready to chat with the stranger!")
        chat_with_stranger(model, tokenizer, device)


Downloading and loading the model: meta-llama/Llama-3.2-1B
Using device: cuda


KeyboardInterrupt: 

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

def load_llama_model_on_gpu(model_name):
    """Fetch the tokenizer and model for *model_name* and place the model on a device.

    CUDA is used when ``torch.cuda.is_available()`` is true, otherwise the CPU.

    Returns:
        ``(model, tokenizer, device)`` on success, ``(None, None, None)`` if
        any step raises.
    """
    try:
        print(f"Downloading and loading the model: {model_name}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Pick the target device and tell the user which one was chosen.
        target = "cuda" if torch.cuda.is_available() else "cpu"
        device = torch.device(target)
        using_gpu = device.type == 'cuda'
        print("GPU is available. Using GPU." if using_gpu else "No GPU detected. Using CPU.")

        # Load the weights first, then move them to the chosen device.
        model = AutoModelForCausalLM.from_pretrained(model_name)
        model = model.to(device)
        print("Model downloaded and loaded successfully.")
    except Exception as e:
        print(f"Failed to download or load the model: {e!s}")
        return None, None, None
    return model, tokenizer, device

def _first_sentence(text):
    """Return the first sentence of a raw model completion, on a single line.

    Anything from a follow-up turn the model invented (a later line starting
    with "Person:" or "You:") is dropped, whitespace is collapsed, and the
    text is cut right after the first '.', '!' or '?' that ends a word. A
    period is appended only when the text has no sentence-ending punctuation.
    Returns "" when nothing is left.
    """
    for speaker_tag in ("\nPerson:", "\nYou:"):
        text = text.split(speaker_tag, 1)[0]
    text = " ".join(text.split())
    for index, char in enumerate(text):
        # The slice is "" at the end of the text, so no bounds check is needed.
        if char in ".!?" and text[index + 1:index + 2] in ("", " "):
            return text[:index + 1]
    return f"{text}." if text else ""


def generate_response(user_input, model, tokenizer, device, personality_context, max_new_tokens=30):
    """Generate a concise, one-sentence reply to *user_input*.

    Args:
        user_input: The latest message typed by the user.
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching *model*.
        device: Device the tokenized prompt is moved to before generation.
        personality_context: Persona instructions placed in front of the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first sentence of the model's reply, or a fixed apology string
        when generation fails or yields no text.
    """
    fallback = "Sorry, I couldn't understand that."
    try:
        prompt = f"{personality_context}\nPerson: {user_input}\nYou:"

        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Sampling only; no gradients are needed.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                top_k=50,
                temperature=0.9,
                pad_token_id=tokenizer.eos_token_id,
            )

        # The output sequence starts with the prompt tokens. Decode only what
        # follows them, so a "You:" typed by the user or invented by the model
        # in a later turn cannot be mistaken for the start of the reply.
        prompt_length = inputs["input_ids"].shape[-1]
        completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        return _first_sentence(completion) or fallback
    except Exception as e:
        print(f"Error generating response: {e}")
        return fallback

def chat_with_stranger(model, tokenizer, device):
    """Run an interactive console chat with the 'stranger' persona.

    Prints an opening line, then reads user messages until the user types
    "exit", "quit" or "bye" (case-insensitive, surrounding whitespace
    ignored) or input ends.

    Args:
        model: Language model forwarded to ``generate_response``.
        tokenizer: Tokenizer forwarded to ``generate_response``.
        device: Device forwarded to ``generate_response``.
    """
    # Persona instructions handed to generate_response on every turn.
    personality_context = (
        "You are a friendly and curious stranger who enjoys having casual conversations about life, "
        "hobbies, dreams, and random trivia. You respond in one concise sentence."
    )

    print("Stranger: Hey there! What brings you here today?")

    farewell = "Stranger: It was nice chatting with you. Take care!"
    exit_words = {"exit", "quit", "bye"}

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt ends the chat instead of a traceback.
            print()
            print(farewell)
            break

        if not user_input:
            # Nothing typed: ask again rather than sending an empty prompt.
            continue

        if user_input.lower() in exit_words:
            print(farewell)
            break

        response = generate_response(user_input, model, tokenizer, device, personality_context)
        print(f"Stranger: {response}")

if __name__ == "__main__":
    # Identifier of the model to download; swap in another one to experiment.
    model_name = "meta-llama/Llama-3.2-1B"

    # The loader hands back (None, None, None) when loading fails.
    model, tokenizer, device = load_llama_model_on_gpu(model_name)

    # Check against the None sentinel explicitly instead of relying on the
    # truthiness of the model and tokenizer objects.
    if model is not None and tokenizer is not None:
        print("Ready to chat with the stranger!")
        chat_with_stranger(model, tokenizer, device)


Downloading and loading the model: meta-llama/Llama-3.2-1B
GPU is available. Using GPU.
