In [None]:
! pip install datasets transformers accelerate peft bitsandbytes sentencepiece

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12

In [None]:
import os
import pandas as pd
import requests
import tarfile
import io

# Function to download and extract the CMU Movie Summary Corpus
def download_cmu_movie_corpus():
    """
    Download the CMU Movie Summary Corpus and build the merged movie table.

    The archive is fetched over HTTP and unpacked into ``movie_data/``. Plot
    summaries and movie metadata are then joined on ``wiki_movie_id`` (pandas'
    default inner join, so only movies present in both files are kept) and the
    result is written to ``movie_data/processed_movies.csv``.

    Returns:
        pd.DataFrame: The merged metadata + plot summary table.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection problems or a timeout.
    """
    print("Downloading CMU Movie Summary Corpus...")
    url = "http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz"

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of handing an error page to tarfile.
    response.raise_for_status()

    # The context manager closes the archive even if extraction fails.
    with tarfile.open(fileobj=io.BytesIO(response.content), mode="r:gz") as archive:
        if hasattr(tarfile, "data_filter"):
            # Safe extraction filter: refuses members that would escape
            # the target directory (available on patched/newer Pythons).
            archive.extractall(path="movie_data", filter="data")
        else:
            archive.extractall(path="movie_data")
    print("Download and extraction complete.")

    # Load plot summaries into a DataFrame
    summaries_path = "movie_data/MovieSummaries/plot_summaries.txt"
    metadata_path = "movie_data/MovieSummaries/movie.metadata.tsv"

    # Both files are tab-separated and ship without a header row.
    summaries_df = pd.read_csv(summaries_path, sep="\t", header=None,
                               names=["wiki_movie_id", "plot_summary"])

    metadata_df = pd.read_csv(metadata_path, sep="\t", header=None,
                              names=["wiki_movie_id", "freebase_id", "name",
                                     "release_date", "revenue", "runtime",
                                     "languages", "countries", "genres"])

    # Merge the DataFrames
    movies_df = pd.merge(metadata_df, summaries_df, on="wiki_movie_id")

    # Save the processed data so later runs can skip the download
    movies_df.to_csv("movie_data/processed_movies.csv", index=False)
    print(f"Processed data saved with {len(movies_df)} movie records.")

    return movies_df

# Reuse the processed CSV from an earlier run when it exists; otherwise
# download the corpus and build it.
if os.path.exists("movie_data/processed_movies.csv"):
    movies_df = pd.read_csv("movie_data/processed_movies.csv")
    print(f"Loaded existing dataset with {len(movies_df)} movie records.")
else:
    movies_df = download_cmu_movie_corpus()


Downloading CMU Movie Summary Corpus...
Download and extraction complete.
Processed data saved with 42204 movie records.


In [None]:
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
import numpy as np

class MoviePlotQA:
    """Extractive question answering over the plot summary of one selected movie."""

    def __init__(self, model_name="deepset/roberta-base-squad2"):
        """
        Initialize the Question Answering system.

        Loads the tokenizer and model, wraps them in a question-answering
        pipeline, and reads the movie table from
        ``movie_data/processed_movies.csv``.

        Args:
            model_name (str): The pre-trained model to use for QA
        """
        print(f"Loading model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)
        self.qa_pipeline = pipeline("question-answering", model=self.model, tokenizer=self.tokenizer)

        self.movies_df = pd.read_csv("movie_data/processed_movies.csv")
        # Plot summary and title of the currently selected movie (None until one is chosen)
        self.current_movie_context = None
        self.movie_title = None

    def search_movie_by_title(self, title):
        """
        Search for a movie by title and set it as the current context.

        The search is a case-insensitive, literal substring match on the
        ``name`` column. A title that equals the query is preferred over a
        mere substring hit; otherwise the first match is used.

        Args:
            title (str): Full or partial movie title

        Returns:
            tuple: ``(found, message)`` where ``found`` is a bool and
            ``message`` is a human-readable status string
        """
        query = "" if title is None else str(title).strip()
        if not query:
            # An empty pattern would match every row and silently pick the first movie.
            return False, "No movie found with that title."

        # regex=False: titles are plain text, so characters such as "(" or "?"
        # must not be interpreted as (possibly invalid) regular expressions.
        is_match = self.movies_df['name'].str.contains(query, case=False, na=False, regex=False)
        matches = self.movies_df[is_match]

        if len(matches) == 0:
            return False, "No movie found with that title."

        # Prefer an exact title so that e.g. "alien" does not resolve to "Aliens"
        exact = matches[matches['name'].str.strip().str.lower() == query.lower()]
        movie = exact.iloc[0] if len(exact) > 0 else matches.iloc[0]

        self.current_movie_context = movie['plot_summary']
        self.movie_title = movie['name']

        return True, f"Context set to movie: {self.movie_title}"

    def answer_question(self, question):
        """
        Answer a question based on the current movie context

        Args:
            question (str): The question to answer

        Returns:
            dict: The answer with confidence score
        """
        if self.current_movie_context is None:
            return {"answer": "Please select a movie first using 'search_movie_by_title(title)'.",
                    "score": 0.0}

        # Pass the whole summary: the question-answering pipeline itself splits
        # contexts that exceed the model window into overlapping chunks, so
        # truncating here would only discard the later parts of the plot.
        return self.qa_pipeline(question=question, context=self.current_movie_context)

    def get_current_movie(self):
        """Return the current movie title, or a placeholder when none is selected"""
        return self.movie_title if self.movie_title else "No movie selected"


In [None]:
class ConversationalMovieQA(MoviePlotQA):
    """MoviePlotQA that records each turn and uses the previous answer to expand short follow-ups."""

    def __init__(self, model_name="deepset/roberta-base-squad2"):
        """
        Initialize the QA system with an empty conversation history.

        Args:
            model_name (str): The pre-trained model to use for QA
        """
        super().__init__(model_name)
        # One dict per turn: "role", "content" and, once answered, "answer" and "score"
        self.conversation_history = []

    def answer_question(self, question):
        """
        Answer a question with conversation context awareness.

        A short question (fewer than five words) that follows a "who"/"what"
        question is treated as a follow-up and is extended with the previous
        answer before being passed to the base class.

        Args:
            question (str): The question to answer

        Returns:
            dict: The answer with confidence score
        """
        # Add the current question to conversation history (as typed, before enrichment)
        self.conversation_history.append({"role": "user", "content": question})

        # Check for follow-up questions that might require context
        if len(self.conversation_history) > 1 and len(question.split()) < 5:
            previous_turn = self.conversation_history[-2]
            # Compare case-insensitively so "Who ..." / "What ..." count as well
            previous_question = previous_turn["content"].lstrip().lower()
            # .get() skips turns that never got an answer and empty answers,
            # which would otherwise produce a dangling "... about ".
            previous_answer = previous_turn.get("answer")
            if previous_question.startswith(("who", "what")) and previous_answer:
                question = f"{question} about {previous_answer}"

        # Get the answer using the parent class method
        result = super().answer_question(question)

        # Store the answer in history
        self.conversation_history[-1]["answer"] = result["answer"]
        self.conversation_history[-1]["score"] = result["score"]

        return result

    def clear_conversation(self):
        """Clear the conversation history and return a confirmation message"""
        self.conversation_history = []
        return "Conversation history cleared."


In [None]:
from transformers import Trainer, TrainingArguments
from datasets import Dataset

def prepare_train_data(movies_df, num_samples=1000, seed=None):
    """
    Prepare training data for fine-tuning

    Builds synthetic SQuAD-style examples: for every sampled movie with a
    usable plot, three generic questions are paired with the first 100
    characters of the plot as the answer span.

    Args:
        movies_df: DataFrame containing movie data (needs the 'plot_summary'
            and 'name' columns)
        num_samples: Number of movies to sample; each usable movie yields
            three examples
        seed: Optional random seed for the sampling, for reproducible runs.
            None keeps the sampling non-deterministic.

    Returns:
        Dataset: HuggingFace dataset with 'context', 'question' and 'answers'
        columns
    """
    # This is a simplified example - in practice, you'd need real QA pairs
    # Here we're creating synthetic QA pairs about movie plots
    sampled = movies_df.sample(n=min(num_samples, len(movies_df)), random_state=seed)

    train_data = []

    for plot, title in zip(sampled['plot_summary'], sampled['name']):
        # Missing values come back from CSV as float NaN; they cannot be
        # split or turned into a meaningful question.
        if not isinstance(plot, str) or not isinstance(title, str):
            continue

        # Skip if plot is too short
        if len(plot.split()) < 20:
            continue

        # Create synthetic questions (this is very simplified)
        questions = [
            f"What happens in the movie {title}?",
            f"What is {title} about?",
            f"Can you summarize the plot of {title}?"
        ]

        # For extractive QA, we need the answer to be a span of the context
        # Here we just use the first 100 characters as a simple example
        answer_text = plot[:100]
        answer_start = 0

        for q in questions:
            train_data.append({
                "context": plot,
                "question": q,
                "answers": {
                    "text": [answer_text],
                    "answer_start": [answer_start]
                }
            })

    return Dataset.from_pandas(pd.DataFrame(train_data))

def _tokenize_qa_batch(examples, tokenizer, max_length):
    """Tokenize a batch of question/context rows and label each answer's start/end token."""
    # Requires a fast tokenizer: offset mappings and sequence_ids are used below.
    encoded = tokenizer(
        [q.strip() for q in examples["question"]],
        examples["context"],
        max_length=max_length,
        truncation="only_second",  # never truncate the question, only the context
        padding="max_length",      # fixed-size features work with any data collator
        return_offsets_mapping=True,
    )
    offset_mapping = encoded.pop("offset_mapping")

    start_positions = []
    end_positions = []
    for i, offsets in enumerate(offset_mapping):
        answer = examples["answers"][i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Tokens with sequence id 1 belong to the context (0 = question, None = special/pad)
        context_tokens = [idx for idx, seq_id in enumerate(encoded.sequence_ids(i)) if seq_id == 1]

        # Answer not fully inside the (possibly truncated) context: label it (0, 0)
        if (not context_tokens
                or offsets[context_tokens[0]][0] > start_char
                or offsets[context_tokens[-1]][1] < end_char):
            start_positions.append(0)
            end_positions.append(0)
            continue

        context_start, context_end = context_tokens[0], context_tokens[-1]

        # Last token that starts at or before the answer's first character
        idx = context_start
        while idx <= context_end and offsets[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # First token (from the right) that ends at or after the answer's last character
        idx = context_end
        while idx >= context_start and offsets[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    encoded["start_positions"] = start_positions
    encoded["end_positions"] = end_positions
    return encoded


def fine_tune_model(model_name="deepset/roberta-base-squad2", output_dir="fine-tuned-movie-qa",
                    max_length=384):
    """
    Fine-tune a pre-trained QA model on movie plot data

    The raw question/context/answers rows are tokenized into input ids with
    start/end position labels before training. Handing the raw columns to
    ``Trainer`` fails with "No columns in the dataset match the model's
    forward method signature".

    Args:
        model_name: Base model to fine-tune
        output_dir: Directory to save the fine-tuned model
        max_length: Maximum number of tokens per (question, context) feature

    Raises:
        ValueError: If no training examples could be generated
    """
    import inspect

    # Load data
    movies_df = pd.read_csv("movie_data/processed_movies.csv")

    # Prepare training data
    train_dataset = prepare_train_data(movies_df)
    if len(train_dataset) == 0:
        raise ValueError("No training examples could be generated from the movie data.")

    # Load model and tokenizer
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Replace the text columns with model inputs and answer-span labels
    tokenized_dataset = train_dataset.map(
        _tokenize_qa_batch,
        batched=True,
        remove_columns=train_dataset.column_names,
        fn_kwargs={"tokenizer": tokenizer, "max_length": max_length},
    )

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=1,
        per_device_train_batch_size=8,
        save_strategy="epoch",
        save_total_limit=2,
        learning_rate=3e-5,
        weight_decay=0.01,
    )

    # Newer transformers releases renamed Trainer's `tokenizer` argument to
    # `processing_class`; pick whichever the installed version accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_arg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    # Define trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        **{tokenizer_arg: tokenizer},
    )

    # Train model
    trainer.train()

    # Save fine-tuned model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"Model fine-tuned and saved to {output_dir}")


In [None]:
# prepare_train_data(movies_df, num_samples=1000)
# fine_tune_model()

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

  trainer = Trainer(


ValueError: No columns in the dataset match the model's forward method signature. The following columns have been ignored: [question, context, answers]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.

In [None]:
# Usage example
def main(qa_system=None):
    """
    Run the interactive movie question-answering loop.

    Commands read from standard input:
        exit             -- leave the loop
        movie: [title]   -- select the movie whose plot is used as context
        anything else    -- treated as a question about the selected movie

    Args:
        qa_system: Optional, already constructed QA object exposing
            ``search_movie_by_title``, ``answer_question`` and
            ``get_current_movie``. When omitted, a ``ConversationalMovieQA``
            is created (which loads the model).
    """
    # Initialize the QA system
    if qa_system is None:
        qa_system = ConversationalMovieQA()

    print("Movie Plot Question Answering System")
    print("===================================")
    print("Type 'exit' to quit, 'movie: [title]' to set a movie context")

    while True:
        try:
            user_input = input("\nQuestion: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupted prompt ends the session cleanly
            break

        if not user_input:
            # Nothing typed: ask again instead of sending an empty question to the model
            continue

        command = user_input.lower()
        if command == 'exit':
            break

        if command.startswith('movie:'):
            # Set movie context
            movie_title = user_input[len('movie:'):].strip()
            _, message = qa_system.search_movie_by_title(movie_title)
            print(message)
            continue

        # Answer question; report failures and keep the session alive
        try:
            result = qa_system.answer_question(user_input)
            print(f"\nAnswer: {result['answer']}")
            print(f"Confidence: {result['score']:.4f}")
            print(f"Current movie: {qa_system.get_current_movie()}")
        except Exception as e:
            print(f"Error: {str(e)}")

    print("Thank you for using the Movie Plot QA system!")

# Start the interactive session only when this code is executed as the
# top-level program, not when it is imported as a module.
if __name__ == "__main__":
    main()


Loading model: deepset/roberta-base-squad2


Device set to use cuda:0


Movie Plot Question Answering System
Type 'exit' to quit, 'movie: [title]' to set a movie context

Question: did andy die

Answer: Please select a movie first using 'set_movie(title)'.
Confidence: 0.0000
Current movie: No movie selected

Question: movie: The Shawshank Redemption
Context set to movie: The Shawshank Redemption

Question: did andy die

Answer: Andy Dufresne  is convicted of murdering his wife and her lover
Confidence: 0.1262
Current movie: The Shawshank Redemption

Question: did andy escape

Answer: Brooks
Confidence: 0.0157
Current movie: The Shawshank Redemption

Question: why did andy escape

Answer: unable to adjust to the outside world after 50 years in prison
Confidence: 0.0365
Current movie: The Shawshank Redemption

Question: who was andy

Answer: banker Andy Dufresne
Confidence: 0.1630
Current movie: The Shawshank Redemption

Question: did andy die

Answer: his wife and her lover
Confidence: 0.1605
Current movie: The Shawshank Redemption

Question: who died 

Ans