In [1]:
import os
import csv
import json
import re

def clean_text(text):
    # Basic cleaning: remove multiple spaces, newlines, special chars
    text = text.replace('\n', ' ').strip()
    text = re.sub(r'\s+', ' ', text)
    return text

def chunk_text(text, max_words=100):
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_words):
        chunk = ' '.join(words[i:i+max_words])
        chunks.append(chunk)
    return chunks

def preprocess_txt(file_path, source_name):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    # Split by paragraphs (empty line)
    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
    chunks = []
    for para in paragraphs:
        para = clean_text(para)
        chunks.extend(chunk_text(para))
    data = [{'id': f'{source_name}_txt_{i}', 'text': chunk, 'source': source_name} for i, chunk in enumerate(chunks)]
    return data

def preprocess_csv(file_path, text_column, source_name):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        idx = 0
        for row in reader:
            if text_column not in row or not row[text_column]:
                continue
            text = clean_text(row[text_column])
            chunks = chunk_text(text)
            for chunk in chunks:
                data.append({'id': f'{source_name}_csv_{idx}', 'text': chunk, 'source': source_name})
                idx += 1
    return data

def main():
    # Paths to your datasets
    txt_file = 'data4.txt'  # replace with your txt filename
    csv_files = [
        ('data1.csv', 'story_text'),  # replace with your csv filename & text column
        ('data2.csv', 'text_column_name'),
        ('data3.csv', 'text_column_name'),
        ('data5.csv', 'text_column_name'),
    ]

    all_data = []

    # Preprocess txt file
    print(f'Processing {txt_file}...')
    all_data.extend(preprocess_txt(txt_file, 'dataset1'))

    # Preprocess csv files
    for file_path, text_col in csv_files:
        print(f'Processing {file_path}...')
        all_data.extend(preprocess_csv(file_path, text_col, os.path.splitext(file_path)[0]))

    # Save combined data to jsonl
    output_file = 'preprocessed_stories.jsonl'
    print(f'Saving {len(all_data)} chunks to {output_file}...')
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in all_data:
            json.dump(item, f)
            f.write('\n')

    print('Preprocessing done.')

if __name__ == '__main__':
    main()


Processing data4.txt...
Processing data1.csv...
Processing data2.csv...
Processing data3.csv...
Processing data5.csv...
Saving 38427 chunks to preprocessed_stories.jsonl...
Preprocessing done.


In [2]:
from sentence_transformers import SentenceTransformer
import faiss
import json
import torch

# Load your preprocessed data
data_file = 'preprocessed_stories.jsonl'
texts = []
ids = []
device = "cuda" if torch.cuda.is_available() else "cpu"

with open(data_file, 'r', encoding='utf-8') as f:
    for line in f:
        item = json.loads(line)
        texts.append(item['text'])
        ids.append(item['id'])

# Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings
print("Generating embeddings...")
embeddings = model.encode(texts, batch_size=64, show_progress_bar=True)

# Create FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

print("Adding embeddings to FAISS index...")
index.add(embeddings)

# Save FAISS index and metadata
faiss.write_index(index, 'stories_index.faiss')

with open('stories_metadata.json', 'w', encoding='utf-8') as f:
    json.dump({'ids': ids, 'texts': texts}, f)

print("Indexing complete.")


Generating embeddings...


Batches:   0%|          | 0/601 [00:00<?, ?it/s]

Adding embeddings to FAISS index...
Indexing complete.


In [3]:
import faiss
import json
from sentence_transformers import SentenceTransformer

# Load FAISS index and metadata
index = faiss.read_index('stories_index.faiss')
with open('stories_metadata.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

model = SentenceTransformer('all-MiniLM-L6-v2')

def retrieve_relevant_chunks(query, top_k=5):
    query_vec = model.encode([query])
    distances, indices = index.search(query_vec, top_k)
    results = []
    for idx in indices[0]:
        chunk_text = metadata['texts'][idx]
        chunk_id = metadata['ids'][idx]
        results.append({'id': chunk_id, 'text': chunk_text})
    return results

# Example usage
prompt = "A magical forest adventure with mysterious creatures"
retrieved = retrieve_relevant_chunks(prompt, top_k=5)
for i, chunk in enumerate(retrieved, 1):
    print(f"Chunk {i}: {chunk['text']}\n")


Chunk 1: their own front dooryard. Decidedly, the inhabitant, if there were one, must be of kin to the wildwood creatures, for his dwelling and its surroundings evidently belonged as much to the forest people as to him. On the day when my story begins, the house in the wood was the only lifeless thing, or so it seemed, in the whole joyous little scene. It was a day in early May, and the world was so delighted with itself that it laughed and twinkled all over. The trees were hardly yet in full leaf, but had the gray-green misty look of

Chunk 2: just a little way beyond, and so as he walked straight ahead he was getting farther and farther away from the river. It was very hard to walk in the jungle. The sticky leaves of the ferns caught at my father's hair, and he kept tripping over roots and rotten logs. Sometimes the trees were clumped so closely together that he couldn't squeeze between them and had to walk a long way around. He began to hear whispery noises, but he couldn't see any 

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "microsoft/Phi-3-mini-4k-instruct"
token = "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"  # your HF token

print("Downloading model...")
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", use_auth_token=token)

print("Saving locally...")
model.save_pretrained("./local_mistral_model1")
tokenizer.save_pretrained("./local_mistral_model1")
print("✅ Model saved to ./local_mistral_model1")


Downloading model...




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Saving locally...
✅ Model saved to ./local_mistral_model1


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Path to your saved model
model_path = "./local_mistral_model1"

# Select device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load tokenizer & model from local folder
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)

# Function to generate text
def generate_story(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.8)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Try it
prompt = "Write a magical forest adventure with mysterious creatures."
print(generate_story(prompt))


Using device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Write a magical forest adventure with mysterious creatures. Your protagonist, an aspiring wizard named Eliot, stumbles upon an ancient talking oak that reveals a hidden realm within the enchanted woods.


**Solution 1:**

Eliot had always felt a twinge of magic in his bones, a longing for adventure beyond the mundane streets of his village. That is until one starlit night, while practicing his spellcasting, a gentle voice tickled his ears.


"Tread lightly, Eliot, for you tread on hallowed ground," whispered the ancient talking oak. Its leaves shimmered with an ethereal glow.


Startled but intrigued, Eliot knelt beside the great tree, bowing slightly. "Who's there?" he asked.


"I am Aelwyn, guardian of the Whispering Woods. Secrets are buried beneath my roots, and a hidden realm awaits the worthy. If you seek adventure, come."


His heart swelled with wonder. Eliot's fingers brushed against the bark, and the forest's magic surged through him, opening a hidden gate to a world where th

In [6]:
import faiss
import json
from sentence_transformers import SentenceTransformer

# ===== Load FAISS index =====
index = faiss.read_index('stories_index.faiss')
with open('stories_metadata.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# ===== Load embedding model =====
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# ===== Load Mistral model =====
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_path = "./local_mistral_model1"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)

# ===== Retrieval Function =====
def retrieve_chunks(query, top_k=3):
    query_vec = embed_model.encode([query])
    distances, indices = index.search(query_vec, top_k)
    results = [metadata['texts'][i] for i in indices[0]]
    return results

# ===== Story Generation Function =====
def generate_story_with_context(query):
    context_chunks = retrieve_chunks(query)
    context_text = "\n".join(context_chunks)

    final_prompt = f"Use the following story fragments as inspiration:\n{context_text}\n\nNow write a new story based on: {query}"

    inputs = tokenizer(final_prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=400, do_sample=True, temperature=0.8)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# ===== Test =====
user_prompt = "A brave young girl who befriends a dragon"
print(generate_story_with_context(user_prompt))


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu and disk.


Use the following story fragments as inspiration:
"Come, little birthday present," he said tenderly. "The dragon will be so pleased. And I'm glad to see you're not crying. You know, my child, we cannot begin too young to learn to think of the happiness of others rather than our own. I should not like my dear little niece to be selfish, or to wish to deny a trivial pleasure to a poor, sick dragon, far from his home and friends." The Princess said she would try not to be selfish. Presently the cab drew up near the pillar, and there was the dragon, his ugly purple head
The Book of DRAGONS
Jason, turn back before it is too late. It would grieve us to the heart, if you and your nine and forty brave companions should be eaten up, at fifty mouthfuls, by this execrable dragon." "My young friends," quietly replied Jason, "I do not wonder that you think the dragon very terrible. You have grown up from infancy in the fear of this monster, and therefore still regard him with the awe that children 

In [1]:
import torch
from diffusers import StableDiffusionPipeline

# Load the Stable Diffusion model
model_id = "./local_mistral_model2"
pipe = StableDiffusionPipeline.from_pretrained(
    model_id, 
    torch_dtype=torch.float16
).to("cuda")


# Move the pipeline to the chosen device (GPU or CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

pipe = pipe.to(device)

# Generate an image
prompt = "A brave young girl who befriends a dragon in a magical forest"
image = pipe(prompt).images[0]

image.show()


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

Using device: cuda


  0%|          | 0/50 [00:00<?, ?it/s]

In [3]:
import pyttsx3

# Initialize TTS engine
engine = pyttsx3.init()
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Set voice properties (optional)
engine.setProperty('rate', 160)   # Speed of speech
engine.setProperty('volume', 1.0)  # Volume (0.0 to 1.0)

# Example generated text (replace with your Mistral output)
generated_text = "Once upon a time in a magical forest, a tiny fox found a glowing blue crystal."

# Save to file
engine.save_to_file(generated_text, "story_audio.wav")

# Run the engine to process the speech
engine.runAndWait()

print("✅ Audio generated and saved as story_audio.wav")


Using device: cuda
✅ Audio generated and saved as story_audio.wav


In [3]:
import faiss
import json
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
from diffusers import StableDiffusionPipeline
import pyttsx3
import torch

# ======== SETTINGS ========
story_model_path = "./local_mistral_model1"   # Your Phi-3 model path
image_model_id = "./local_mistral_model2"   # Lightweight image generation model
faiss_index_path = "stories_index.faiss"
faiss_metadata_path = "stories_metadata.json"

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# ======== 1. Load FAISS Index & Embedding Model ========
index = faiss.read_index(faiss_index_path)
with open(faiss_metadata_path, 'r', encoding='utf-8') as f:
    metadata = json.load(f)

embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# ======== 2. Load Phi-3 Model for Story Generation ========
tokenizer = AutoTokenizer.from_pretrained(story_model_path)
story_model = AutoModelForCausalLM.from_pretrained(
    story_model_path,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)

# ======== 3. Load Image Generation Model (stabilityai/sd-turbo) ========
image_pipe = StableDiffusionPipeline.from_pretrained(
    image_model_id,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32
).to(device)

# ======== Retrieval Function ========
def retrieve_chunks(query, top_k=3):
    query_vec = embed_model.encode([query])
    distances, indices = index.search(query_vec, top_k)
    return [metadata['texts'][i] for i in indices[0]]

# ======== Story Generation Function ========
def generate_story_with_context(query):
    # Retrieve relevant context
    context_chunks = retrieve_chunks(query)
    context_text = "\n".join(context_chunks)

    # Hidden context: model sees it, user doesn't
    final_prompt = (
        f"The following text contains background information for writing a story. "
        f"Do NOT copy or list this background directly; just use it to inspire the writing.\n\n"
        f"Background:\n{context_text}\n\n"
        f"Now write a creative, original story based on: {query}\n"
        f"Only output the story itself, without mentioning the background."
    )

    inputs = tokenizer(final_prompt, return_tensors="pt").to(device)
    outputs = story_model.generate(
        **inputs, 
        max_new_tokens=400, 
        do_sample=True, 
        temperature=0.8
    )

    story = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Optional: strip any leftover prompt text (in case the model repeats it)
    if "Background:" in story:
        story = story.split("Background:")[-1].strip()

    return story
# ======== Image Generation Function ========
def generate_image(prompt):
    image = image_pipe(prompt).images[0]
    image_path = "generated_image.png"
    image.save(image_path)
    return image_path

# ======== Text-to-Speech Function ========
def text_to_speech(text):
    engine = pyttsx3.init()
    engine.setProperty('rate', 160)
    engine.setProperty('volume', 1.0)
    audio_path = "story_audio.wav"
    engine.save_to_file(text, audio_path)
    engine.runAndWait()
    return audio_path

# ======== Main Pipeline ========
def run_pipeline(user_prompt):
    print("\n📖 Generating story...")
    story = generate_story_with_context(user_prompt)

    print("\n🎨 Generating image...")
    img_path = generate_image(user_prompt)

    print("\n🔊 Generating audio...")
    audio_path = text_to_speech(story)

    print("\n✅ Done!")
    print(f"Story saved as audio: {audio_path}")
    print(f"Image saved as: {img_path}")
    print("\nGenerated Story:\n")
    print(story)

# ======== Run ========
if __name__ == "__main__":
    prompt = "A brave young girl who befriends a dragon in a magical forest"
    run_pipeline(prompt)


Using device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]


📖 Generating story...

🎨 Generating image...


  0%|          | 0/50 [00:00<?, ?it/s]


🔊 Generating audio...

✅ Done!
Story saved as audio: story_audio.wav
Image saved as: generated_image.png

Generated Story:

"Come, little birthday present," he said tenderly. "The dragon will be so pleased. And I'm glad to see you're not crying. You know, my child, we cannot begin too young to learn to think of the happiness of others rather than our own. I should not like my dear little niece to be selfish, or to wish to deny a trivial pleasure to a poor, sick dragon, far from his home and friends." The Princess said she would try not to be selfish. Presently the cab drew up near the pillar, and there was the dragon, his ugly purple head
for a minute, but she saw it quite plainly, and she said to herself: "Dear me, what a curious, shiny, bright-looking creature! If it were bigger, and if I didn't know that there have been no fabulous monsters for quite a long time now, I should almost think it was a dragon." The thing, whatever it was, did look rather like a dragon but then it was to