In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Gather the full path of every file found under the Kaggle input directory,
# then print them one per line in the order os.walk discovers them.
input_file_paths = [
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Avatar Video Generation Pipeline using SadTalker (Kaggle-based)

This project generates a talking avatar video given a topic using an end-to-end GenAI pipeline. The full pipeline includes avatar creation, script generation, voice synthesis, and animation using SadTalker. Below is a structured breakdown.

In [None]:

!pip install -q diffusers transformers accelerate sentence-transformers gTTS faiss-cpu gradio
!pip install -q git+https://github.com/CompVis/taming-transformers.git
!pip install -q opencv-python==4.7.0.72 face_alignment imageio nest_asyncio

# Step 1: Get the topic from the user
topic = input(" Enter the topic you want the avatar to explain (e.g., 'nitrogen cycle'): ").strip()
if not topic:
    # An empty topic would otherwise be embedded in the script prompt as "Explain ''".
    raise ValueError("Topic must not be empty.")
print(f" Generating script for topic: {topic}")

**Avatar Image Generation**

Here I have used only the Stable Diffusion v1.5 model (`runwayml/stable-diffusion-v1-5`) to generate the avatar image.

In [None]:
#  STEP 2: Generate Avatar Image with better facial symmetry
from diffusers import StableDiffusionPipeline
import torch

def generate_avatar_image(topic_desc, seed=None):
    """Generate the avatar portrait with Stable Diffusion v1.5 and save it as avatar.jpg.

    Args:
        topic_desc: Topic passed in by the caller. It is accepted for interface
            compatibility but is not used when building the prompt.
        seed: Optional integer. When given, sampling is seeded so the same
            portrait can be regenerated; when None, every call is random.

    Returns:
        The string "avatar.jpg", i.e. the path (relative to the current working
        directory) that the image was written to.
    """
    # Add more features here according to how the avatar should look.
    prompt = "high-resolution ultra-realistic photo portrait of a smiling  woman teacher, "
    negative_prompt = " unrealistic, disfigured face, cartoonish"

    pipe = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16
    ).to("cuda")

    sampling_kwargs = {}
    if seed is not None:
        sampling_kwargs["generator"] = torch.Generator(device="cuda").manual_seed(seed)

    try:
        try:
            image = pipe(prompt=prompt, negative_prompt=negative_prompt, **sampling_kwargs).images[0]
        except TypeError:
            # Best-effort fallback kept from the original code, but no longer silent:
            # the image is generated without the negative prompt.
            print("negative_prompt was rejected by the pipeline; retrying without it")
            image = pipe(prompt=prompt, **sampling_kwargs).images[0]
    finally:
        # Drop the pipeline and hand cached GPU memory back so later steps
        # (script generation loads another model on the same GPU) have room.
        del pipe
        torch.cuda.empty_cache()

    image.save("avatar.jpg")
    print("🖼️ Avatar image saved as avatar.jpg")
    return "avatar.jpg"

# Generate the avatar image; the returned file path is kept in avatar_path
avatar_path = generate_avatar_image(topic)


       


    




In [None]:
# Check whether the generated avatar image is to your liking. If it is not, go back to
# Step 2 and use a different prompt (or add any enhancements you want in the image).
# If the face looks a little distorted, ignore it: it will be fixed in Step 6 by the
# SadTalker face enhancer.
from IPython.display import Image, display

avatar_preview = Image(filename="avatar.jpg")
display(avatar_preview)

# Phase I - Script Generation

In [None]:
#  STEP 3: Generate Script using TinyLLaMA (fixed)
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

def _extract_assistant_reply(generated, prompt):
    """Return only the assistant's first reply from the pipeline's generated text."""
    if generated.startswith(prompt):
        reply = generated[len(prompt):]
    else:
        # Fallback: keep everything after the first assistant tag.
        reply = generated.split("<|assistant|>", 1)[-1]
    # If the model kept going and invented further chat turns, stop at the first one.
    for role_tag in ("<|user|>", "<|system|>", "<|assistant|>"):
        reply = reply.split(role_tag, 1)[0]
    return reply.strip()


def generate_script(topic):
    """Generate an explanatory script for ``topic`` with TinyLlama and save it to script.txt.

    Args:
        topic: The subject the script should explain; it is inserted into the
            chat-style prompt.

    Returns:
        The generated script text (also written to "script.txt" in the current
        working directory).

    Raises:
        ValueError: If the model's reply is empty or shorter than 20 characters.
    """
    tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    model = model.to("cuda")

    gen = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)

    #  Chat-style prompt works better for this model
    prompt = f"<|system|>\nYou are a knowledgeable tutor.\n<|user|>\nExplain '{topic}' in a clear and engaging way.\n<|assistant|>\n"

    result = gen(prompt, max_new_tokens=500, do_sample=True, temperature=0.7)[0]["generated_text"]

    # Keep the reply that directly follows the prompt. Splitting on the LAST
    # "<|assistant|>" tag would return a hallucinated follow-up turn instead
    # whenever the model continues the conversation on its own.
    script_text = _extract_assistant_reply(result, prompt)

    if not script_text or len(script_text) < 20:
        raise ValueError(" Script generation failed: model returned empty or too short response.")

    # Explicit encoding so non-ASCII characters in the script are written
    # the same way regardless of the platform's default locale.
    with open("script.txt", "w", encoding="utf-8") as f:
        f.write(script_text)
    print(" Script generated and saved as script.txt")
    return script_text

# Generate the script for the chosen topic
script_text = generate_script(topic)


# Phase II  -  Voice Generation 

In [None]:
# STEP 4: Generate Voice using gTTS     this was earlier done using edge -tts library which generated different types of voices but edge -tts does not support kaggle 
!pip install -q gTTS

from gtts import gTTS
import os

def generate_voice(script, lang="en"):
    """Synthesize ``script`` to speech with gTTS and save it as audio.mp3.

    Args:
        script: Text to speak. It must be non-empty and at least 10 characters
            long once surrounding whitespace is removed.
        lang: Language code forwarded to gTTS.

    Returns:
        The string "audio.mp3", the file the speech was saved to.

    Raises:
        ValueError: If ``script`` is empty or too short.
    """
    long_enough = bool(script) and len(script.strip()) >= 10
    if not long_enough:
        raise ValueError(" Script is empty or too short for TTS.")

    output_file = "audio.mp3"
    speech = gTTS(text=script, lang=lang)
    speech.save(output_file)
    print(" Audio saved as audio.mp3")
    return output_file

# Generate the narration audio from the script; audio_path holds the saved file's path
audio_path = generate_voice(script_text)

# Phase III - Avatar Generation

**Model implementation - SadTalker**

In [None]:
# STEP 5: Clone and Set Up SadTalker
!git clone https://github.com/OpenTalker/SadTalker.git

In [None]:
%cd SadTalker

In [None]:
# Download the pretrained models with the download script under scripts/ (only needed on first use)
!bash scripts/download_models.sh

In [None]:
# Install requirements
!pip install -r requirements.txt --quiet

In [None]:
!pip install facexlib --no-deps --quiet
!pip install basicsr --no-deps --quiet

In [None]:
!pip install torch==2.0.1 torchvision==0.15.2 --force-reinstall --quiet

In [None]:
# Copy (not move) the generated audio and avatar image into the SadTalker folder.
# Absolute paths are used on both sides so the result does not depend on the
# current working directory.
import shutil

for asset_name in ("audio.mp3", "avatar.jpg"):
    shutil.copy(f"/kaggle/working/{asset_name}", f"/kaggle/working/SadTalker/{asset_name}")

In [None]:

!pip install numpy==1.23.5 --force-reinstall --quiet #mandatory step
#After this restart kernal ie select option restart and clear cell outputs  


**Right after installing NumPy, restart the kernel and run only the cells after this point (i.e., from Step 6).**

In [1]:
# Step 6: switch into the SadTalker folder
%cd /kaggle/working/SadTalker

/kaggle/working/SadTalker


In [15]:
# Increase the recursion limit (SadTalker uses deep calls); according to the author, without
# this the maximum recursion depth is reached and the final output fails.
# NOTE(review): sys.setrecursionlimit only changes the limit of this kernel's interpreter,
# while inference.py is started with `!python` as a separate process — confirm that this
# cell actually influences the inference run.
import sys
sys.setrecursionlimit(20000)

In [14]:
# Mandatory step: pin imageio to 2.31.1 to avoid the infinite recursion error.
# (In the recorded output below this replaces imageio 2.19.3, so it is an upgrade, not a downgrade.)
!pip uninstall -y imageio
!pip install imageio==2.31.1

Found existing installation: imageio 2.19.3
Uninstalling imageio-2.19.3:
  Successfully uninstalled imageio-2.19.3
Collecting imageio==2.31.1
  Downloading imageio-2.31.1-py3-none-any.whl.metadata (4.7 kB)
Downloading imageio-2.31.1-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.2/313.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: imageio
Successfully installed imageio-2.31.1


In [16]:
# Final step: generate the video. This can take a while — in the recorded run below, face
# rendering and face enhancement took roughly 15 minutes each.
!python inference.py \
  --driven_audio audio.mp3 \
  --source_image avatar.jpg \
  --result_dir results \
  --still \
  --enhancer gfpgan         # used for enhacing the avatar
  

using safetensor as default
3DMM Extraction for source image
landmark Det:: 100%|██████████████████████████████| 1/1 [00:00<00:00,  9.51it/s]
3DMM Extraction In Video:: 100%|██████████████████| 1/1 [00:00<00:00, 13.59it/s]
mel:: 100%|██████████████████████████████| 2637/2637 [00:00<00:00, 26340.32it/s]
audio2exp:: 100%|████████████████████████████| 264/264 [00:00<00:00, 464.66it/s]
Face Renderer:: 100%|███████████████████████| 1319/1319 [14:34<00:00,  1.51it/s]
The generated video is named results/2025_06_26_06.53.58/avatar##audio.mp4
face enhancer....
Face Enhancer:: 100%|███████████████████████| 2637/2637 [15:21<00:00,  2.86it/s]
The generated video is named results/2025_06_26_06.53.58/avatar##audio_enhanced.mp4
The generated video is named: results/2025_06_26_06.53.58.mp4
[0m

**The video is generated in the directory `/kaggle/working/SadTalker/results`, which can be accessed from the right-hand sidebar. Open `results` in that directory to find the `.mp4` video and download it.**