<a href="https://colab.research.google.com/github/Liorinio/Who-s_that_animal/blob/main/Who's_That_Animal_Application.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and installation

In [1]:
!pip install gradio
!pip install gTTS

Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting click<8.2,>=7.1 (from gTTS)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: click, gTTS
  Attempting uninstall: click
    Found existing installation: click 8.3.0
    Uninstalling click-8.3.0:
      Successfully uninstalled click-8.3.0
Successfully installed click-8.1.8 gTTS-2.5.4


In [2]:
from gtts import gTTS
import IPython.display as display
import gradio as gr
import pandas as pd
from google.colab import drive
import time
import joblib

from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import timm
from torchvision import models
from torchvision import transforms
from PIL import Image
import io
import json
import os

# Inference and extra additions

In [3]:
# Mount Google Drive at /content/drive so the files under MyDrive used below are readable.
# force_remount=True re-mounts even when Drive is already mounted in this session.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
def txt_to_dataframe(file_path):
    """Read a UTF-8 text file and return its paragraphs as a one-column DataFrame.

    Paragraphs are runs of text separated by one or more blank lines. A line
    that contains only whitespace counts as blank, so a stray space or tab on
    a separator line no longer merges two paragraphs into one.

    Args:
        file_path: Path of the text file to read.

    Returns:
        pandas.DataFrame with a single string column "Paragraph", one row per
        non-empty paragraph, in file order. Each paragraph is stripped of
        leading/trailing whitespace.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    with open(file_path, "r", encoding="utf-8") as file:
        raw_text = file.read()

    # "\n\s*\n" matches any run of blank / whitespace-only lines in one go.
    chunks = (chunk.strip() for chunk in re.split(r"\n\s*\n", raw_text))
    paragraphs = [chunk for chunk in chunks if chunk]

    return pd.DataFrame({"Paragraph": paragraphs}, dtype=str)  # Ensure all values are strings

def say_text(text, lang="iw"):
    """Synthesize `text` with gTTS and play it inline in the notebook.

    Args:
        text: The text to speak.
        lang: Language code passed to gTTS (defaults to "iw").

    Side effects:
        Writes (and overwrites) "speech.mp3" in the current working directory
        and displays an autoplaying audio widget.
    """
    audio_file = "speech.mp3"
    gTTS(text=text, lang=lang).save(audio_file)  # text-to-speech, saved to disk
    player = display.Audio(audio_file, autoplay=True)
    display.display(player)  # render the player in the cell output

In [5]:
# Load the animal descriptions from Drive (one paragraph per row in column "Paragraph")
# and preview the first rows.
df = txt_to_dataframe("/content/drive/MyDrive/Who's_That_Animal?/Animals brief descriptions.txt")
df.head()

Unnamed: 0,Paragraph
0,Antelopes are graceful herbivores found primar...
1,Badgers are burrowing mammals recognized by th...
2,Bats are the only mammals capable of sustained...
3,Bears are large mammals found across various c...
4,Bees are flying insects closely related to was...


In [6]:
# Preview the last rows to see where the parsed paragraphs end.
df.tail()

Unnamed: 0,Paragraph
95,Wolves are social carnivores known for their c...
96,"Wombats are sturdy, burrowing marsupials nativ..."
97,Woodpeckers are birds characterized by their s...
98,"Worms are elongated, soft-bodied invertebrates..."
99,Zebras are African equids known for their dist...


In [7]:
# Read the animal names from a header-less Excel sheet into a single "animals_names" column
# and preview the first rows.
df_animals_names = pd.read_excel("/content/drive/MyDrive/Who's_That_Animal?/animal_names.xlsx", header=None, names=["animals_names"])
df_animals_names.head()

Unnamed: 0,animals_names
0,antelope
1,badger
2,bat
3,bear
4,bee


In [8]:
# Preview the last rows of the names table.
df_animals_names.tail()

Unnamed: 0,animals_names
95,wolf
96,wombat
97,woodpecker
98,worm
99,zebra


In [9]:
# Names and descriptions are paired purely by row position (both frames carry the
# default integer index), so they must have the same number of rows; otherwise
# concat would silently pad the shorter one with NaN.
assert len(df_animals_names) == len(df), (
    f"Row count mismatch: {len(df_animals_names)} names vs {len(df)} descriptions"
)

# Side-by-side table: column "animals_names" next to its "Paragraph" description.
df_animals_classes = pd.concat([df_animals_names, df], axis=1)
df_animals_classes.head(100)

Unnamed: 0,animals_names,Paragraph
0,antelope,Antelopes are graceful herbivores found primar...
1,badger,Badgers are burrowing mammals recognized by th...
2,bat,Bats are the only mammals capable of sustained...
3,bear,Bears are large mammals found across various c...
4,bee,Bees are flying insects closely related to was...
...,...,...
95,wolf,Wolves are social carnivores known for their c...
96,wombat,"Wombats are sturdy, burrowing marsupials nativ..."
97,woodpecker,Woodpeckers are birds characterized by their s...
98,worm,"Worms are elongated, soft-bodied invertebrates..."


In [10]:
# Plain-text dump of the description column of the combined table.
print(df_animals_classes['Paragraph'])

0     Antelopes are graceful herbivores found primar...
1     Badgers are burrowing mammals recognized by th...
2     Bats are the only mammals capable of sustained...
3     Bears are large mammals found across various c...
4     Bees are flying insects closely related to was...
                            ...                        
95    Wolves are social carnivores known for their c...
96    Wombats are sturdy, burrowing marsupials nativ...
97    Woodpeckers are birds characterized by their s...
98    Worms are elongated, soft-bodied invertebrates...
99    Zebras are African equids known for their dist...
Name: Paragraph, Length: 100, dtype: object


In [11]:
# ✅ Classification head for convolutional feature maps; the input channel count
# and the number of output classes are supplied by the caller
class CustomClassifier(nn.Module):
    """Global-average-pool a feature map and classify it with a small MLP.

    Args:
        in_features: Number of channels in the incoming feature map.
        num_classes: Number of output logits.
    """

    def __init__(self, in_features, num_classes):
        super().__init__()
        # The layer order fixes the nn.Sequential indices, and with them the
        # state_dict keys ("classifier.0.weight", ...), so it must not change.
        head_layers = [
            nn.Linear(in_features, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return class logits for a 4-D feature map of shape (batch, channels, H, W)."""
        pooled = nn.functional.adaptive_avg_pool2d(x, (1, 1))  # -> (batch, channels, 1, 1)
        flat = pooled.view(pooled.shape[0], -1)  # -> (batch, channels)
        return self.classifier(flat)


# Load the saved LabelEncoder.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only load files you trust.
label_encoder = joblib.load("/content/drive/MyDrive/Who's_That_Animal?/LabelEncoder/labelencoder.pkl")
# Retrieve the class names from the loaded encoder; classify_image later maps a
# model output index i to class_names[i].
class_names = list(label_encoder.classes_)

# NOTE(review): re-fitting the encoder on its own classes looks redundant — confirm and remove if so.
all_labels = class_names
label_encoder.fit(all_labels)


# ✅ Load ResNet18 with ImageNet weights through the `weights=` API.
# `pretrained=True` is deprecated in torchvision; IMAGENET1K_V1 is the checkpoint
# that flag selected (resnet18-f37072fd.pth).
backbone = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

# ✅ Drop the last two children (global average pool + fully connected head) so the
# extractor ends at the last convolutional stage and outputs a feature map
modules = list(backbone.children())[:-2]
feature_extractor = nn.Sequential(*modules)

# ✅ Freeze the backbone: no gradients for any of its parameters
feature_extractor.requires_grad_(False)

# ✅ End-to-end model: feature extractor followed by the custom classification head
class CombinedModel(nn.Module):
    """Chain a feature extractor and a classifier into a single module.

    Args:
        feature_extractor: Module applied to the input first.
        custom_classifier: Module applied to the extractor's output.
    """

    def __init__(self, feature_extractor, custom_classifier):
        super().__init__()
        # These attribute names are the prefixes of the model's state_dict keys.
        self.feature_extractor = feature_extractor
        self.custom_classifier = custom_classifier

    def forward(self, x):
        """Return custom_classifier(feature_extractor(x))."""
        return self.custom_classifier(self.feature_extractor(x))

# ✅ Instantiate new custom classifier and combined model
# The number of output logits follows the number of classes known to the label encoder.
num_classes = len(label_encoder.classes_)
in_features = 512  # ResNet18 output channels before FC
custom_classifier = CustomClassifier(in_features=in_features, num_classes=num_classes)
# This instance only has freshly initialised head weights; the trained weights are
# loaded from Drive by load_model further down.
model = CombinedModel(feature_extractor, custom_classifier)



Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 109MB/s]


In [12]:
# Inspect the encoder's class labels (the values a model output index is mapped to).
print(class_names)

[np.str_('0'), np.str_('1'), np.str_('10'), np.str_('11'), np.str_('12'), np.str_('13'), np.str_('14'), np.str_('15'), np.str_('16'), np.str_('17'), np.str_('18'), np.str_('19'), np.str_('2'), np.str_('20'), np.str_('21'), np.str_('22'), np.str_('23'), np.str_('24'), np.str_('25'), np.str_('26'), np.str_('27'), np.str_('28'), np.str_('29'), np.str_('3'), np.str_('30'), np.str_('31'), np.str_('32'), np.str_('33'), np.str_('34'), np.str_('35'), np.str_('36'), np.str_('37'), np.str_('38'), np.str_('39'), np.str_('4'), np.str_('40'), np.str_('41'), np.str_('42'), np.str_('43'), np.str_('44'), np.str_('45'), np.str_('46'), np.str_('47'), np.str_('48'), np.str_('49'), np.str_('5'), np.str_('50'), np.str_('51'), np.str_('52'), np.str_('53'), np.str_('54'), np.str_('55'), np.str_('56'), np.str_('57'), np.str_('58'), np.str_('59'), np.str_('6'), np.str_('60'), np.str_('61'), np.str_('62'), np.str_('63'), np.str_('64'), np.str_('65'), np.str_('66'), np.str_('67'), np.str_('68'), np.str_('69'), n

In [13]:
# ✅ Define device: use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ Define Image Transform (No augmentation for inference)
# Resizes to 224x224 and converts the PIL image to a float tensor in [0, 1].
# NOTE(review): no transforms.Normalize step is applied — confirm this matches the
# preprocessing used when the model was trained.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# ✅ Load Pretrained Model with Custom Classifier
def load_model(model_path, feature_extractor, custom_classifier):
    """Build a CombinedModel and load trained weights into it for inference.

    Args:
        model_path: Path to a checkpoint file containing the model's state_dict.
        feature_extractor: Backbone module passed to CombinedModel.
        custom_classifier: Classification head passed to CombinedModel.

    Returns:
        The CombinedModel, moved to the module-level `device` and switched to
        eval mode.

    Raises:
        RuntimeError: if the checkpoint's keys/shapes do not match the model
            (raised by load_state_dict).
    """
    # Instantiate the CombinedModel
    model = CombinedModel(feature_extractor, custom_classifier)

    # weights_only=True restricts unpickling to tensors and plain containers, so a
    # tampered checkpoint cannot execute arbitrary code; a state_dict needs no more.
    state_dict = torch.load(model_path, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)

    model.to(device)
    model.eval()  # inference mode: disables dropout, uses BatchNorm running stats
    return model

# ✅ Load the trained weights from Drive into the model used for inference.
# Relies on feature_extractor and custom_classifier defined in the earlier model cell;
# this rebinds the module-level `model` that classify_image calls.
model = load_model("/content/drive/MyDrive/Who's_That_Animal?/SavedSecondModel/savedSecondModelProjectAnimals.pth", feature_extractor, custom_classifier)

In [14]:
def classify_image(img):
    """Run the model on one PIL image and return its top-1 prediction.

    Args:
        img: PIL image to classify.

    Returns:
        dict with a single entry {class_name: confidence}, where class_name is
        the entry of `class_names` at the predicted index and confidence is the
        softmax probability of that class as a Python float.
    """
    # Resize first, then force three channels (e.g. for RGBA or grayscale input)
    rgb_image = img.resize((224, 224)).convert("RGB")

    # Tensor of shape (1, C, H, W) on the inference device
    batch = transform(rgb_image).unsqueeze(0).to(device)

    # No gradients are needed for inference
    with torch.no_grad():
        logits = model(batch)
        probs = torch.softmax(logits, dim=1)
        top_prob, top_index = torch.max(probs, 1)

    label = class_names[top_index.item()]
    return {label: float(top_prob.item())}

In [15]:
# Load the label dictionary used to translate predicted class ids into animal names.
# The encoding is given explicitly (as in txt_to_dataframe) so reading does not
# depend on the platform default.
with open("/content/drive/MyDrive/Who's_That_Animal?/LabelDict/LabelDict.json", 'r', encoding="utf-8") as f:
    label_dict = json.load(f)

def get_animal_label_from_prediction(img):
    """Classify an image and return the prediction keyed by animal name.

    Args:
        img: PIL image, forwarded to classify_image.

    Returns:
        dict with a single entry {animal_name: confidence}. animal_name is the
        key of `label_dict` whose value equals the predicted class id, or
        "Unknown" when no entry has that value.

    Raises:
        ValueError: if the class label returned by classify_image is not an
            integer string.
    """
    predicted_dict = classify_image(img)  # Use classify_image internally

    # classify_image returns exactly one {encoded_class: confidence} pair; unpack it
    # directly instead of re-looking the confidence up through a str -> int -> str
    # round trip of the key.
    encoded_class, confidence_score = next(iter(predicted_dict.items()))
    predicted_key = int(encoded_class)  # Convert string key to int

    # label_dict is inverted so it can be queried by class id.
    inv_label_dict = {v: k for k, v in label_dict.items()}
    predicted_label = inv_label_dict.get(predicted_key, "Unknown")
    return {predicted_label: confidence_score}


In [16]:
def classify_and_speak(img):
    """Classify an uploaded image and synthesize a spoken description of the animal.

    Args:
        img: PIL image from the Gradio image input, or None when the user
            submits without uploading an image.

    Returns:
        tuple (prediction, audio_path):
            prediction: the {animal_name: confidence} dict returned by
                get_animal_label_from_prediction.
            audio_path: path of the generated MP3, or None when no description
                row matches the predicted name (e.g. the label is "Unknown").

    Raises:
        gr.Error: if no image was provided; Gradio shows the message in the UI.
    """
    # Guard: without this, a missing image fails deep inside classify_image.
    if img is None:
        raise gr.Error("Please upload an image before submitting.")

    # Step 1: Predict animal name and confidence
    prediction = get_animal_label_from_prediction(img)
    animal_name = list(prediction.keys())[0]

    # Step 2: Get description (case-insensitive match on the animal name)
    name_matches = df_animals_classes['animals_names'].str.lower() == animal_name.lower()
    descriptions = df_animals_classes.loc[name_matches, 'Paragraph']
    if descriptions.empty:
        # Nothing to read aloud; still show the prediction instead of crashing
        # on .iloc[0] of an empty selection.
        return prediction, None
    text = descriptions.iloc[0]

    # Step 3: Generate audio
    tts = gTTS(text=text, lang="en")
    audio_path = "/tmp/brief_description.mp3"
    tts.save(audio_path)

    # Pause before playing
    # NOTE(review): this delays the whole response (label and audio are returned
    # together) — confirm the 2 s wait is still wanted.
    time.sleep(2)

    # Step 4: Return prediction and audio file
    return prediction, audio_path


In [17]:
# Build the Gradio UI: one uploaded image in; a top-1 label and an autoplaying
# audio description out. classify_and_speak returns the two outputs in this order.
# NOTE(review): newer Gradio releases rename `allow_flagging` to `flagging_mode` —
# confirm against the installed Gradio version.
interface = gr.Interface(
    fn=classify_and_speak,
    inputs=gr.Image(type="pil", label="Upload Animal Image", sources=["upload"]),
    outputs=[
        gr.Label(num_top_classes=1, label="Predicted Animal"),
        gr.Audio(label="Click to Hear Description", autoplay=True)
    ],
    title="🐾 Who's That Animal?",
    description="Upload an animal image to classify it and hear a spoken description.",
    allow_flagging="never"
)

# Start the app; on Colab Gradio enables a temporary public share link automatically.
interface.launch()



It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://94674a43e1e3742864.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


