# Animal to Dino Transformer

## Imports

In [1]:
%matplotlib widget
import matplotlib.pyplot as plt     # plotting
from matplotlib import cm           # colormaps
import pandas as pd                 # data processing, CSV file I/O
import numpy as np                  # linear algebra
from tqdm import tqdm               # fancy progress bars

#suppress warnings
import warnings
warnings.filterwarnings('ignore')

# pytorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

# load in animal data
animal_df = pd.read_csv('Datasets\Animal-Info.csv')

  animal_df = pd.read_csv('Datasets\Animal-Info.csv')


In [9]:
import ipywidgets as widgets
import pandas as pd
from IPython.display import display
from PIL import Image
import time

import pandas as pd
import numpy as np

import zipfile as zf

images1 = zf.ZipFile("Images/images1.zip", "r")
images1.extractall('images1')

images2 = zf.ZipFile("Images/images2.zip", "r")
images2.extractall('images2')

images1.close()
images2.close()

dinosaur = pd.read_csv('Datasets/jurassicparkwithweights.csv')
distances = pd.read_csv('Datasets/distances.csv')
animal = pd.read_csv('Datasets/Clean-Animal-Info.csv')

animals = pd.read_csv("Datasets/Animal-Info.csv")

## Machine Learning Cleaning and Training

In [10]:
# ML Cleaning
def remove_na(df):
    temp = df
    for column in temp.columns:
        temp = temp[(temp[column] != 'Varies') & (temp[column] != 'Not Applicable')]
    return temp

def clean_column(item):
    item = item.replace(',', '')
    if item[-1] == 'm':
        split = item.split()
        item = str(float(split[2][:-1])*100)
    if item.count('tons') > 0:
        split = item.split()
        item = str(float(split[2])*1000)
        
    out = 0
    split = item.replace(' ', '-').split('-')
    if item.count('-') > 0:
        out = (float(split[0]) + float(split[1]))/2
    if item.count('Up to') > 0:
        out = float(item.split()[-1])/2
    if split[-1] == 'months':
        out *= 30
    if split[-1] == 'weeks':
        out *= 7
    only_nums = lambda x: ''.join([i for i in x if i.isnumeric() or i == '.'])
    
    return out if out else float(only_nums(item))

def z_score(x, based_on=None):
    if based_on is None: based_on = x
    return (x - np.mean(based_on))/np.std(based_on)

def normalize(nums, scale=np.log):
    nums = scale(nums)
    return z_score(nums)

In [11]:
# Further Cleaning

clean_animals = remove_na(animal_df)
columns_to_clean = ['Height (cm)', 'Weight (kg)', 'Lifespan (years)', 
                    'Average Speed (km/h)', 'Top Speed (km/h)', 
                    'Gestation Period (days)']

for column in columns_to_clean:
    category = ' '.join(column.split()[:-1])
    clean_animals = clean_animals.assign(**{category: clean_animals[column].apply(clean_column)})
    clean_animals = clean_animals.assign(**{f'Normal {category}': normalize(clean_animals[category])})

In [12]:
# Data Loading

inputs = ['Normal Height', 'Normal Weight']
outputs = ['Lifespan', 'Average Speed', 'Top Speed', 'Gestation Period']
input_tensors = torch.tensor(np.array(clean_animals[inputs]), dtype=torch.float)
output_tensors = torch.tensor(np.array(clean_animals[outputs]), dtype=torch.float)
X_train, X_test, Y_train, Y_test = train_test_split(input_tensors, output_tensors, test_size=0.1)

BATCH_SIZE = 8
LAYERS = [len(inputs), 10, 10, 10, len(outputs)]
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'

In [13]:
# Custom Dataset

class CustomDataset(Dataset):
    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs
        
    def __len__(self):
        return self.inputs.shape[0]
    
    def __getitem__(self, idx):
        input = self.inputs[idx]
        output = self.outputs[idx]
        return input, output
    
train_ds = CustomDataset(X_train, Y_train)
test_ds = CustomDataset(X_test, Y_test)

train_dl = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE)
test_dl = DataLoader(dataset=test_ds, batch_size=BATCH_SIZE)

In [14]:
# Training
class Network(nn.Module):
    def __init__(self, dims, stack=None):
        super().__init__()
        if stack:
            self.stack = stack
        else:
            self.stack = nn.Sequential(nn.Dropout(p=0.1))
            for i in range(len(dims)-1):
                self.stack.append(nn.Linear(dims[i], dims[i+1]))
                if i != len(dims)-2:
                    self.stack.append(nn.ReLU())
                    self.stack.append(nn.Dropout(p=0.05))
                    
    def forward(self, x):
        x = x.view(-1, len(inputs))
        return self.stack(x)
    
def run(X, Y, model, loss):
    X, Y = X.to(device), Y.to(device)
    y_pred = model(X)
    y_pred, Y = y_pred.cpu(), Y.cpu()
    error = loss(y_pred, Y)
    return error

def train_epoch(model, optimizer, loss):
    model.train()
    errors = []

    for X, Y in train_dl:
        error = run(X, Y, model, loss)
        errors.append(error.item())

        optimizer.zero_grad()
        error.backward()
        optimizer.step()

    return np.mean(errors)

def evaluate(model, loss):
    model.eval()
    errors = []
    
    with torch.no_grad():
        for X, Y in test_dl:
            error = run(X, Y, model, loss)
            errors.append(error.item())
    
    return np.mean(errors)

def show_plot(model, x, y, z, show_pred=False, show_terrain=False, show_all=False):
    fig = plt.figure()
    norm_x, norm_y, norm_z = f'Normal {x}', f'Normal {y}', f'Normal {z}'
    
    loop_len = range(len(outputs)) if show_all else [outputs.index(z)]
    for i in loop_len:
        if show_all: ax = fig.add_subplot(2, 2, i+1, projection='3d')
        else: ax = fig.add_subplot(projection='3d')

        if show_pred:
            if show_terrain:
                X = np.arange(-5, 5, 0.25)
                Y = np.arange(-5, 5, 0.25)
                grid_X, grid_Y = np.meshgrid(X, Y)
                grid_X = torch.tensor(grid_X, dtype=torch.float)
                grid_Y = torch.tensor(grid_Y, dtype=torch.float)
            
            model.eval()
            with torch.no_grad():
                if show_terrain:
                    Z = model(torch.stack((grid_X, grid_Y), dim=2).reshape(-1, 2)).cpu().numpy()
                    ax.plot_surface(grid_X, grid_Y, Z[:, i].reshape(len(X), len(Y)), cmap=cm.coolwarm)
                else:
                    inputs = torch.tensor(np.array(clean_animals[[norm_x, norm_y]]), dtype=torch.float, device=device)
                    Z = model(inputs).cpu().numpy()
                    ax.scatter(clean_animals[norm_x], clean_animals[norm_y], Z[:, i], marker='o')
            
        ax.scatter(clean_animals[norm_x], clean_animals[norm_y], clean_animals[z], marker='^', color='orange')
        ax.set_xlabel(x)
        ax.set_ylabel(y)
        ax.set_zlabel(z)

    plt.show()
    
    
def Train(learn_rate=0.001, epochs=30, plot_info=True):
    network = Network(LAYERS)
    network = network.to(device)
    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(network.parameters(), lr = learn_rate, weight_decay=1e-5)

    train_errors = []
    test_errors = []

    for epoch in (bar := tqdm(range(epochs))):
        batch_error = train_epoch(network, optimizer, loss_fn)
        train_errors.append(batch_error)

        test_error = evaluate(network, loss_fn)
        test_errors.append(test_error)
        
        bar.set_description(f'Error: {test_error:.2f}')

    if plot_info:
        fig, error = plt.subplots(figsize=(14, 5))

        error.plot(train_errors)
        error.plot(test_errors)
        error.legend(['Train', 'Test'])
        error.set_xlabel('Epochs')
        error.set_ylabel('Error')
        error.set_title(f'Train Error: {train_errors[-1]:.2f} Test Error: {test_errors[-1]:.2f}')

    return network



In [17]:
# Final ml stuff
network = Train(epochs=100, plot_info=False)

#show_plot(network, 'Height', 'Weight', 'Lifespan', show_pred=True)

dinos = pd.read_csv('Datasets/jurassicparkwithweights.csv')
clean_dinos = dinos.drop(columns='Unnamed: 0').set_index('name')

def pred_stat(dino, stat, model):
    METERS2CM = 100
    length2height = lambda length: 0.53*length - 0.73   # From Regression
    
    info = clean_dinos[['length', 'weight']].loc[dino]
    info[0] = length2height(info[0])
    
    norm_height = z_score(info[0]*METERS2CM, based_on=clean_animals['Height'])
    norm_weight = z_score(info[1], based_on=clean_animals['Weight'])
    inputs = torch.tensor([norm_height, norm_weight], dtype=torch.float)
    predictions = model(inputs).squeeze()
    return predictions[outputs.index(stat)].item()    


Error: 4216.18: 100%|██████████| 100/100 [00:01<00:00, 57.14it/s]


## Similarity Calculator Cleaning and Calculation

In [18]:
# Clean distances DataFrame
def clean_whitespace(country):
    return country[1:]

distances = distances.assign(Country2=distances['Country2'].apply(clean_whitespace))
distances = distances.set_index('Country1')

In [19]:
# Clean dinosaur DataFrame
def format_diet(diet): 
    conversion = {
        "herbivorous": 'Herbivore',
        "carnivorous": 'Carnivore',
        "omnivorous": 'Omnivore'
    }
    if diet in conversion:
        return conversion[diet]
    return None

def format_height(height):
    height = str(height)
    height = height.strip('m')
    return .53 * float(height)

def clean_countries(country):
    if country == 'USA':
        return "United States"
    elif country == 'North Africa':
        return "Algeria"
    elif country == 'Wales':
        return "United Kingdom"
    else:
        return country
    
def fix_diet(diet):
    insectivore = diet == 'Insectivore' or diet == 'Carnivore, Insectivore'
    piscivore = diet == 'Carnivore, Piscivore' or diet == 'Piscivore'
    scavenger = diet == 'Carnivore, Scavenger'
    filter_feeder = diet == 'Filter Feeder'
    omnivore = diet == 'Herbivore, Omnivore' or diet == 'Omnivore, Herbivore'
    insects = diet == 'Nectar, Insects' or diet == 'Insectivore, Herbivore' or diet == 'Omnivore, Insectivore'
    
    if insectivore or piscivore or scavenger or filter_feeder:
        return 'Carnivore'
    elif omnivore or insects:
        return 'Omnivore'
    else:
        return diet 

def convert_diet(diet):
    if diet == 'Carnivore':
        return 0
    elif diet == 'Omnivore':
        return 1
    elif diet == 'Herbivore':
        return 2    

new_dino = dinosaur.assign(
    Diet = dinosaur['diet'].apply(format_diet),
    Country = dinosaur['lived_in'],
    Height = dinosaur['length'].apply(format_height)
)

final_dino = new_dino[['name', 'Diet','Country', 'Height', 'weight']]
final_dino = final_dino.set_index('name')
final_dino = final_dino.assign(Country=final_dino['Country'].apply(clean_countries))
final_dino = final_dino.dropna()
final_dino = final_dino.assign(Diet=final_dino['Diet'].apply(convert_diet))

In [20]:
# Clean animal DataFrame
def convert_height(height): 
    return height / 100

final_animal = animal[['Height', 'Diet', 'Animal', 'Countries Found', 'Weight']]

final_animal = final_animal.assign(
    Height = final_animal['Height'].apply(convert_height),
    Country = final_animal['Countries Found']
)

final_animal = final_animal.set_index('Animal')

final_animal = final_animal[['Diet', 'Country','Height', 'Weight']]
final_animal = final_animal.assign(Diet=final_animal['Diet'].apply(fix_diet))
final_animal = final_animal.assign(Diet=final_animal['Diet'].apply(convert_diet))

In [21]:
# Normalization
def normalize(x, m):
    return (0.5*m) / (1+np.exp(10/m * (0.5*m-x)))

def find_difference(animal, feature1, feature2, index):
    difference = np.abs(final_animal.loc[animal][feature1]-final_dino[feature2])
    feature_mean = difference.mean()
    delta_feature = np.abs((final_animal.loc[animal][feature1] - final_dino.iloc[index][feature2]))
    out = ((delta_feature) / feature_mean) ** 2
    return out

In [22]:
# Similarity Function
def calculate_similarity(animal):
    similarity_array = np.array([])
    animal_location = final_animal.loc[animal]['Country']
    distances_location = distances.reset_index()
    distances_location = distances_location[distances_location['Country1'] == animal_location]
    
    for i in range(final_dino.shape[0]):
        dist_sum = 0
        #Diet Similarity
        difference = np.abs(final_animal.loc[animal]['Diet']-final_dino['Diet'])
        dist_sum += difference.iloc[i] ** 2
             
        #Height Similarity
        dist_sum += find_difference(animal, 'Height', 'Height', i)
        
        
        #Weight Similarity
        dist_sum += find_difference(animal, 'Weight', 'weight', i)
        
        #Distance Similarity
        delta_dist = distances_location[distances_location['Country2']==final_dino.iloc[i]['Country']]['Distance']
        dist_mean = distances_location['Distance'].mean()

        dist_sum += ((delta_dist) / dist_mean) ** 2
        
        similarity_score = dist_sum ** 0.5
        similarity_array = np.append(similarity_array, similarity_score)
    
    updated_df = final_dino.assign(Similarity_Score=similarity_array)
    return updated_df.sort_values('Similarity_Score', ascending=True).index[0]

## Main Functions For Printing Information

In [28]:
def compare_stats(animal, dino):
    animal_stats = final_animal[['Diet','Height','Weight','Country']].loc[animal]
    dino_stats = final_dino[['Diet','Height','weight','Country']].loc[dino]
    
    convert_diet = ['Carnivore','Omnivore','Herbivore']
    
    # Compare height/weight:
    height_string = animal + "'s are approximately "\
        + str(animal_stats['Height']) + " meters tall "\
        "while " + dino + "'s were approximately "\
        + str(dino_stats['Height']) + " meters tall."
    weight_string = animal + "'s are approximately "\
        + str(animal_stats['Weight']) + " kilograms "\
        "while " + dino + "'s were approximately "\
        + str(dino_stats['weight']) + " kilograms."
    if animal_stats['Diet'] == dino_stats['Diet']:
        diet_string = animal + "'s " + dino + "'s are/were both "\
            + str(convert_diet[animal_stats['Diet']]) + "s!"
    else:
        diet_string = animal + "'s are " + animal_stats['Diet'] + "'s while "\
            + dino + "'s were " + convert_diet[dino_stats['Diet']] + "s."
    if animal_stats['Country'] == dino_stats['Country']:
        country_string = animal + "s and " + dino + "'s are/were both from "\
            + animal_stats['Country'] + "!"
    else:
        country_string = animal + "'s are from " + animal_stats['Country'] + " while "\
            + dino + "'s were from " + dino_stats['Country'] + "."
        
    print(height_string)
    print(weight_string)
    print(diet_string)
    print(country_string)
    
    
def additional_features(dino):
    output_units = ['years', 'km/hr', 'km/hr', 'days']
    for i, output in enumerate(outputs):
        print(f'Predicted {output}: {round(pred_stat(dino, output, network))} ' + output_units[i])


def show_image(organism):
    try:
        display(Image.open("images1/images1/"+ organism +".png"))
    except:
        display(Image.open("images2/images2/"+ organism +".png"))    

def transform_animal(animal):
    most_similar = calculate_similarity(animal)
    print("You put the following animal into the animal-dinosaur tranformer:")
    #time.sleep(1)
    show_image(animal)
    #time.sleep(1)
    print("Say goodbye! Forever!")
    #time.sleep(1)
    print("You shut the door and press the big red button")
    #time.sleep(1)
    for i in range(3):
        print("Transforming...")
        #time.sleep(1)
    print("Something new walks out! It's different, yet somehow similar...")
    #time.sleep(1)
    #similarities = calculate_similarity_rank(animal)
    #name_of_most_similar = similarities.index[0]
    print("Congrats!")
    #time.sleep(1)
    print("You now have a " + most_similar + "!")
    #time.sleep(1)
    print("Here's your new best friend!")
    #time.sleep(1)
    show_image(most_similar)
    compare_stats(animal, most_similar)
    additional_features(most_similar)
    
    
    


## Dropdown and Final Program

### Choose Animal Here

In [29]:
dropdown = widgets.Dropdown(
    options= animals['Animal'],
    value='Aardvark',
    description='Animal',
    disabled=False,
)
        
dropdown

Dropdown(description='Animal', options=('Aardvark', 'Aardwolf', 'African Elephant', 'African Lion', 'African W…

### Run The Code Here

In [30]:
try:
    transform_animal(dropdown.value)
except:
    print("Image not found in database.")

Image not found in database.
