In [440]:
import pandas as pd
import numpy as np
import re

In [441]:
# Load the raw dataset from the working directory and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column that just counts rows —
# presumably the index that was saved with the CSV; confirm before using it as data.
df = pd.read_csv('Animal Dataset.csv')
df.head()

Unnamed: 0,Animal,Height (cm),Weight (kg),Color,Lifespan (years),Diet,Habitat,Predators,Average Speed (km/h),Countries Found,Conservation Status,Family,Gestation Period (days),Top Speed (km/h),Social Structure,Offspring per Birth
0,Aardvark,105-130,40-65,Grey,20-30,Insectivore,"Savannas, Grasslands","Lions, Hyenas",40,Africa,Least Concern,Orycteropodidae,210-240,40,Solitary,1
1,Aardwolf,40-50,8-14,Yellow-brown,10-12,Insectivore,"Grasslands, Savannas","Lions, Leopards",24-30,Eastern and Southern Africa,Least Concern,Hyaenidae,90,40,Solitary,2-5
2,African Elephant,270-310,2700-6000,Grey,60-70,Herbivore,"Savannah, Forest","Lions, Hyenas",25,Africa,Vulnerable,Elephantidae,640-660,40,Herd-based,1
3,African Lion,80-110,120-250,Tan,10-14,Carnivore,"Grasslands, Savannas","Hyenas, Crocodiles",58,Africa,Vulnerable,Felidae,98-105,80,Group-based,2-4 (usually)
4,African Wild Dog,75-80,18-36,Multicolored,10-12,Carnivore,Savannahs,"Lions, Hyenas",56,Sub-Saharan Africa,Endangered,Canidae,70,56,Group-based,10-12


In [442]:
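# -------  Data Cleaning Steps --------
# Check each numeric column for missing or non-numeric values
# Remove rows with null values (done last, after every column has been converted)
# Replace range strings (e.g. "40-65") in numeric columns with the average of their bounds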

In [443]:
# True means at least one height entry is missing or cannot be parsed as a plain
# number (errors='coerce' turns unparseable entries such as "105-130" into NaN).
pd.to_numeric(df['Height (cm)'], errors='coerce').isnull().any()

np.True_

In [444]:
# Replace "Up to N" height entries with just their number so the later
# range-averaging step can convert them. The value stays a string here and all
# other entries are passed through untouched.
def _strip_up_to(x):
    """Return the number in an "Up to N" string (as text); pass anything else through."""
    if not (isinstance(x, str) and x.strip().lower().startswith('up to')):
        return x
    # Keep the decimal part ("Up to 2.5" -> "2.5") and ignore thousands
    # separators ("Up to 1,000" -> "1000"); a bare \d+ would truncate both.
    found = re.search(r'\d+(?:\.\d+)?', x.replace(',', ''))
    # No digits at all: leave the text unchanged instead of raising on a None match.
    return found.group() if found else x


df['Height (cm)'] = df['Height (cm)'].apply(_strip_up_to)

In [456]:
def convert_range_to_avg(val):
    """Turn a height entry into a single float.

    Numbers are returned as floats. Anything else is converted to text, has
    surrounding whitespace and a " cm" suffix removed and en dashes replaced
    by hyphens. A two-part "min-max" text yields the mean of both parts;
    any other text yields the float of the part before the first hyphen.

    Returns None (after printing the error) when conversion fails.
    """
    try:
        # Already numeric: nothing to parse.
        if isinstance(val, (int, float)):
            return float(val)

        val = str(val).strip().replace(" cm", "").replace("–", "-")
        pieces = val.split("-")
        first = float(pieces[0])
        if len(pieces) != 2:
            # Single number (or more than two parts): use the leading part only.
            return first
        return (first + float(pieces[1])) / 2
    except Exception as err:
        print(f"Error converting '{val}':", err)
        return None

# Apply to Height (cm) column
df["Height (cm)"] = df["Height (cm)"].apply(convert_range_to_avg)
# Sanity check: rows whose converted height is still not an int/float.
# The result is not used anywhere else in the visible notebook.
non_numeric = df[~df['Height (cm)'].apply(lambda x: isinstance(x, (int, float)))]

In [446]:


def is_numeric_string(val):
    """Return True if ``float(val)`` succeeds, False otherwise.

    Plain numbers and numeric text such as "90" or "0.5" pass; ranges such as
    "40-65", free text and None do not.
    """
    try:
        float(val)  # Will only work for values like "90", "0.5", etc.
        return True
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures count as "not numeric"; KeyboardInterrupt and
        # other unrelated errors must not be swallowed as a bare except would.
        return False

# Collect the weight entries that float() cannot parse directly (ranges, free
# text, ...) for inspection.
non_numeric_strings = df["Weight (kg)"][~df["Weight (kg)"].apply(is_numeric_string)]

# print("🛑 Non-numeric string values:")
# print(non_numeric_strings)


In [447]:
def clean_weight_value(val):
    """Convert a raw weight entry to a float.

    Tried in order:
      1. plain number ("90", "0.5", 12) -> that number;
      2. two-part range ("40-65", "40–65") -> average of both bounds;
      3. text ending in a number ("Up to 32") -> that trailing number.
    Thousands separators are removed first ("2,700" -> 2700.0).

    Returns None when the value cannot be parsed.
    """
    # Normalise: drop thousands separators and treat en/em dashes as hyphens, so
    # "40–65" is averaged like "40-65" instead of collapsing to its upper bound.
    val = str(val).strip().replace(",", "").replace("–", "-").replace("—", "-")

    # Case 1: Direct number
    try:
        return float(val)
    except ValueError:
        pass

    # Case 2: Range like "40-65"
    parts = val.split("-")
    if len(parts) == 2:
        try:
            return (float(parts[0]) + float(parts[1])) / 2
        except ValueError:
            # Two parts but not numeric on both sides (e.g. "2-4 (usually)").
            return None

    # Case 3: "Up to 32" → take the number at the end
    match = re.search(r'(\d+\.?\d*)$', val)
    if match:
        return float(match.group(1))

    # Default: Cannot parse
    return None
    
# Overwrite the raw weight strings with their parsed float values; entries the
# cleaner cannot parse come back as None.
df["Weight (kg)"] = df["Weight (kg)"].apply(clean_weight_value)


In [448]:
# Lifespan is cleaned with exactly the same rules as weight (plain number,
# "min-max" range, trailing number), so reuse clean_weight_value from the
# weight-cleaning cell above instead of redefining an identical copy here.
df["Lifespan (years)"] = df["Lifespan (years)"].apply(clean_weight_value)

In [449]:
def clean_speed_value(val):
    """Convert a raw average-speed entry to a float.

    Tried in order:
      1. plain number ("40", 58) -> that number;
      2. two-part range ("24-30", "24–30") -> average of both bounds;
      3. text ending in a number ("Up to 32") -> that trailing number.
    Thousands separators are removed first.

    Returns None when the value cannot be parsed.
    """
    # Normalise: drop thousands separators and treat en/em dashes as hyphens, so
    # "24–30" is averaged like "24-30" instead of collapsing to its upper bound.
    val = str(val).strip().replace(",", "").replace("–", "-").replace("—", "-")

    # Case 1: Direct number
    try:
        return float(val)
    except ValueError:
        pass

    # Case 2: Range like "40-65"
    parts = val.split("-")
    if len(parts) == 2:
        try:
            return (float(parts[0]) + float(parts[1])) / 2
        except ValueError:
            # Two parts but not numeric on both sides.
            return None

    # Case 3: "Up to 32" → take the number at the end
    match = re.search(r'(\d+\.?\d*)$', val)
    if match:
        return float(match.group(1))

    # Default: Cannot parse
    return None
    
# Overwrite the raw average-speed strings with their parsed float values;
# entries the cleaner cannot parse come back as None.
df["Average Speed (km/h)"] = df["Average Speed (km/h)"].apply(clean_speed_value)

In [450]:
# Gestation period is cleaned with exactly the same rules as average speed
# (plain number, "min-max" range, trailing number), so reuse clean_speed_value
# from the cell above instead of redefining an identical copy under the same name.
df["Gestation Period (days)"] = df["Gestation Period (days)"].apply(clean_speed_value)

In [451]:
def clean_offspring_value(val):
    """Convert a raw "offspring per birth" entry to a float.

    Tried in order:
      1. plain number ("1") -> that number;
      2. exactly two numbers anywhere in the text ("2-5", "2-4 (usually)")
         -> their average;
      3. text ending in a number ("Up to 25") -> that trailing number.

    Returns None when the value cannot be parsed.
    """
    try:
        val = str(val).strip().replace(",", "")  # remove commas if any

        # Case 1: Direct number
        try:
            return float(val)
        except ValueError:
            pass

        # Case 2: Range like "2-5" or "2-4 (usually)".
        # Every character that is not a digit, dot or hyphen becomes a hyphen, so
        # splitting on "-" leaves the numeric tokens plus empty strings to discard.
        val_cleaned = re.sub(r"[^\d\.-]", "-", val)
        parts = [p for p in val_cleaned.split("-") if p.strip().replace(".", "").isdigit()]
        if len(parts) == 2:
            return (float(parts[0]) + float(parts[1])) / 2

        # Case 3: "Up to 25" → take the last number
        match = re.search(r'(\d+\.?\d*)$', val)
        if match:
            return float(match.group(1))

        return None
    except ValueError:
        # A token that looked numeric but is not a valid float (e.g. "1..2").
        # Only conversion errors are treated as "unparseable"; anything else
        # (KeyboardInterrupt, programming errors) is allowed to surface.
        return None
    
# Overwrite the raw offspring strings with their parsed float values; entries
# the cleaner cannot parse come back as None.
df["Offspring per Birth"] = df["Offspring per Birth"].apply(clean_offspring_value)

In [452]:
# Step 1: Remove rows with 'Not Applicable' in 'Top Speed (km/h)'.
# The column is lowercased before comparing, so the target must be lowercase too;
# comparing against 'Not Applicable' could never match and removed nothing.
# astype(str) keeps this working for numeric or missing entries (NaN becomes
# 'nan' and is kept here), and strip() tolerates stray whitespace.
top_speed_text = df['Top Speed (km/h)'].astype(str).str.strip().str.lower()
# .copy() gives an independent frame, so assigning the cleaned column in the
# next step does not write into a slice of the original DataFrame.
df = df[top_speed_text != 'not applicable'].copy()

# Step 2: Convert ranges like '40-65' → 65 (as float)
def extract_max_speed(val):
    """Return the top speed as a float, taking the upper bound of a range.

    "40-65" and "40–65" give 65.0; a plain number gives that number. Returns
    None when the relevant part is not a valid number (e.g. "Not Applicable"
    or "40-65 mph").
    """
    # Treat en/em dashes as hyphens so "40–65" is recognised as a range.
    val = str(val).strip().replace("–", "-").replace("—", "-")
    # For a range keep only the text after the last hyphen; otherwise the whole value.
    candidate = val.split('-')[-1] if '-' in val else val
    try:
        return float(candidate)
    except ValueError:
        # Unparseable ranges now return None like any other junk value instead
        # of raising from outside the try block.
        return None

# Overwrite the raw top-speed strings with their parsed float values.
df['Top Speed (km/h)'] = df['Top Speed (km/h)'].apply(extract_max_speed)

In [453]:
# Drop every row that has a missing value in ANY column — this includes values
# the cleaning functions above could not parse and returned as None.
df = df.dropna()

In [454]:
# Target is Diet
y = df['Diet']

# Features: everything except the animal's name and the target itself.
# 'Unnamed: 0' is only the row number carried over from the CSV (0, 1, 2, ... in
# the preview), so it is dropped as well; otherwise the tree could split on row
# position. errors='ignore' keeps this working if that column is absent.
X = df.drop(columns=['Animal', 'Diet']).drop(columns=['Unnamed: 0'], errors='ignore')

In [455]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pickle

# Categorical columns: every object-dtype (text) feature is one-hot encoded.
categorical_cols = X.select_dtypes(include='object').columns.tolist()

# Preprocessor: encode the text columns and pass the remaining columns through
# unchanged. handle_unknown='ignore' encodes categories unseen during fit as all zeros.
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough'
)

# Pipeline: preprocessing and model are fitted and pickled as one object.
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', DecisionTreeClassifier(random_state=42))
])

# Split: hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows — the test split was previously created but
# never used, so the model was saved without any check of its quality.
test_accuracy = pipeline.score(X_test, y_test)
print(f"Held-out accuracy: {test_accuracy:.3f}")

# Save
with open('animal_diet_model.pkl', 'wb') as f:
    pickle.dump(pipeline, f)
