In [1]:
import pandas as pd
import re
import json

In [2]:
# Read the raw export as text first, then decode the JSON payload.
# Later cells index the decoded object by position and iterate over it.
with open("animals'_raw.txt", mode="r") as raw_file:
    animals_json = raw_file.read()

animals_raw = json.loads(animals_json)

In [3]:
# Manual fix: for record 480 the raw 'Continents' field listed states within
# Australia and the United States rather than continents, so it is replaced
# with the matching continent names. Assigning a fixed value keeps this cell
# safe to re-run.
# NOTE(review): the record is addressed by position, so this assumes the order
# of the raw file never changes - confirm before refreshing the data.
animals_raw[480]["Continents"] = ["Oceania", "North America"]

In [4]:
# Drop the trailing "Insectivores" tag from record 377's diet list.
# The removal is guarded, so re-running the cell cannot pop a second,
# unrelated tag (an unguarded .pop() is only correct on its first execution).
# NOTE(review): the record is addressed by position - assumes the order of the
# raw file is stable.
diet_tags_377 = animals_raw[377]["Diet"]
removed_tag = (
    diet_tags_377.pop()
    if diet_tags_377 and diet_tags_377[-1] == "Insectivores"
    else None
)
# Displays the removed tag on the first run and nothing on later runs.
removed_tag

'Insectivores'

## Animals' general information

In [5]:
# One list per descriptive field, each aligned with the order of animals_raw.
animals_name = [record["Name"] for record in animals_raw]
animals_class = [record["Class"] for record in animals_raw]
animals_family = [record["Family"] for record in animals_raw]
# The scientific name is stored under the "SPECIES" key in the raw records.
animals_scientific_name = [record["SPECIES"] for record in animals_raw]

## Conversion of numeric strings into numerical values

In [6]:
# A leading number, optionally followed by "-<number>" to form a range.
# Decimals are accepted so that e.g. "1.5-2.5" is not truncated to "1" and "2".
_NUMBER = r"\d+(?:\.\d+)?"
_VALUE_OR_RANGE = re.compile(r"^\s*(" + _NUMBER + r")(?:\s*-\s*(" + _NUMBER + r"))?")
# Trailing run of lowercase letters, read as the unit suffix (e.g. "kg", "mm").
_TRAILING_UNIT = re.compile(r"([a-z]*)$")

# Multipliers that bring a recognised unit suffix to the column's target unit.
# Suffixes that are not listed leave the value unchanged.
WEIGHT_TO_KG = {"g": 0.001, "t": 1000}
LENGTH_TO_CM = {"m": 100, "mm": 0.1}


def _parse_measurement(raw_value, unit_factors=None, ndigits=2):
    """Turn a raw measurement string into a single float.

    The string must start with a number ("12", "1.5") or with a range
    ("10-15", "0.5 - 1.2"); a range is reduced to the mean of its two ends.

    Parameters
    ----------
    raw_value : str or None
        Raw text taken from a record. Anything that is not a string, or that
        does not start with a number, yields None.
    unit_factors : dict, optional
        Maps a trailing unit suffix to the multiplier that converts the value
        into the target unit. Unlisted suffixes leave the value as is.
    ndigits : int
        Number of decimals kept in the result.

    Returns
    -------
    float or None
        The parsed (and converted) value, or None when nothing could be parsed.
    """
    if not isinstance(raw_value, str):
        return None

    match = _VALUE_OR_RANGE.match(raw_value)
    if match is None:
        return None

    low, high = match.groups()
    value = float(low) if high is None else (float(low) + float(high)) / 2

    if unit_factors:
        unit = _TRAILING_UNIT.search(raw_value).group(1)
        value *= unit_factors.get(unit, 1)

    return round(value, ndigits)


# NOTE(review): thousands separators (e.g. "1,500 kg") are not handled -
# confirm the raw data never uses them.

# Lifespan (years). A missing key or value becomes None.
animals_lifespan = [
    _parse_measurement(animal.get("Life Span"), ndigits=2) for animal in animals_raw
]

# Top speed (km/h).
# NOTE(review): no unit conversion is applied, so this assumes every raw value
# is already expressed in km/h - confirm against the raw data.
animals_topspeed = [
    _parse_measurement(animal.get("TOP SPEED"), ndigits=2) for animal in animals_raw
]

# Weight (kg): grams and tonnes are converted, other suffixes are kept as is.
animals_weight = [
    _parse_measurement(animal.get("WEIGHT"), WEIGHT_TO_KG, ndigits=3)
    for animal in animals_raw
]

# Length (cm): metres and millimetres are converted, other suffixes are kept as is.
# Rounded like the weight column so unit conversion leaves no float artefacts.
animals_length = [
    _parse_measurement(animal.get("LENGTH"), LENGTH_TO_CM, ndigits=3)
    for animal in animals_raw
]

## Encoding of the remaining categorical values

In [7]:
# Distinct continent labels across all records (for inspection before encoding).
continents_types = {
    label
    for record in animals_raw
    for label in record["Continents"]
}

In [8]:
# One 0/1 indicator list per continent, each aligned with animals_raw:
# 1 when the continent is listed for the record, 0 otherwise.
africa = [int("Africa" in record["Continents"]) for record in animals_raw]
antarctica = [int("Antarctica" in record["Continents"]) for record in animals_raw]
asia = [int("Asia" in record["Continents"]) for record in animals_raw]
europe = [int("Europe" in record["Continents"]) for record in animals_raw]
north_america = [int("North America" in record["Continents"]) for record in animals_raw]

# A "New Zealand" label is counted as Oceania as well.
oceania = [
    int("Oceania" in record["Continents"] or "New Zealand" in record["Continents"])
    for record in animals_raw
]

south_america = [int("South America" in record["Continents"]) for record in animals_raw]

In [9]:
# Distinct climate labels across all records (for inspection before encoding).
climates_types = {
    label
    for record in animals_raw
    for label in record["Climate"]
}

In [10]:
# One 0/1 indicator list per climate, each aligned with animals_raw:
# 1 when the climate is listed for the record, 0 otherwise.
arid = [int("Arid" in record["Climate"]) for record in animals_raw]
cold = [int("Cold" in record["Climate"]) for record in animals_raw]
polar = [int("Polar" in record["Climate"]) for record in animals_raw]
temperate = [int("Temperate" in record["Climate"]) for record in animals_raw]
tropical = [int("Tropical" in record["Climate"]) for record in animals_raw]

The 'Diet' column will be encoded as follows:
* 0 - Omnivore animals
* 1 - Carnivore animals
* 2 - Herbivore animals
* 3 - Insect diet animals

In [11]:
# Distinct diet tags across all records (for inspection before encoding).
# Records with a missing or empty "Diet" entry are skipped explicitly rather
# than hiding every possible error behind a bare `except: pass`.
diets_types = {
    tag
    for record in animals_raw
    for tag in (record.get("Diet") or [])
}

In [12]:
# Diet tags grouped by the category they are encoded as.

# Tags encoded as carnivore.
# NOTE(review): "House Mouse" looks like a stray value rather than a diet -
# confirm against the raw data.
carnivore_animals = [
    "Carnivore",
    "House Mouse",
    "Hypercarnivore",
    "Insectivores",
    "Mesopredator",
    "Molluscivore",
    "Piscivores",
    "Predator",
    "Scavenger",
]

# Tags encoded as herbivore.
herbivore_animals = [
    "Algivore",
    "Folivore",
    "Frugivore",
    "Graminivore",
    "Granivore",
    "Herbivore",
    "Lignivore",
    "Nectarivore",
    "Palynivore",
]

# Tags encoded as insect diet.
insect_diet_animals = [
    "Myrmecophagous",
    "Vermivorous",
]

# Tags encoded as omnivore.
omnivore_animals = [
    "Omnivore",
]

In [13]:
# Encode each record's diet as a single code:
#   0 - omnivore, 1 - carnivore, 2 - herbivore, 3 - insect diet.
# A record is an omnivore (0) when it carries an explicit omnivore tag, when
# its tags span more than one family, or when none of its tags is recognised.
# The explicit omnivore check matters: without it, a record tagged "Omnivore"
# plus one other family would be encoded as that family.
diet = []

for animal in animals_raw:
    # Indexed strictly on purpose: a record without "Diet" should fail loudly
    # here instead of being silently encoded as an omnivore.
    tags = animal["Diet"]

    families = set()
    for tag in tags:
        if tag in carnivore_animals:
            families.add(1)
        elif tag in herbivore_animals:
            families.add(2)
        elif tag in insect_diet_animals:
            families.add(3)

    tagged_omnivore = any(tag in omnivore_animals for tag in tags)

    if tagged_omnivore or len(families) != 1:
        diet.append(0)
    else:
        # Exactly one family matched: its code is the encoding.
        diet.append(families.pop())

In [14]:
# Distinct population-trend values across all records (for inspection before
# encoding). Records without a "Trend" key are skipped explicitly rather than
# through a bare `except: pass`, which would also hide unrelated errors.
trends_types = {
    record["Trend"]
    for record in animals_raw
    if "Trend" in record
}

The 'Trend' column will be encoded as follows:
* 0 - None
* 1 - Decreasing
* 2 - Stable
* 3 - Increasing

In [15]:
# Encode the population trend:
#   0 - no trend given, 1 - decreasing, 2 - stable, 3 - increasing.
TREND_CODES = {"Decreasing": 1, "Stable": 2, "Increasing": 3}

population_trend = []

for animal in animals_raw:
    # .get() lets a record without a "Trend" key fall into the 0 category
    # instead of raising KeyError.
    trend = animal.get("Trend")
    # Only strings are looked up, so a non-string value can never raise here.
    code = TREND_CODES.get(trend, 0) if isinstance(trend, str) else 0
    population_trend.append(code)

In [16]:
# Distinct conservation-status values across all records (for inspection
# before encoding). Records without a "Status" key are skipped explicitly
# rather than through a bare `except: pass`, which would also hide unrelated
# errors.
status_types = {
    record["Status"]
    for record in animals_raw
    if "Status" in record
}

The 'Status' column will be encoded as follows:
* 0 - Not enough information
* 1 - Not in danger
* 2 - In danger

In [17]:
# Status values grouped by the category they are encoded as.

# Encoded as "not enough information".
no_information = [
    "Data deficient (DD)",
    "Not evaluated (NE)",
]

# Encoded as "not in danger".
not_in_danger = [
    "Least concern (LC)",
    "Near Threatened (NT)",
    "Stable",
]

# Encoded as "in danger".
in_danger = [
    "Critically endangered (CR)",
    "Decreasing",
    "Endangered (EN)",
    "Extinct (EX)",
    "Extinct in the wild (EW)",
    "Vulnerable (VU)",
]

In [18]:
# Encode the conservation status:
#   0 - not enough information, 1 - not in danger, 2 - in danger.
# Each category is matched against its own list. A status that is missing or
# not listed anywhere is treated as "not enough information" (0); a catch-all
# `else` would instead report such records as being in danger.
population_status = []

for animal in animals_raw:
    status = animal.get("Status")

    if status in no_information:
        population_status.append(0)
    elif status in not_in_danger:
        population_status.append(1)
    elif status in in_danger:
        population_status.append(2)
    else:
        population_status.append(0)

In [19]:
# Distinct mating-behavior values across all records (for inspection before
# encoding). Records without a "Mating Behavior" key are skipped explicitly
# rather than through a bare `except: pass`, which would also hide unrelated
# errors.
mating_behavior_types = {
    record["Mating Behavior"]
    for record in animals_raw
    if "Mating Behavior" in record
}

The 'Mating Behavior' column will be encoded as follows:

* 0 - Monogamy (including serial monogamy)
* 1 - Polyandry (also covers polygyny, polygynandry and polygamy)
* 2 - Both Monogamy and Polyandry

In [20]:
# Mating-behavior values grouped by the category they are encoded as.

# Monogamous behaviors only.
monogany = [
    "Monogamy",
    "Monogamy, Serial monogamy",
    "Serial monogamy",
]

# Behaviors involving multiple partners only (polyandry, polygyny,
# polygynandry, polygamy and their combinations).
polyandry = [
    "Polyandry",
    "Polyandry, Polygyny, Polygynandry",
    "Polygamy",
    "Polygynandry",
    "Polygynandry, Polyandry",
    "Polygynandry, Polygyny",
    "Polygyny",
    "Polygyny, Polyandry",
    "Polygyny, Polyandry, Polygynandry",
    "Polygyny, Polygynandry",
]

# Combinations that mix a monogamous behavior with a multi-partner one.
both_monogany_polyandry = [
    "Monogamy, Polyandry",
    "Monogamy, Polygynandry",
    "Monogamy, Polygyny",
    "Monogamy, Polygyny, Polyandry",
    "Monogamy, Polygyny, Polygynandry",
    "Monogamy, Serial monogamy, Polyandry",
    "Polygynandry, Serial monogamy",
    "Polygyny, Monogamy",
    "Polygyny, Serial monogamy",
    "Serial monogamy, Polyandry",
    "Serial monogamy, Polygyny",
]

In [21]:
# Encode the mating behavior:
#   0 - monogamy, 1 - polyandry, 2 - both, None - missing or unrecognised.
# Each category is matched against its own list. A catch-all `else` would
# label every record without mating information as "both", so those records
# are marked as missing (None) instead.
# Once loaded into pandas, the None entries become missing values in this column.
mating_behavior = []

for animal in animals_raw:
    behavior = animal.get("Mating Behavior")

    if behavior in monogany:
        mating_behavior.append(0)
    elif behavior in polyandry:
        mating_behavior.append(1)
    elif behavior in both_monogany_polyandry:
        mating_behavior.append(2)
    else:
        mating_behavior.append(None)

In [22]:
# Assemble the output columns group by group. Dicts keep insertion order, so
# the column order is the order of the update() calls below.
animals_data = {}

# Identification.
animals_data.update({
    "Name": animals_name,
    "Scientific Name": animals_scientific_name,
    "Class": animals_class,
    "Family": animals_family,
})

# Numeric measurements.
animals_data.update({
    "Lifespan (years)": animals_lifespan,
    "Top Speed (km/h)": animals_topspeed,
    "Weight (kg)": animals_weight,
    "Length (cm)": animals_length,
})

# Encoded categorical values.
animals_data.update({
    "Diet": diet,
    "Population Trend": population_trend,
    "Population Status": population_status,
    "Mating Behavior": mating_behavior,
})

# Climate indicators (0/1).
animals_data.update({
    "Arid Climate": arid,
    "Cold Climate": cold,
    "Polar Climate": polar,
    "Temperate Climate": temperate,
    "Tropical Climate": tropical,
})

# Continent indicators (0/1).
animals_data.update({
    "Africa": africa,
    "Antarctica": antarctica,
    "Asia": asia,
    "Europe": europe,
    "North America": north_america,
    "Oceania": oceania,
    "South America": south_america,
})

In [23]:
# Build the final table: one row per animal, one column per animals_data key.
animals_df = pd.DataFrame(animals_data)

In [25]:
# Persist the table as CSV without the positional index column.
animals_df.to_csv("animals_df.csv", index=False)