This notebook adds a weight column to the Jurassic Park dataset, which previously contained only lengths. Weights are looked up by name in a second source (dinodatabase). If a dinosaur is in the Jurassic Park dataset but not in dinodatabase, its weight is predicted from its length with a quadratic regression model fitted to the dinodatabase data.

In [None]:
import pandas as pd

In [None]:
# Load the measurements table taken from
# http://www.dinodatabase.com/dinorcds.asp. It is the source of height,
# length, and weight for certain dinosaurs. The csv was hand-edited in the
# places where the original formatting was inconsistent.
metrics = pd.read_csv(filepath_or_buffer="heightslengthsweights.csv")

In [None]:
# Helper applied to the height and length columns so each cell collapses to a
# single float that represents meters
def meters_only(numstring):
    """Return the third space-separated token of ``numstring`` as a float.

    The string is split on single spaces and the token at index 2 is
    converted with ``float``.
    NOTE(review): assumes every cell is laid out so that the meter value is
    the third token -- confirm against the csv.

    Raises IndexError when there are fewer than three tokens and ValueError
    when the third token is not a valid float literal.
    """
    tokens = numstring.split(" ")
    meters_token = tokens[2]
    return float(meters_token)
# Helper applied to the weight column so each cell collapses to a single float
# that represents kilograms
def kg_only(weight):
    """Return the second space-separated token of ``weight`` as a float.

    The token at index 1 has any leading/trailing ``k``, ``g`` and ``.``
    characters stripped and its thousands-separator commas removed before it
    is converted with ``float``.
    NOTE(review): assumes every cell is laid out so that the kilogram value
    is the second token -- confirm against the csv.

    Raises IndexError when there are fewer than two tokens and ValueError
    when the cleaned token is not a valid float literal.
    """
    kg_token = weight.split(" ")[1]
    without_unit = kg_token.strip("kg.")
    digits = without_unit.replace(",", "")
    return float(digits)

In [None]:
# Normalise the three measurement columns one step at a time: Height and
# Length go through meters_only, Weight goes through kg_only. Each step
# starts from the frame produced by the previous one.
met_height = metrics.assign(Height=lambda df: df['Height'].apply(meters_only))
met_length = met_height.assign(Length=lambda df: df['Length'].apply(meters_only))
met_all = met_length.assign(Weight=lambda df: df['Weight'].apply(kg_only))

In [None]:
# Cleans the name to make it consistent with the Jurassic Park dataset: keep
# only the text before the first non-breaking space ("\xa0") and lowercase it.
# str.lower has to be *called* here; referencing it without parentheses would
# store bound-method objects in the column, so no name could ever be matched
# by a string comparison.
met_all = met_all.assign(Name = met_all['Name'].apply(lambda x: x.split("\xa0")[0].lower()))

In [None]:
# Scatter each candidate predictor against weight to see which one tracks it
# more closely; length seems to be the better of the two
met_all.plot.scatter(x='Height', y='Weight')
met_all.plot.scatter(x='Length', y='Weight')

In [None]:
# Imports numpy in order to use polyfit to find a polynomial regression line
import numpy as np

In [None]:
# Fit a degree-2 polynomial of weight as a function of length. On manual
# review, quadratic seemed to work better than cubic or linear.
# np.polyfit returns the coefficients highest power first.
coeffs2 = np.polyfit(x=met_all['Length'], y=met_all['Weight'], deg=2)

In [None]:
# Predicts a weight from a length with the fitted quadratic. Very small
# lengths produced negative predictions, so the result is floored at 1
def pred_weight2(length):
    """Evaluate the quadratic in ``coeffs2`` at ``length``, floored at 1.

    ``coeffs2`` is read from module scope; index 0 is the squared term,
    index 1 the linear term and index 2 the constant.
    """
    quadratic, linear, constant = coeffs2[0], coeffs2[1], coeffs2[2]
    estimate = quadratic*length**2 + linear*length + constant
    return max(estimate, 1)

In [None]:
# Load the Jurassic Park dataset, downloaded from
# https://www.kaggle.com/datasets/kjanjua/jurassic-park-the-exhaustive-dinosaur-dataset
jurassicpark = pd.read_csv(filepath_or_buffer="jurassicpark.csv")

In [None]:
# First discard every row that has a missing value in any column, then turn
# each length into a float by cutting off its last character (the "m" for
# meter)
jurassicpark = (
    jurassicpark
    .dropna()
    .assign(length=lambda df: df['length'].apply(lambda text: float(text[:-1])))
)

In [None]:
# Function for assigning a weight based on the name.
def assign_weight(name):
    """Return the weight to record for the dinosaur called ``name``.

    If ``name`` occurs in ``met_all['Name']``, the ``Weight`` of the first
    matching row is returned. Otherwise the ``length`` of the first
    ``jurassicpark`` row with that ``name`` is passed to ``pred_weight2``
    and the predicted weight is returned.

    Raises IndexError when ``name`` is found in neither frame.
    """
    # A single boolean-mask lookup answers both "is the name known?" and
    # "what is its weight?", so the Name column is not copied into a Python
    # list and scanned a second time on every call.
    known = met_all.loc[met_all['Name'] == name, 'Weight']
    if not known.empty:
        return known.iloc[0]
    length = jurassicpark.loc[jurassicpark['name'] == name, 'length'].iloc[0]
    return pred_weight2(length)

In [None]:
# Build the weight column by running assign_weight over every name
jurassicpark = jurassicpark.assign(
    weight=lambda df: df['name'].apply(assign_weight)
)

In [None]:
# Write the augmented dataset out as a new csv. The DataFrame index is
# written as well, since to_csv includes it by default.
jurassicpark.to_csv(path_or_buf="jurassicparkwithweights.csv")