In [19]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, f1_score, mean_squared_error, confusion_matrix

In [2]:
animals_df = pd.read_csv('animals_df_updated.csv')
animals_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1576 entries, 0 to 1575
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               1576 non-null   object 
 1   Scientific Name    1576 non-null   object 
 2   Class              1576 non-null   object 
 3   Family             1576 non-null   object 
 4   Lifespan (years)   1576 non-null   float64
 5   Top Speed (km/h)   1576 non-null   float64
 6   Weight (kg)        1576 non-null   float64
 7   Length (cm)        1576 non-null   float64
 8   Diet               1576 non-null   int64  
 9   Population Trend   1576 non-null   int64  
 10  Population Status  1576 non-null   int64  
 11  Mating Behavior    1576 non-null   int64  
 12  Arid Climate       1576 non-null   int64  
 13  Cold Climate       1576 non-null   int64  
 14  Polar Climate      1576 non-null   int64  
 15  Temperate Climate  1576 non-null   int64  
 16  Tropical Climate   1576 

In [3]:
animals_df.set_index(keys = "Name", inplace = True)
ml_df = animals_df.drop(labels = ["Scientific Name", "Class", "Family"], axis = 1)

In [4]:
training_features = ml_df.columns[ml_df.columns != "Lifespan (years)"]
target_feature = "Lifespan (years)"

X = ml_df[training_features]
y = ml_df[target_feature]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 45)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

regressor = LinearRegression().fit(X = X_train, y = y_train)
y_pred = regressor.predict(X_test)

lifespan_comparison_df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
lifespan_comparison_df.sample(10)

Unnamed: 0_level_0,Actual,Predicted
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Australian Pelican,20.0,19.45197
Long-Eared Jerboa,2.5,13.754646
Lear's Macaw,45.0,23.477172
Bearded Seal,31.0,26.475846
Richardson's Ground Squirrel,3.0,11.10943
Sunda Pangolin,18.121027,23.381302
Eastern Coral Snake,7.0,14.582232
Galápagos Fur Seal,22.0,27.072146
Desert Horned Lizard,6.5,9.148017
Egyptian Tortoise,50.0,24.194614


In [5]:
r2_score(y_test, y_pred) * 100

41.14420781592373

In [18]:
mean_squared_error(y_true = y_test, y_pred = y_pred)

147.2282830667223

In [8]:
status_df = ml_df[(ml_df["Population Status"] == 1) | (ml_df["Population Status"] == 2)]

In [9]:
trn_features = status_df.columns[status_df.columns != "Population Status"]
trgt_feature = "Population Status"
X2 = status_df[trn_features]
y2 = status_df[trgt_feature]

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size = 0.2, random_state = 33)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train2)
X_test_scaled = scaler.transform(X_test2)

lr = LogisticRegression(max_iter = 279).fit(X = X_train2, y = y_train2)
y_pred2 = lr.predict(X_test2)

status_comparison_df = pd.DataFrame({"Actual": y_test2, "Predicted": y_pred2})
status_comparison_df.sample(10)

Unnamed: 0_level_0,Actual,Predicted
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Black Rat,1,1
Eastern Coral Snake,1,1
Lear's Macaw,2,1
Red Knot,1,1
Patagonian Mara,1,1
Rough-Toothed Dolphin,1,1
Nicobar Pigeon,1,1
Water Deer,2,1
Resplendent Quetzal,1,1
Ribbon Seal,1,1


In [21]:
f1_score(y_test2, y_pred2) * 100

87.52642706131078

According to the confusion matrix, the model managed to predict the correct population status for 241 animals, and for 59 animals he was wrong.

In [22]:
confusion_matrix(y_test2, y_pred2)

array([[207,  23],
       [ 36,  34]], dtype=int64)