In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, f1_score, mean_squared_error, confusion_matrix

In [2]:
animals_df = pd.read_csv('animals_df_updated.csv')
animals_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1578 entries, 0 to 1577
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               1578 non-null   object 
 1   Scientific Name    1578 non-null   object 
 2   Class              1578 non-null   object 
 3   Family             1578 non-null   object 
 4   Lifespan (years)   1578 non-null   float64
 5   Top Speed (km/h)   1578 non-null   float64
 6   Weight (kg)        1578 non-null   float64
 7   Length (cm)        1578 non-null   float64
 8   Diet               1578 non-null   int64  
 9   Population Trend   1578 non-null   int64  
 10  Population Status  1578 non-null   int64  
 11  Mating Behavior    1578 non-null   int64  
 12  Arid Climate       1578 non-null   int64  
 13  Cold Climate       1578 non-null   int64  
 14  Polar Climate      1578 non-null   int64  
 15  Temperate Climate  1578 non-null   int64  
 16  Tropical Climate   1578 

In [3]:
animals_df.set_index(keys = "Name", inplace = True)
ml_df = animals_df.drop(labels = ["Scientific Name", "Class", "Family"], axis = 1)

In [4]:
training_features = ml_df.columns[ml_df.columns != "Lifespan (years)"]
target_feature = "Lifespan (years)"

X = ml_df[training_features]
y = ml_df[target_feature]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 45)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

regressor = LinearRegression().fit(X = X_train, y = y_train)
y_pred = regressor.predict(X_test)

lifespan_comparison_df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
lifespan_comparison_df.sample(10)

Unnamed: 0_level_0,Actual,Predicted
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Long-Tailed Goral,16.0,22.587188
Large Indian Civet,17.5,15.500357
American Goldfinch,7.0,12.208468
Gray Seal,30.0,21.353666
Wood Stork,14.5,21.11439
Western Rattlesnake,17.5,14.194927
Rusty-Spotted Cat,18.0,16.106982
Ringed Seal,27.5,22.004902
Eastern Kingbird,10.0,17.048844
Ural Owl,24.0,14.897523


In [5]:
r2_score(y_test, y_pred) * 100

30.919956405336112

In [6]:
mean_squared_error(y_true = y_test, y_pred = y_pred)

168.30881551098673

In [7]:
status_df = ml_df[(ml_df["Population Status"] == 1) | (ml_df["Population Status"] == 2)]

In [8]:
trn_features = status_df.columns[status_df.columns != "Population Status"]
trgt_feature = "Population Status"
X2 = status_df[trn_features]
y2 = status_df[trgt_feature]

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size = 0.2, random_state = 33)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train2)
X_test_scaled = scaler.transform(X_test2)

lr = LogisticRegression(max_iter = 279).fit(X = X_train2, y = y_train2)
y_pred2 = lr.predict(X_test2)

status_comparison_df = pd.DataFrame({"Actual": y_test2, "Predicted": y_pred2})
status_comparison_df.sample(10)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0_level_0,Actual,Predicted
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Panther Chameleon,1,1
Eastern Barred Bandicoot,2,1
Texas Horned Lizard,1,1
Rock Hyrax,1,1
Blue Duck,2,1
Tufted Deer,1,2
Common Minke Whale,1,1
Eastern Indigo Snake,1,1
Spotted Owl,1,1
Swamp Wallaby,1,1


In [9]:
f1_score(y_test2, y_pred2) * 100

84.61538461538461

According to the confusion matrix, the model managed to predict the correct population status for 241 animals, and for 59 animals he was wrong.

In [10]:
confusion_matrix(y_test2, y_pred2)

array([[198,  23],
       [ 49,  30]], dtype=int64)