# Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter

In [2]:
assessments = pd.read_csv("IUCN Plant Data_Bulk/assessments.csv")
taxonomy = pd.read_csv("IUCN Plant Data_Bulk/taxonomy.csv")
habitats = pd.read_csv("IUCN Plant Data_Bulk/habitats.csv")
countries = pd.read_csv("IUCN Plant Data_Bulk/countries.csv")
threats = pd.read_csv("IUCN Plant Data_Bulk/threats.csv")
usetrade = pd.read_csv("IUCN Plant Data_Bulk/usetrade.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


# Filter Data Deficient species

In [3]:
assessments_notDD = assessments.loc[assessments["redlistCategory"] != "Data Deficient"]

In [4]:
assessments_notDD["internalTaxonId"].count()

53657

# Fuse LR/LC & LC, LR/NT & NT, LR/CD & LC

In [5]:
assessments_notDD.replace({
        "Lower Risk/least concern": "Least Concern",
        "Lower Risk/near threatened": "Near Threatened",
        "Lower Risk/conservation dependent": "Least Concern",
        "Extinct in the Wild": "Extinct"
    }, inplace=True)
assessments_notDD["redlistCategory"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


Least Concern            26657
Endangered                9400
Vulnerable                8959
Critically Endangered     4976
Near Threatened           3500
Extinct                    165
Name: redlistCategory, dtype: int64

In [6]:
assessments_notDD["internalTaxonId"].count()

53657

# assessments
Features: systems, realm, populationTrend
<br>
Target variable: redlistCategory

In [7]:
assessments_features = ["systems", "realm", "populationTrend"]
assessments_notDD.fillna("Unknown", inplace=True)

assessments_enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
assessments_encoded = assessments_enc.fit_transform(assessments_notDD[assessments_features])
assessments_encoded = pd.DataFrame(assessments_encoded, index=assessments_notDD.index)
assessments_encoded.columns = assessments_enc.get_feature_names_out()
assessments_encoded["internalTaxonId"] = assessments_notDD["internalTaxonId"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [8]:
multiple_systems = [a for a in assessments_encoded.columns if "|" in a and "systems" in a]
multiple_realms = [a for a in assessments_encoded.columns if "|" in a and "realm" in a]

In [9]:
for ms in multiple_systems:
    systems_list = (ms.split("_")[1]).split("|")
    for system in systems_list:
        assessments_encoded["systems_"+system] = assessments_encoded["systems_"+system] + assessments_encoded[ms]

In [10]:
for mr in multiple_realms:
    realms_list = (mr.split("_")[1]).split("|")
    for realm in realms_list:
        assessments_encoded["realm_"+realm] = assessments_encoded["realm_"+realm] + assessments_encoded[mr]

In [11]:
assessments_encoded.drop(columns=multiple_systems+multiple_realms, inplace=True)
assessments_encoded

Unnamed: 0,systems_Freshwater (=Inland waters),systems_Marine,systems_Terrestrial,systems_Unknown,realm_Afrotropical,realm_Antarctic,realm_Australasian,realm_Indomalayan,realm_Nearctic,realm_Neotropical,realm_Oceanian,realm_Palearctic,realm_Unknown,populationTrend_Decreasing,populationTrend_Increasing,populationTrend_Stable,populationTrend_Unknown,internalTaxonId
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,133722
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,151198
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,151697
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,151700
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,151705
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58338,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,130047059
58339,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,131552927
58340,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,144301060
58341,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,37996


In [12]:
assessments_encoded["internalTaxonId"].count()

53657

In [13]:
len(set(assessments_encoded.index) & set(assessments_notDD.index))

53657

# taxonomy
Features: phylumName, className, orderName, familyName, genusName

In [14]:
taxonomy_features = ["phylumName", "className", "orderName", "familyName"]

taxonomy_enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
taxonomy_encoded = taxonomy_enc.fit_transform(taxonomy[taxonomy_features])
taxonomy_encoded = pd.DataFrame(taxonomy_encoded)
taxonomy_encoded.columns = taxonomy_enc.get_feature_names_out()
taxonomy_encoded["internalTaxonId"] = taxonomy["internalTaxonId"]
taxonomy_encoded

Unnamed: 0,phylumName_ANTHOCEROTOPHYTA,phylumName_BRYOPHYTA,phylumName_CHAROPHYTA,phylumName_CHLOROPHYTA,phylumName_MARCHANTIOPHYTA,phylumName_RHODOPHYTA,phylumName_TRACHEOPHYTA,className_ANDREAEOPSIDA,className_ANTHOCEROTOPSIDA,className_BRYOPSIDA,...,familyName_WINTERACEAE,familyName_WOODSIACEAE,familyName_XERONEMATACEAE,familyName_XIMENIACEAE,familyName_XYRIDACEAE,familyName_ZAMIACEAE,familyName_ZINGIBERACEAE,familyName_ZOSTERACEAE,familyName_ZYGOPHYLLACEAE,internalTaxonId
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,133722
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151198
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151697
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151700
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151705
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58338,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130047059
58339,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,131552927
58340,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,144301060
58341,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37996


# habitats
Features: name

In [15]:
habitats.rename(columns={"name": "habitat_name"}, inplace=True)
habitats_features = ["habitat_name"]
habitats.fillna("Unknown", inplace=True)

habitats_enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
habitats_encoded = habitats_enc.fit_transform(habitats[habitats_features])
habitats_encoded = pd.DataFrame(habitats_encoded)
habitats_encoded.columns = habitats_enc.get_feature_names_out()
habitats_encoded["internalTaxonId"] = habitats["internalTaxonId"]
habitats_encoded = habitats_encoded.groupby("internalTaxonId").max()
habitats_encoded

Unnamed: 0_level_0,habitat_name_Artificial/Aquatic - Aquaculture Ponds,"habitat_name_Artificial/Aquatic - Canals and Drainage Channels, Ditches",habitat_name_Artificial/Aquatic - Excavations (open),habitat_name_Artificial/Aquatic - Irrigated Land (includes irrigation channels),habitat_name_Artificial/Aquatic - Ponds (below 8ha),habitat_name_Artificial/Aquatic - Salt Exploitation Sites,habitat_name_Artificial/Aquatic - Seasonally Flooded Agricultural Land,habitat_name_Artificial/Aquatic - Wastewater Treatment Areas,habitat_name_Artificial/Aquatic - Water Storage Areas (over 8ha),habitat_name_Artificial/Marine - Mari/Brackishculture Ponds,...,habitat_name_Wetlands (inland) - Permanent Rivers/Streams/Creeks (includes waterfalls),"habitat_name_Wetlands (inland) - Permanent Saline, Brackish or Alkaline Lakes","habitat_name_Wetlands (inland) - Permanent Saline, Brackish or Alkaline Marshes/Pools",habitat_name_Wetlands (inland) - Seasonal/Intermittent Freshwater Lakes (over 8ha),habitat_name_Wetlands (inland) - Seasonal/Intermittent Freshwater Marshes/Pools (under 8ha),"habitat_name_Wetlands (inland) - Seasonal/Intermittent Saline, Brackish or Alkaline Lakes and Flats","habitat_name_Wetlands (inland) - Seasonal/Intermittent Saline, Brackish or Alkaline Marshes/Pools",habitat_name_Wetlands (inland) - Seasonal/Intermittent/Irregular Rivers/Streams/Creeks,habitat_name_Wetlands (inland) - Shrub Dominated Wetlands,habitat_name_Wetlands (inland) - Tundra Wetlands (incl. pools and temporary waters from snowmelt)
internalTaxonId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208195122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
208498657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
208945147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
209124857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# countries
Features: name

In [16]:
countries.rename(columns={"name": "country_name"}, inplace=True)
countries_features = ["country_name"]
countries.fillna("Unknown", inplace=True)

countries_enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
countries_encoded = countries_enc.fit_transform(countries[countries_features])
countries_encoded = pd.DataFrame(countries_encoded)
countries_encoded.columns = countries_enc.get_feature_names_out()
countries_encoded["internalTaxonId"] = countries["internalTaxonId"]
countries_encoded = countries_encoded.groupby("internalTaxonId").max()
countries_encoded

Unnamed: 0_level_0,country_name_Afghanistan,country_name_Albania,country_name_Algeria,country_name_American Samoa,country_name_Andorra,country_name_Angola,country_name_Anguilla,country_name_Antarctica,country_name_Antigua and Barbuda,country_name_Argentina,...,"country_name_Venezuela, Bolivarian Republic of",country_name_Viet Nam,"country_name_Virgin Islands, British","country_name_Virgin Islands, U.S.",country_name_Wallis and Futuna,country_name_Western Sahara,country_name_Yemen,country_name_Zambia,country_name_Zimbabwe,country_name_Åland Islands
internalTaxonId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30315,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208195122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
208498657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
208945147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
209124857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
countries_encoded["countries_count"] = countries.groupby("internalTaxonId").count()["country_name"]
countries_encoded

Unnamed: 0_level_0,country_name_Afghanistan,country_name_Albania,country_name_Algeria,country_name_American Samoa,country_name_Andorra,country_name_Angola,country_name_Anguilla,country_name_Antarctica,country_name_Antigua and Barbuda,country_name_Argentina,...,country_name_Viet Nam,"country_name_Virgin Islands, British","country_name_Virgin Islands, U.S.",country_name_Wallis and Futuna,country_name_Western Sahara,country_name_Yemen,country_name_Zambia,country_name_Zimbabwe,country_name_Åland Islands,countries_count
internalTaxonId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
30311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
30313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
30314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
30315,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208195122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
208498657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
208945147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
209124857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


# threats
Features: name, stressName

In [18]:
threats.rename(columns={"name": "threat_name"}, inplace=True)
threats_features = ["threat_name", "stressName"]
threats.fillna("Unknown", inplace=True)

threats_enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
threats_encoded = threats_enc.fit_transform(threats[threats_features])
threats_encoded = pd.DataFrame(threats_encoded)
threats_encoded.columns = threats_enc.get_feature_names_out()
threats_encoded["internalTaxonId"] = threats["internalTaxonId"]
threats_encoded = threats_encoded.groupby("internalTaxonId").max()

In [19]:
multiple_stressName = [a for a in threats_encoded.columns if "|" in a and "stressName" in a]
for msn in multiple_stressName:
    stressName_list = (msn.split("_")[1]).split("|")
    for stressName in stressName_list:
        if ("stressName_"+stressName) in threats_encoded.columns:
            threats_encoded["stressName_"+stressName] = threats_encoded["stressName_"+stressName] + threats_encoded[msn]
        else:
            threats_encoded["stressName_"+stressName] = 0
            threats_encoded["stressName_"+stressName] = threats_encoded["stressName_"+stressName] + threats_encoded[msn]

In [20]:
threats_encoded.drop(columns=multiple_stressName, inplace=True)
threats_encoded

Unnamed: 0_level_0,threat_name_Abstraction of ground water (agricultural use),threat_name_Abstraction of ground water (commercial use),threat_name_Abstraction of ground water (domestic use),threat_name_Abstraction of ground water (unknown use),threat_name_Abstraction of surface water (agricultural use),threat_name_Abstraction of surface water (commercial use),threat_name_Abstraction of surface water (domestic use),threat_name_Abstraction of surface water (unknown use),threat_name_Acid rain,threat_name_Agro-industry farming,...,stressName_Indirect species effects,stressName_Loss of mutualism,stressName_Loss of pollinator,stressName_Other,stressName_Reduced reproductive success,stressName_Skewed sex ratios,stressName_Species disturbance,stressName_Species mortality,stressName_Unknown,stressName_Ecosystem stresses
internalTaxonId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
30311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
30313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0
30314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
30316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205445399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
205446122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
207984529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0
208945147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0


# usetrade
Features: name

In [21]:
usetrade.rename(columns={"name": "usetrade_name"}, inplace=True)
usetrade_features = ["usetrade_name"]
usetrade.fillna("Unknown", inplace=True)

usetrade_enc = OneHotEncoder(handle_unknown="ignore", sparse=False)
usetrade_encoded = usetrade_enc.fit_transform(usetrade[usetrade_features])
usetrade_encoded = pd.DataFrame(usetrade_encoded)
usetrade_encoded.columns = usetrade_enc.get_feature_names_out()
usetrade_encoded["internalTaxonId"] = usetrade["internalTaxonId"]
usetrade_encoded = usetrade_encoded.groupby("internalTaxonId").max()
usetrade_encoded

Unnamed: 0_level_0,usetrade_name_Construction or structural materials,usetrade_name_Establishing ex-situ production *,usetrade_name_Fibre,usetrade_name_Food - animal,usetrade_name_Food - human,usetrade_name_Fuels,"usetrade_name_Handicrafts, jewellery, etc.",usetrade_name_Manufacturing chemicals,usetrade_name_Medicine - human & veterinary,usetrade_name_Other (free text),usetrade_name_Other chemicals,usetrade_name_Other household goods,"usetrade_name_Pets/display animals, horticulture",usetrade_name_Poisons,usetrade_name_Research,usetrade_name_Sport hunting/specimen collecting,usetrade_name_Unknown,"usetrade_name_Wearing apparel, accessories"
internalTaxonId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
30311,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
30314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30318,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30319,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205445399,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
207984529,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
208498657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
208945147,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Combining encoded features to one DataFrame

In [22]:
X_preprocessed = assessments_encoded
X_preprocessed = X_preprocessed.merge(taxonomy_encoded, how="left", on="internalTaxonId", suffixes=(None, "_taxonomy"))
X_preprocessed = X_preprocessed.merge(habitats_encoded, how="left", on="internalTaxonId", suffixes=(None, "_habitats"))
X_preprocessed = X_preprocessed.merge(countries_encoded, how="left", on="internalTaxonId", suffixes=(None, "_countries"))
X_preprocessed = X_preprocessed.merge(threats_encoded, how="left", on="internalTaxonId", suffixes=(None, "_threats"))
X_preprocessed = X_preprocessed.merge(usetrade_encoded, how="left", on="internalTaxonId", suffixes=(None, "_usetrade"))
print(X_preprocessed["internalTaxonId"].count())
X_preprocessed

53657


Unnamed: 0,systems_Freshwater (=Inland waters),systems_Marine,systems_Terrestrial,systems_Unknown,realm_Afrotropical,realm_Antarctic,realm_Australasian,realm_Indomalayan,realm_Nearctic,realm_Neotropical,...,usetrade_name_Medicine - human & veterinary,usetrade_name_Other (free text),usetrade_name_Other chemicals,usetrade_name_Other household goods,"usetrade_name_Pets/display animals, horticulture",usetrade_name_Poisons,usetrade_name_Research,usetrade_name_Sport hunting/specimen collecting,usetrade_name_Unknown,"usetrade_name_Wearing apparel, accessories"
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,,,,,,,,,,
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53652,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
53653,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,,,,,,,,,,
53654,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,,,,,,,,,,
53655,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
y = assessments_notDD["redlistCategory"]
y_preprocessed = y.replace({
    "Least Concern": 0,
    "Near Threatened": 1,
    "Vulnerable": 2,
    "Endangered": 3,
    "Critically Endangered": 4,
    "Extinct": 5
})
y_preprocessed

0        3
1        3
2        3
3        4
4        3
        ..
58338    0
58339    0
58340    0
58341    3
58342    2
Name: redlistCategory, Length: 53657, dtype: int64

## Place entries with NaNs into the Unknown categories

In [24]:
unknown = [x for x in X_preprocessed.columns if "_Unknown" in x]
unknown

['systems_Unknown',
 'realm_Unknown',
 'populationTrend_Unknown',
 'habitat_name_Unknown',
 'stressName_Unknown',
 'usetrade_name_Unknown']

In [25]:
X_preprocessed_fillna = X_preprocessed.copy()
X_preprocessed_fillna[unknown].fillna(1, inplace=True)
X_preprocessed_fillna.fillna(0, inplace=True)
X_preprocessed_fillna

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


Unnamed: 0,systems_Freshwater (=Inland waters),systems_Marine,systems_Terrestrial,systems_Unknown,realm_Afrotropical,realm_Antarctic,realm_Australasian,realm_Indomalayan,realm_Nearctic,realm_Neotropical,...,usetrade_name_Medicine - human & veterinary,usetrade_name_Other (free text),usetrade_name_Other chemicals,usetrade_name_Other household goods,"usetrade_name_Pets/display animals, horticulture",usetrade_name_Poisons,usetrade_name_Research,usetrade_name_Sport hunting/specimen collecting,usetrade_name_Unknown,"usetrade_name_Wearing apparel, accessories"
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53652,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
53653,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53654,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53655,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
#X_preprocessed_fillna.to_csv("X_preprocessed_fillna.csv", index=False)

y_preprocessed_fillna = pd.DataFrame()
y_preprocessed_fillna["redlistCategory"] = y_preprocessed
y_preprocessed_fillna["internalTaxonId"] = assessments_notDD["internalTaxonId"]
#y_preprocessed_fillna.to_csv("y_preprocessed_fillna.csv", index=False)

## Drop entries with NaNs

In [None]:
X_preprocessed_dropna = X_preprocessed.dropna()
X_preprocessed_dropna

In [None]:
y_preprocessed_dropna = pd.DataFrame()
y_preprocessed_dropna["internalTaxonId"] = X_preprocessed_dropna["internalTaxonId"]
y_preprocessed_dropna = y_preprocessed_dropna.merge(y_preprocessed_fillna, how="left", on="internalTaxonId")
y_preprocessed_dropna

In [None]:
X_preprocessed_dropna.to_csv("X_preprocessed_dropna.csv", index=False)
y_preprocessed_dropna.to_csv("y_preprocessed_dropna.csv", index=False)

# Feature selection

In [27]:
low_variance = list(X_preprocessed_fillna.var()[X_preprocessed_fillna.var() == 0].index)

In [28]:
X_preprocessed_fillna.drop(columns=low_variance, inplace=True)

# Data Split
20% for testing

In [29]:
X_unsplit = X_preprocessed_fillna.merge(y_preprocessed_fillna, how="left", on="internalTaxonId", suffixes=(None, "_redlistCategory"))
X_train, X_test = train_test_split(X_unsplit, test_size=0.2, random_state=1)
X_train.to_csv("xy_train_v5.csv")
X_test.to_csv("xy_test_v5.csv")
X_train

Unnamed: 0,systems_Freshwater (=Inland waters),systems_Marine,systems_Terrestrial,systems_Unknown,realm_Afrotropical,realm_Antarctic,realm_Australasian,realm_Indomalayan,realm_Nearctic,realm_Neotropical,...,usetrade_name_Other (free text),usetrade_name_Other chemicals,usetrade_name_Other household goods,"usetrade_name_Pets/display animals, horticulture",usetrade_name_Poisons,usetrade_name_Research,usetrade_name_Sport hunting/specimen collecting,usetrade_name_Unknown,"usetrade_name_Wearing apparel, accessories",redlistCategory
44016,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4
32056,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1941,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
14091,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
13180,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50057,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
32511,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
5192,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
12172,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
