# Imports

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from collections import Counter

In [3]:
assessments = pd.read_csv("IUCN Plant Data_Bulk/assessments.csv")
taxonomy = pd.read_csv("IUCN Plant Data_Bulk/taxonomy.csv")
habitats = pd.read_csv("IUCN Plant Data_Bulk/habitats.csv")
countries = pd.read_csv("IUCN Plant Data_Bulk/countries.csv")
threats = pd.read_csv("IUCN Plant Data_Bulk/threats.csv")
usetrade = pd.read_csv("IUCN Plant Data_Bulk/usetrade.csv")

  habitats = pd.read_csv("IUCN Plant Data_Bulk/habitats.csv")
  threats = pd.read_csv("IUCN Plant Data_Bulk/threats.csv")


# Filter Data Deficient species

In [4]:
assessments_notDD = assessments.loc[assessments["redlistCategory"] != "Data Deficient"]

In [5]:
assessments_notDD["internalTaxonId"].count()

53657

# Fuse LR/LC & LC, LR/NT & NT, LR/CD & LC

In [6]:
assessments_notDD.replace({
        "Lower Risk/least concern": "Least Concern",
        "Lower Risk/near threatened": "Near Threatened",
        "Lower Risk/conservation dependent": "Least Concern"
    }, inplace=True)
assessments_notDD["redlistCategory"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assessments_notDD.replace({


Least Concern            26657
Endangered                9400
Vulnerable                8959
Critically Endangered     4976
Near Threatened           3500
Extinct                    123
Extinct in the Wild         42
Name: redlistCategory, dtype: int64

In [7]:
assessments_notDD["internalTaxonId"].count()

53657

# assessments
Features: systems, realm, populationTrend
<br>
Target variable: redlistCategory

In [8]:
assessments_features = ["systems", "realm", "populationTrend"]
assessments_notDD.fillna("Unknown", inplace=True)
assessments_encoded = pd.DataFrame(assessments_notDD[assessments_features])
assessments_encoded['systems'] = pd.factorize(assessments_encoded.systems)[0] +1
assessments_encoded['realm'] = pd.factorize(assessments_encoded.realm)[0] +1
assessments_encoded['populationTrend'] = pd.factorize(assessments_encoded.populationTrend)[0] +1
assessments_encoded["internalTaxonId"] = assessments_notDD["internalTaxonId"]
assessments_encoded


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assessments_notDD.fillna("Unknown", inplace=True)


Unnamed: 0,systems,realm,populationTrend,internalTaxonId
0,1,1,1,133722
1,1,2,1,151198
2,1,3,2,151697
3,1,3,1,151700
4,1,2,1,151705
...,...,...,...,...
58338,1,2,3,130047059
58339,1,2,3,131552927
58340,1,39,3,144301060
58341,1,3,1,37996


In [9]:
assessments_encoded["internalTaxonId"].count()

53657

In [10]:
len(set(assessments_encoded.index) & set(assessments_notDD.index))

53657

# taxonomy
Features: phylumName, className, orderName, familyName, genusName

In [11]:
taxonomy_features = ["phylumName", "className", "orderName", "familyName", "genusName"]

taxonomy_encoded = pd.DataFrame(taxonomy[taxonomy_features])
taxonomy_encoded["internalTaxonId"] = taxonomy["internalTaxonId"]
taxonomy_encoded['phylumName'] = pd.factorize(taxonomy_encoded.phylumName)[0] +1
taxonomy_encoded['className'] = pd.factorize(taxonomy_encoded.className)[0] +1
taxonomy_encoded['orderName'] = pd.factorize(taxonomy_encoded.orderName)[0] +1
taxonomy_encoded['familyName'] = pd.factorize(taxonomy_encoded.familyName)[0] +1
taxonomy_encoded['genusName'] = pd.factorize(taxonomy_encoded.genusName)[0] +1

taxonomy_encoded

Unnamed: 0,phylumName,className,orderName,familyName,genusName,internalTaxonId
0,1,1,1,1,1,133722
1,1,2,2,2,2,151198
2,1,2,2,2,3,151697
3,1,2,2,2,4,151700
4,1,2,2,2,5,151705
...,...,...,...,...,...,...
58338,1,2,40,246,1544,130047059
58339,1,2,40,246,1544,131552927
58340,1,2,4,16,158,144301060
58341,1,2,26,201,1550,37996


# habitats
Features: name

In [12]:
habitats.rename(columns={"name": "habitat_name"}, inplace=True)
habitats_features = ["habitat_name"]
habitats.fillna("Unknown", inplace=True)

habitats_encoded = pd.DataFrame(habitats[habitats_features])
habitats_encoded['habitat_name'] = pd.factorize(habitats_encoded.habitat_name)[0] +1

habitats_encoded["internalTaxonId"] = habitats["internalTaxonId"]
habitats_encoded = habitats_encoded.groupby("internalTaxonId").max()
habitats_encoded

Unnamed: 0_level_0,habitat_name
internalTaxonId,Unnamed: 1_level_1
30311,12
30313,12
30314,3
30315,9
30316,31
...,...
208195122,56
208498657,8
208945147,8
209124857,22


# countries
Features: name

In [13]:
countries.rename(columns={"name": "country_name"}, inplace=True)
countries_features = ["country_name"]
countries.fillna("Unknown", inplace=True)

countries_encoded = pd.DataFrame(countries[countries_features])
countries_encoded["country_name"] = pd.factorize(countries_encoded.country_name)[0] + 1

countries_encoded["internalTaxonId"] = countries["internalTaxonId"]
countries_encoded = countries_encoded.groupby("internalTaxonId").max()
countries_encoded

Unnamed: 0_level_0,country_name
internalTaxonId,Unnamed: 1_level_1
30310,34
30311,81
30313,110
30314,34
30315,111
...,...
208195122,142
208498657,182
208945147,3
209124857,123


# threats
Features: name, stressName

In [14]:
threats.rename(columns={"name": "threat_name"}, inplace=True)
threats_features = ["threat_name", "stressName"]
threats.fillna("Unknown", inplace=True)

threats_encoded = pd.DataFrame(threats[threats_features])
threats_encoded["threat_name"] = pd.factorize(threats_encoded.threat_name)[0] + 1
threats_encoded["stressName"] = pd.factorize(threats_encoded.stressName)[0] + 1


threats_encoded["internalTaxonId"] = threats["internalTaxonId"]
threats_encoded = threats_encoded.groupby("internalTaxonId").max()
threats_encoded

Unnamed: 0_level_0,threat_name,stressName
internalTaxonId,Unnamed: 1_level_1,Unnamed: 2_level_1
30310,44,17
30311,18,3
30313,56,222
30314,51,131
30316,39,9
...,...,...
205445399,18,7
205446122,41,7
207984529,5,101
208945147,51,215


# usetrade
Features: name

In [15]:
usetrade.rename(columns={"name": "usetrade_name"}, inplace=True)
usetrade_features = ["usetrade_name"]
usetrade.fillna("Unknown", inplace=True)

usetrade_encoded = pd.DataFrame(usetrade[usetrade_features])
usetrade_encoded["usetrade_name"] = pd.factorize(usetrade_encoded.usetrade_name)[0] + 1

usetrade_encoded["internalTaxonId"] = usetrade["internalTaxonId"]
usetrade_encoded = usetrade_encoded.groupby("internalTaxonId").max()
usetrade_encoded

Unnamed: 0_level_0,usetrade_name
internalTaxonId,Unnamed: 1_level_1
30311,7
30313,5
30314,12
30318,15
30319,10
...,...
205445399,14
207984529,5
208498657,8
208945147,3


# Combining encoded features to one DataFrame

In [16]:
X_preprocessed = assessments_encoded
X_preprocessed = X_preprocessed.merge(taxonomy_encoded, how="left", on="internalTaxonId", suffixes=(None, "_taxonomy"))
X_preprocessed = X_preprocessed.merge(habitats_encoded, how="left", on="internalTaxonId", suffixes=(None, "_habitats"))
X_preprocessed = X_preprocessed.merge(countries_encoded, how="left", on="internalTaxonId", suffixes=(None, "_countries"))
X_preprocessed = X_preprocessed.merge(threats_encoded, how="left", on="internalTaxonId", suffixes=(None, "_threats"))
X_preprocessed = X_preprocessed.merge(usetrade_encoded, how="left", on="internalTaxonId", suffixes=(None, "_usetrade"))
print(X_preprocessed["internalTaxonId"].count())
X_preprocessed

53657


Unnamed: 0,systems,realm,populationTrend,internalTaxonId,phylumName,className,orderName,familyName,genusName,habitat_name,country_name,threat_name,stressName,usetrade_name
0,1,1,1,133722,1,1,1,1,1,1.0,3.0,1.0,1.0,3.0
1,1,2,1,151198,1,2,2,2,2,2.0,4.0,2.0,2.0,5.0
2,1,3,2,151697,1,2,2,2,3,2.0,5.0,4.0,4.0,5.0
3,1,3,1,151700,1,2,2,2,4,3.0,6.0,7.0,6.0,
4,1,2,1,151705,1,2,2,2,5,4.0,4.0,3.0,6.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53652,1,2,3,130047059,1,2,40,246,1544,22.0,16.0,,,12.0
53653,1,2,3,131552927,1,2,40,246,1544,29.0,75.0,,,
53654,1,39,3,144301060,1,2,4,16,158,12.0,16.0,,,
53655,1,3,1,37996,1,2,26,201,1550,8.0,6.0,59.0,334.0,3.0


In [17]:
y = assessments_notDD["redlistCategory"]
y_preprocessed = y.replace({
    "Least Concern": 0,
    "Near Threatened": 1,
    "Vulnerable": 2,
    "Endangered": 3,
    "Critically Endangered": 4,
    "Extinct in the Wild": 5,
    "Extinct": 6
})
y_preprocessed

0        3
1        3
2        3
3        4
4        3
        ..
58338    0
58339    0
58340    0
58341    3
58342    2
Name: redlistCategory, Length: 53657, dtype: int64

## Place entries with NaNs into the Unknown categories

In [18]:
unknown = [x for x in X_preprocessed.columns if "_Unknown" in x]
unknown

[]

In [19]:
X_preprocessed_fillna = X_preprocessed.copy()
X_preprocessed_fillna[unknown].fillna(1, inplace=True)
X_preprocessed_fillna.fillna(0, inplace=True)
X_preprocessed_fillna

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_preprocessed_fillna[unknown].fillna(1, inplace=True)


Unnamed: 0,systems,realm,populationTrend,internalTaxonId,phylumName,className,orderName,familyName,genusName,habitat_name,country_name,threat_name,stressName,usetrade_name
0,1,1,1,133722,1,1,1,1,1,1.0,3.0,1.0,1.0,3.0
1,1,2,1,151198,1,2,2,2,2,2.0,4.0,2.0,2.0,5.0
2,1,3,2,151697,1,2,2,2,3,2.0,5.0,4.0,4.0,5.0
3,1,3,1,151700,1,2,2,2,4,3.0,6.0,7.0,6.0,0.0
4,1,2,1,151705,1,2,2,2,5,4.0,4.0,3.0,6.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53652,1,2,3,130047059,1,2,40,246,1544,22.0,16.0,0.0,0.0,12.0
53653,1,2,3,131552927,1,2,40,246,1544,29.0,75.0,0.0,0.0,0.0
53654,1,39,3,144301060,1,2,4,16,158,12.0,16.0,0.0,0.0,0.0
53655,1,3,1,37996,1,2,26,201,1550,8.0,6.0,59.0,334.0,3.0


In [20]:
X_preprocessed_fillna.to_csv("X_nothotencoded_preprocessed_fillna.csv", index=False)

y_preprocessed_fillna = pd.DataFrame()
y_preprocessed_fillna["redlistCategory"] = y_preprocessed
y_preprocessed_fillna["internalTaxonId"] = assessments_notDD["internalTaxonId"]
y_preprocessed_fillna.to_csv("y_nothotencoded_preprocessed_fillna.csv", index=False)

# Data Balancing
Using Synthetic Minority Oversampling TEchnique (SMOTE)

In [24]:
X_unsplit = X_preprocessed_fillna.merge(y_preprocessed_fillna, how="left", on="internalTaxonId", suffixes=(None, "_redlistCategory"))

X_unbalanced = X_unsplit.drop("redlistCategory", 1)
y_unbalanced = X_unsplit["redlistCategory"]
counter = Counter(y_unbalanced)
for k,v in counter.items():
	per = v / len(y) * 100
	print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

Class=3, n=9400 (17.519%)
Class=4, n=4976 (9.274%)
Class=0, n=26657 (49.680%)
Class=1, n=3500 (6.523%)
Class=2, n=8959 (16.697%)
Class=5, n=42 (0.078%)
Class=6, n=123 (0.229%)


  X_unbalanced = X_unsplit.drop("redlistCategory", 1)


In [27]:
oversampler = SMOTE(k_neighbors=2)
X_sm, y_sm = oversampler.fit_resample (X_unsplit.drop("redlistCategory",1), X_unsplit["redlistCategory"])
X_sm

  X_sm, y_sm = oversampler.fit_resample (X_unsplit.drop("redlistCategory",1), X_unsplit["redlistCategory"])


Unnamed: 0,systems,realm,populationTrend,internalTaxonId,phylumName,className,orderName,familyName,genusName,habitat_name,country_name,threat_name,stressName,usetrade_name
0,1,1,1,133722,1,1,1,1,1,1.000000,3.000000,1.000000,1.000000,3.0
1,1,2,1,151198,1,2,2,2,2,2.000000,4.000000,2.000000,2.000000,5.0
2,1,3,2,151697,1,2,2,2,3,2.000000,5.000000,4.000000,4.000000,5.0
3,1,3,1,151700,1,2,2,2,4,3.000000,6.000000,7.000000,6.000000,0.0
4,1,2,1,151705,1,2,2,2,5,4.000000,4.000000,3.000000,6.000000,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186594,1,42,2,78777385,1,2,12,127,1455,8.000000,16.000000,0.998892,4.994460,0.0
186595,1,3,2,35288,1,2,5,104,1919,41.141756,10.330711,38.443570,13.104993,0.0
186596,1,4,2,86574576,1,4,35,150,1054,9.433563,158.466914,30.965061,48.213341,0.0
186597,1,10,2,113173013,1,2,25,43,5686,9.521482,96.911116,11.348147,7.348147,0.0


In [29]:
X_sm.to_csv("X_sm_allfeatures.csv")
y_sm.to_csv("y_sm_allfeatures.csv")

In [30]:
counter = Counter(y_sm)
for k,v in counter.items():
	per = v / len(y) * 100
	print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

Class=3, n=26657 (49.680%)
Class=4, n=26657 (49.680%)
Class=0, n=26657 (49.680%)
Class=1, n=26657 (49.680%)
Class=2, n=26657 (49.680%)
Class=5, n=26657 (49.680%)
Class=6, n=26657 (49.680%)
