In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Details about the data
The dataset is available at: http://archive.ics.uci.edu/ml/datasets/Amphibians# . The dataset consists of a mix of categorical and numerical data. The dataset is used to predict whether or not there exists specific amphibian species near certain water reservoirs dependent on a variety of different features in the data. The features in the data consist of the different surroundings around the reservoirs; the number of reservoirs that exist in a specific habitat; the reservoir type; whether or not fishing takes place; access to the resevoirs by rural areas; the status of maintaintance of the reservoir and the type of shore.

Reading amphibians data

In [2]:
df = pd.read_csv('dataset.csv')
df

Unnamed: 0,Integer;Categorical;Numerical;Numerical;Categorical;Categorical;Categorical;Categorical;Categorical;Categorical;Categorical;Numerical;Ordinal;Ordinal;Categorical;Categorical;Label 1;Label 2;Label 3;Label 4;Label 5;Label 6;Label 7
0,ID;Motorway;SR;NR;TR;VR;SUR1;SUR2;SUR3;UR;FR;O...
1,1;A1;600;1;1;4;6;2;10;0;0;50;0;0;0;1;0;0;0;0;0...
2,2;A1;700;1;5;1;10;6;10;3;1;75;1;1;0;1;0;1;1;0;...
3,3;A1;200;1;5;1;10;6;10;3;4;75;1;1;0;1;0;1;1;0;...
4,4;A1;300;1;5;0;6;10;2;3;4;25;0;0;0;1;0;0;1;0;0...
...,...
185,185;S52;2300;1;12;3;2;2;1;0;0;75;2;1;0;1;0;1;0...
186,186;S52;300;1;14;2;7;10;2;0;0;100;5;5;0;1;1;1;...
187,187;S52;500;1;1;4;1;10;2;0;0;100;5;5;0;1;1;1;1...
188,188;S52;300;1;12;3;2;1;6;0;0;100;1;0;0;1;0;1;1...


Fixing data

### Observations:
The data is not well aligned. We need to align the dataset properly so that each column is aligned to its respective columns. 

In [3]:
df = df['Integer;Categorical;Numerical;Numerical;Categorical;Categorical;Categorical;Categorical;Categorical;Categorical;Categorical;Numerical;Ordinal;Ordinal;Categorical;Categorical;Label 1;Label 2;Label 3;Label 4;Label 5;Label 6;Label 7'].str.split(';', expand=True)
print(df.loc[:, 1:23].rename(columns={1:'ID', 2:'Motorway', 3:'SR',4:'NR',5:'TR', 6:'VR', 7:'SUR1',8:'SUR2',9:'SUR3', 10:'UR',
                                      11:'FR',12:'OR',13:'RR', 14:'BR', 15:'MR',16:'CR',
                                      17:'Green frogs',18:'Brown frogs',19:'Common toad', 20:'Fire-bellied toad', 21:'Tree frog',22:'Common newt',23:'Great crested newt'}) )

           ID Motorway  SR  NR  TR    VR  SUR1  SUR2 SUR3  UR  ...  RR  BR  \
0    Motorway       SR  NR  TR  VR  SUR1  SUR2  SUR3   UR  FR  ...  BR  MR   
1          A1      600   1   1   4     6     2    10    0   0  ...   0   0   
2          A1      700   1   5   1    10     6    10    3   1  ...   1   0   
3          A1      200   1   5   1    10     6    10    3   4  ...   1   0   
4          A1      300   1   5   0     6    10     2    3   4  ...   0   0   
..        ...      ...  ..  ..  ..   ...   ...   ...  ...  ..  ...  ..  ..   
185       S52     2300   1  12   3     2     2     1    0   0  ...   1   0   
186       S52      300   1  14   2     7    10     2    0   0  ...   5   0   
187       S52      500   1   1   4     1    10     2    0   0  ...   5   0   
188       S52      300   1  12   3     2     1     6    0   0  ...   0   0   
189       S52      300   1  12   3     2     6    10    0   0  ...   1   0   

     MR           CR  Green frogs  Brown frogs        Common to

In [4]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,ID,Motorway,SR,NR,TR,VR,SUR1,SUR2,SUR3,UR,...,BR,MR,CR,Green frogs,Brown frogs,Common toad,Fire-bellied toad,Tree frog,Common newt,Great crested newt
1,1,A1,600,1,1,4,6,2,10,0,...,0,0,1,0,0,0,0,0,0,0
2,2,A1,700,1,5,1,10,6,10,3,...,1,0,1,0,1,1,0,0,1,0
3,3,A1,200,1,5,1,10,6,10,3,...,1,0,1,0,1,1,0,0,1,0
4,4,A1,300,1,5,0,6,10,2,3,...,0,0,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,185,S52,2300,1,12,3,2,2,1,0,...,1,0,1,0,1,0,0,0,0,0
186,186,S52,300,1,14,2,7,10,2,0,...,5,0,1,1,1,1,1,0,1,0
187,187,S52,500,1,1,4,1,10,2,0,...,5,0,1,1,1,1,1,0,1,0
188,188,S52,300,1,12,3,2,1,6,0,...,0,0,1,0,1,1,0,0,0,0


In [5]:
df_new=df.rename(columns=df.iloc[0])
new_df=df_new.drop(df_new.index[0])
new_df

Unnamed: 0,ID,Motorway,SR,NR,TR,VR,SUR1,SUR2,SUR3,UR,...,BR,MR,CR,Green frogs,Brown frogs,Common toad,Fire-bellied toad,Tree frog,Common newt,Great crested newt
1,1,A1,600,1,1,4,6,2,10,0,...,0,0,1,0,0,0,0,0,0,0
2,2,A1,700,1,5,1,10,6,10,3,...,1,0,1,0,1,1,0,0,1,0
3,3,A1,200,1,5,1,10,6,10,3,...,1,0,1,0,1,1,0,0,1,0
4,4,A1,300,1,5,0,6,10,2,3,...,0,0,1,0,0,1,0,0,0,0
5,5,A1,600,2,1,4,10,2,6,0,...,5,0,1,0,1,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,185,S52,2300,1,12,3,2,2,1,0,...,1,0,1,0,1,0,0,0,0,0
186,186,S52,300,1,14,2,7,10,2,0,...,5,0,1,1,1,1,1,0,1,0
187,187,S52,500,1,1,4,1,10,2,0,...,5,0,1,1,1,1,1,0,1,0
188,188,S52,300,1,12,3,2,1,6,0,...,0,0,1,0,1,1,0,0,0,0


In [6]:
new_df.columns

Index(['ID', 'Motorway', 'SR', 'NR', 'TR', 'VR', 'SUR1', 'SUR2', 'SUR3', 'UR',
       'FR', 'OR', 'RR', 'BR', 'MR', 'CR', 'Green frogs', 'Brown frogs',
       'Common toad', 'Fire-bellied toad', 'Tree frog', 'Common newt',
       'Great crested newt'],
      dtype='object')

Checking for missing values

In [7]:
new_df.isnull().sum()

ID                    0
Motorway              0
SR                    0
NR                    0
TR                    0
VR                    0
SUR1                  0
SUR2                  0
SUR3                  0
UR                    0
FR                    0
OR                    0
RR                    0
BR                    0
MR                    0
CR                    0
Green frogs           0
Brown frogs           0
Common toad           0
Fire-bellied toad     0
Tree frog             0
Common newt           0
Great crested newt    0
dtype: int64

We do not need the ID and Motorway columns so we drop them.

In [8]:
del new_df['ID']
del new_df['Motorway']
new_df.head()

Unnamed: 0,SR,NR,TR,VR,SUR1,SUR2,SUR3,UR,FR,OR,...,BR,MR,CR,Green frogs,Brown frogs,Common toad,Fire-bellied toad,Tree frog,Common newt,Great crested newt
1,600,1,1,4,6,2,10,0,0,50,...,0,0,1,0,0,0,0,0,0,0
2,700,1,5,1,10,6,10,3,1,75,...,1,0,1,0,1,1,0,0,1,0
3,200,1,5,1,10,6,10,3,4,75,...,1,0,1,0,1,1,0,0,1,0
4,300,1,5,0,6,10,2,3,4,25,...,0,0,1,0,0,1,0,0,0,0
5,600,2,1,4,10,2,6,0,0,99,...,5,0,1,0,1,1,1,0,1,1


In [9]:
print("There exists " + str(len(new_df)) + " observations in the dataset")

There exists 189 observations in the dataset


In [10]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 189 entries, 1 to 189
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   SR                  189 non-null    object
 1   NR                  189 non-null    object
 2   TR                  189 non-null    object
 3   VR                  189 non-null    object
 4   SUR1                189 non-null    object
 5   SUR2                189 non-null    object
 6   SUR3                189 non-null    object
 7   UR                  189 non-null    object
 8   FR                  189 non-null    object
 9   OR                  189 non-null    object
 10  RR                  189 non-null    object
 11  BR                  189 non-null    object
 12  MR                  189 non-null    object
 13  CR                  189 non-null    object
 14  Green frogs         189 non-null    object
 15  Brown frogs         189 non-null    object
 16  Common toad         189 no

### Observation:

According to the information about the data, we have 22 columns. The observed data types for each column is of type object. We need to convert the data types for each column to their respective correct data type.

The following can remain features of type object:
- Motorway   
- TR (Type of water reservoirs)  
- VR  (Presebce of vegetation)
- SUR1 (Dominant types of land cover surrounding the water reservoir)
- SUR2 (Second most dominant types of land cover surrounding the water reservoir)
- SUR3 (Third most dominant types of land cover surrounding the water reservoir)
- UR (Use of water reservoir)
- FR (Presence of fishing) 
- Green frogs
- Brown frogs  
- Common toad 
- Fire-bellied toad  
- Tree frog 
- Common newt
- Great crested newt  

The following features should be of type Integer:
- RR (Minimum distance from reservoir to roads)                    
- BR (Building development)

The following features should be of type Float:
- SR (Surface of water reservoir)                    
- NR (Number of water reservoirs - $m^2$)                    
- OR (Percentage of access from the edges of the water reservoir to underdeveloped areas
- MR (Maintainance status of reservoir)
- CR (Shore type)



### Convert to the right data types

The following is a function to convert to either an integer or a float.

In [11]:
def conversion(data,feature,convertTo):
    
    # temp variable
    temp_feature = data[[feature]]
    
    # perform th conversion
    temp_feature = temp_feature.astype({feature: convertTo })
    
    # drop original feature
    dataTemp1 = data.drop(columns=[feature])
    # create a new dataset with the converted field
    dataTemp2 = pd.merge(temp_feature,dataTemp1, left_index=True, right_index=True)
    
    # return converted dataset
    return dataTemp2
print("Finished processing function to convert to another data type")

Finished processing function to convert to another data type


In [12]:
# Convert to integers
df1 = conversion(new_df,"RR",'int')
df2 = conversion(df1,"BR",'int')

# Convert to Floats
df3 = conversion(df2,"SR",'Float64')
df4 = conversion(df3,"NR",'Float64')
df5 = conversion(df4,"OR",'Float64')
df6 = conversion(df5,"MR",'Float64')
new_data = conversion(df6,"CR",'Float64')

print("Finished converting features to correct data types")

Finished converting features to correct data types


Use the function the above to convert the field in the data to the correct data types . The new data formed will be stored in $new$_$data$

In [13]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 189 entries, 1 to 189
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CR                  189 non-null    float64
 1   MR                  189 non-null    float64
 2   OR                  189 non-null    float64
 3   NR                  189 non-null    float64
 4   SR                  189 non-null    float64
 5   BR                  189 non-null    int32  
 6   RR                  189 non-null    int32  
 7   TR                  189 non-null    object 
 8   VR                  189 non-null    object 
 9   SUR1                189 non-null    object 
 10  SUR2                189 non-null    object 
 11  SUR3                189 non-null    object 
 12  UR                  189 non-null    object 
 13  FR                  189 non-null    object 
 14  Green frogs         189 non-null    object 
 15  Brown frogs         189 non-null    object 
 16  Common t

In [14]:
new_data.describe()

Unnamed: 0,CR,MR,OR,NR,SR,BR,RR
count,189.0,189.0,189.0,189.0,189.0,189.0,189.0
mean,1.015873,0.047619,89.962963,1.566138,9633.227513,2.502646,2.333333
std,0.125316,0.296894,19.904926,1.544419,46256.078309,2.640971,2.520132
min,1.0,0.0,25.0,1.0,30.0,0.0,0.0
25%,1.0,0.0,99.0,1.0,300.0,1.0,1.0
50%,1.0,0.0,100.0,1.0,700.0,1.0,1.0
75%,1.0,0.0,100.0,1.0,3300.0,5.0,5.0
max,2.0,2.0,100.0,12.0,500000.0,10.0,10.0


ID -> Integer
Motorway -> Categorical
SR -> Numerical
NR -> Numerical
TR -> Categorical
VR -> Categorical
SUR1 -> Categorical
SUR2 -> Categorical
SUR3 -> Categorical
UR -> Categorical
FR -> Categorical
OR -> Numerical
RR -> Ordinal;
BR -> Ordinal;
MR -> Categorical
CR -> Categorical
Green frogs -> Categorical
Brown frogs -> Categorical
Common toad -> Categorical
Fire-bellied toad -> Categorical
Tree frog -> Categorical
Common newt -> Categorical
Great crested newt -> Categorical

we will now divide the data into its features and target values

In [15]:
y=new_data.iloc[:,14:21]
X=new_data.iloc[:,0:14]

In [16]:
print(X)
print(y)

      CR   MR     OR   NR      SR  BR  RR  TR VR SUR1 SUR2 SUR3 UR FR
1    1.0  0.0   50.0  1.0   600.0   0   0   1  4    6    2   10  0  0
2    1.0  0.0   75.0  1.0   700.0   1   1   5  1   10    6   10  3  1
3    1.0  0.0   75.0  1.0   200.0   1   1   5  1   10    6   10  3  4
4    1.0  0.0   25.0  1.0   300.0   0   0   5  0    6   10    2  3  4
5    1.0  0.0   99.0  2.0   600.0   5   0   1  4   10    2    6  0  0
..   ...  ...    ...  ...     ...  ..  ..  .. ..  ...  ...  ... .. ..
185  1.0  0.0   75.0  1.0  2300.0   1   2  12  3    2    2    1  0  0
186  1.0  0.0  100.0  1.0   300.0   5   5  14  2    7   10    2  0  0
187  1.0  0.0  100.0  1.0   500.0   5   5   1  4    1   10    2  0  0
188  1.0  0.0  100.0  1.0   300.0   0   1  12  3    2    1    6  0  0
189  1.0  0.0  100.0  1.0   300.0   1   1  12  3    2    6   10  0  0

[189 rows x 14 columns]
    Green frogs Brown frogs Common toad Fire-bellied toad Tree frog  \
1             0           0           0                 0       

We will proceed to get the labels. These are the different amphibians species living in the different reservoirs

In [17]:
labels = list(new_data.columns.values)
labels = labels[14:]
print(labels)

['Green frogs', 'Brown frogs', 'Common toad', 'Fire-bellied toad', 'Tree frog', 'Common newt', 'Great crested newt']


We will proceed to divide the data into training and testing data

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.30, shuffle=True)

print(X_train.shape)
print(X_test.shape)

(132, 14)
(57, 14)


In [19]:
train_text = X_train.iloc[:,0:14]
test_text = X_test.iloc[:,0:14]

Dimension Reduction with PCA

In [20]:
from sklearn.decomposition import PCA

In [21]:
pca=PCA(n_components=2)
pca.fit(train_text)
train=pca.transform(train_text)

In [22]:
print(train.shape)

(132, 2)


In [23]:
pca=PCA(n_components=2)
pca.fit(test_text)
test=pca.transform(test_text)

In [24]:
print(test.shape)

(57, 2)


In [25]:
from sklearn.metrics import confusion_matrix

applying logistic regression and Random classifier

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

In [27]:
# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])

for label in labels:
    print('**Processing {} ...**'.format(label))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(train, y_train[label])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(test)
    print('Test accuracy is {}'.format(accuracy_score(y_test[label], prediction)))
    print("\n")

**Processing Green frogs ...**
Test accuracy is 0.5614035087719298


**Processing Brown frogs ...**
Test accuracy is 0.38596491228070173


**Processing Common toad ...**
Test accuracy is 0.5614035087719298


**Processing Fire-bellied toad ...**
Test accuracy is 0.7192982456140351


**Processing Tree frog ...**
Test accuracy is 0.5964912280701754


**Processing Common newt ...**
Test accuracy is 0.7192982456140351


**Processing Great crested newt ...**
Test accuracy is 0.8070175438596491




In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


In [29]:
# applying Random classifier
rdclass=RandomForestClassifier()
for label in labels:
    print('**Processing {} ...**'.format(label))
    
    # Training Random forest model on train data
    rdclass.fit(train, y_train[label])
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(test)
    print('Test accuracy is {}'.format(accuracy_score(y_test[label], prediction)))
    print("\n")
    

**Processing Green frogs ...**
Test accuracy is 0.5614035087719298


**Processing Brown frogs ...**
Test accuracy is 0.38596491228070173


**Processing Common toad ...**
Test accuracy is 0.5614035087719298


**Processing Fire-bellied toad ...**
Test accuracy is 0.7192982456140351


**Processing Tree frog ...**
Test accuracy is 0.5964912280701754


**Processing Common newt ...**
Test accuracy is 0.7192982456140351


**Processing Great crested newt ...**
Test accuracy is 0.8070175438596491




### Observation

We have produced accuracies from the 7 target values using both the Random forest classifier and the Logistic regression classifier. Based on this, we can see that Great crested newt produced he greatest accuracy of 80% when using both the Logistic regression and  Random tree classifier