<a href="https://colab.research.google.com/github/LeshibaAshley/Public-Data/blob/master/Life_on_Land_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Life on Land Dataset

### Imports


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
import matplotlib.pyplot as plt

### Import the dataset

In [20]:
df = pd.read_csv('/content/.config/configurations/SDG_15_Life_on_Land_Dataset.csv')

In [25]:
df.dropna(inplace=True)

In [24]:
# Load the data
train_data = pd.read_csv ('/content/.config/configurations/SDG_15_Life_on_Land_Dataset.csv')
test_data = pd.read_csv('/content/.config/configurations/SDG_15_Life_on_Land_Dataset.csv')

# Display the first 5 rows of each dataset
print("First 5 rows of train_data:")
print(train_data.head())

print("\nFirst 5 rows of test_data:")
print(test_data.head())

First 5 rows of train_data:
   Year  WaterQualityIndex  ClimateChangeImpactScore  LandUseChange  \
0  2010                 35                  7.542535       6.630073   
1  2010                 13                  1.617642       6.477132   
2  2010                 10                  0.904817       9.069428   
3  2010                 36                  7.060190       7.061932   
4  2010                 24                  8.439246       4.504786   

   InvasiveSpeciesCount  ConservationFunding  EcoTourismImpact  \
0                    39                   46          1.193810   
1                    45                   18          8.484718   
2                    28                   59          3.412444   
3                    37                   88          9.718453   
4                    41                   88          9.118186   

   ForestCoverChange  SoilQualityIndex  WaterUsage  RenewableEnergyUsage  \
0           0.032204                70          33                    13

In [22]:
df.head()

Unnamed: 0,Year,WaterQualityIndex,ClimateChangeImpactScore,LandUseChange,InvasiveSpeciesCount,ConservationFunding,EcoTourismImpact,ForestCoverChange,SoilQualityIndex,WaterUsage,RenewableEnergyUsage,CarbonEmissionLevels,AgriculturalIntensity,HabitatConnectivity,SpeciesReintroductionEfforts,PollinatorDiversity,BiodiversityHealthIndex
0,2010,35,7.542535,6.630073,39,46,1.19381,0.032204,70,33,13,22,1.419332,1.671324,9.311312,0.270434,0.194332
1,2010,13,1.617642,6.477132,45,18,8.484718,-4.803485,69,67,57,63,3.048794,5.815305,4.698086,0.729916,0.525779
2,2010,10,0.904817,9.069428,28,59,3.412444,-2.563852,72,85,37,46,7.863218,9.543694,2.080495,0.068508,0.684795
3,2010,36,7.06019,7.061932,37,88,9.718453,3.425382,64,85,96,60,1.065595,4.938359,2.192657,0.145065,0.926146
4,2010,24,8.439246,4.504786,41,88,9.118186,-1.295086,12,0,85,61,9.785784,2.114002,7.405189,0.809798,0.126345


In [23]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Year                          1000 non-null   int64  
 1   WaterQualityIndex             1000 non-null   int64  
 2   ClimateChangeImpactScore      1000 non-null   float64
 3   LandUseChange                 1000 non-null   float64
 4   InvasiveSpeciesCount          1000 non-null   int64  
 5   ConservationFunding           1000 non-null   int64  
 6   EcoTourismImpact              1000 non-null   float64
 7   ForestCoverChange             1000 non-null   float64
 8   SoilQualityIndex              1000 non-null   int64  
 9   WaterUsage                    1000 non-null   int64  
 10  RenewableEnergyUsage          1000 non-null   int64  
 11  CarbonEmissionLevels          1000 non-null   int64  
 12  AgriculturalIntensity         1000 non-null   float64
 13  Habi

# Data Cleaning

In [13]:
df.isna().sum()

Unnamed: 0,0
Year,0
WaterQualityIndex,0
ClimateChangeImpactScore,0
LandUseChange,0
InvasiveSpeciesCount,0
ConservationFunding,0
EcoTourismImpact,0
ForestCoverChange,0
SoilQualityIndex,0
WaterUsage,0


In [26]:
# Get the shape of the train_data DataFrame
print('The shape of the train_data:', train_data.shape)

# Get the shape of the test_data DataFrame
print('The shape of the test_data:', test_data.shape)

The shape of the train_data: (1000, 17)
The shape of the test_data: (1000, 17)


In [27]:
# Display summary information about the train_data and test_data
print(train_data.info(), test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Year                          1000 non-null   int64  
 1   WaterQualityIndex             1000 non-null   int64  
 2   ClimateChangeImpactScore      1000 non-null   float64
 3   LandUseChange                 1000 non-null   float64
 4   InvasiveSpeciesCount          1000 non-null   int64  
 5   ConservationFunding           1000 non-null   int64  
 6   EcoTourismImpact              1000 non-null   float64
 7   ForestCoverChange             1000 non-null   float64
 8   SoilQualityIndex              1000 non-null   int64  
 9   WaterUsage                    1000 non-null   int64  
 10  RenewableEnergyUsage          1000 non-null   int64  
 11  CarbonEmissionLevels          1000 non-null   int64  
 12  AgriculturalIntensity         1000 non-null   float64
 13  Habi

In [32]:
# counting Null value
def check_null_values(train_data):
    """
    Print the count of null values for each column in a DataFrame.

    This function iterates through each column in the DataFrame to check for the presence of null values.
    If a column contains null values, it prints the column name along with the number of null values.

    Parameters:
    df (DataFrame): The pandas DataFrame to check for null values.

    Returns:
    None: This function does not return a value; it only prints information.
    """
    has_nulls = False
    for column in train_data:
        null_count = train_data[column].isnull().sum()
        if null_count > 0:
            print(f'{column} has {null_count} null values')
            has_nulls = True

    if not has_nulls:
        print('There are no null values in the train_data')

# Printing null value count
check_null_values(train_data)


There are no null values in the train_data


In [33]:
# counting Null value
def check_null_values(test_data):
    """
    Print the count of null values for each column in a DataFrame.

    This function iterates through each column in the DataFrame to check for the presence of null values.
    If a column contains null values, it prints the column name along with the number of null values.

    Parameters:
    df (DataFrame): The pandas DataFrame to check for null values.

    Returns:
    None: This function does not return a value; it only prints information.
    """
    has_nulls = False
    for column in test_data:
        null_count = test_data[column].isnull().sum()
        if null_count > 0:
            print(f'{column} has {null_count} null values')
            has_nulls = True

    if not has_nulls:
        print('There are no null values in the test_data')

# Printing null value count
check_null_values(test_data)

There are no null values in the test_data


In [34]:
# counting duplicate rows
def count_duplicate_rows(train_data):
    """
    Count the number of duplicate rows in a DataFrame.

    This function calculates the total number of duplicate rows in the DataFrame by calling the `duplicated` method,
    which marks duplicates as `True`, and then sums these cases.

    Parameters:
    test_data (pandas.DataFrame): The DataFrame to check for duplicates.

    Returns:
    int: The count of duplicate rows.
    """
    duplicate_count = train_data.duplicated().sum()
    return duplicate_count

# printing duplicate rows count
duplicate_count = count_duplicate_rows(train_data)
print(f"The number of duplicate rows in the train_data: {duplicate_count}")

The number of duplicate rows in the train_data: 0


In [35]:
# counting duplicate rows
def count_duplicate_rows(test_data):
    """
    Count the number of duplicate rows in a DataFrame.

    This function calculates the total number of duplicate rows in the DataFrame by calling the `duplicated` method,
    which marks duplicates as `True`, and then sums these cases.

    Parameters:
    test_data (pandas.DataFrame): The DataFrame to check for duplicates.

    Returns:
    int: The count of duplicate rows.
    """
    duplicate_count = test_data.duplicated().sum()
    return duplicate_count

# printing duplicate rows count
duplicate_count = count_duplicate_rows(test_data)
print(f"The number of duplicate rows in the test_data: {duplicate_count}")

The number of duplicate rows in the test_data: 0


In [31]:
# Check for duplicate rows
duplicates = df.duplicated()
print("Number of duplicate rows:", duplicates.sum())

Number of duplicate rows: 0


# Pre-Processing


In [38]:
# Download NLTK data
!pip install nltk
import nltk # import the nltk library
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [46]:
# Tokenize text using TreebankWordTokenizer
from nltk.tokenize import TreebankWordTokenizer # import the TreebankWordTokenizer class from nltk.tokenize

tokenizer = TreebankWordTokenizer()

# Check for the correct column name and replace 'text' if necessary
# Ensure the column containing the text data is correctly referenced
train_data["correct_column_name"] = train_data["correct_column_name"].apply(tokenizer.tokenize)

# Display the tokenized text
print(train_data["correct_column_name"])

KeyError: 'correct_column_name'

In [47]:
# Tokenize text using TreebankWordTokenizer
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

# Replace 'actual_column_name' with the correct column name from your DataFrame
train_data["tokenized_text"] = train_data["actual_column_name"].apply(tokenizer.tokenize)

# Display the tokenized text
print(train_data["tokenized_text"])

KeyError: 'actual_column_name'