# Creating some functions for data filtering/manipulation

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
from pandas.plotting import parallel_coordinates
from typing import List

In [2]:
# Load the data set. A forward slash is used as the path separator so the
# literal contains no invalid "\c" escape and resolves on every OS.
df_01: pd.DataFrame = pd.read_csv("datasets/climate_change_indicators.csv")

A function that generates a matrix of missing data (NaN)

In [3]:
def missing_data_from(dataframe: pd.DataFrame) -> "plt.Axes":
    """Draw the missingno nullity matrix of ``dataframe``.

    Args:
        dataframe: The frame whose missing values should be visualised.

    Returns:
        Whatever ``msno.matrix`` returns for ``dataframe``; presumably the
        matplotlib Axes holding the plot (confirm against the installed
        missingno version).
    """
    # ``msno.matrix`` is a function, not a type, so it cannot serve as the
    # return annotation; the annotation is quoted to keep it lazily evaluated.
    return msno.matrix(dataframe)

A function to remove the instances with missing data

In [4]:
def remove_nan_rows_from(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame holding only the rows of ``dataframe`` without missing values.

    Args:
        dataframe: The frame to filter; it is not modified.

    Returns:
        The rows of ``dataframe`` in which no cell is missing.
    """
    # axis=0 / how="any" are the ``dropna`` defaults, spelled out for readers.
    complete_rows: pd.DataFrame = dataframe.dropna(axis=0, how="any")
    return complete_rows

A function to remove every object-type column except the label column

In [5]:
def remove_object_columns_from(dataframe: pd.DataFrame, label_column: str) -> pd.DataFrame:
    """Drop every object-typed column of ``dataframe`` except ``label_column``.

    Args:
        dataframe: The frame to filter; it is not modified.
        label_column: Name of the column to keep even if it is object-typed.

    Returns:
        A new frame without the object-typed columns other than ``label_column``.
    """
    object_columns: List[str] = dataframe.select_dtypes(include="object").columns.to_list()
    # Filter instead of ``list.remove`` so a label column that is not
    # object-typed (or not present) no longer raises ValueError.
    columns_to_drop: List[str] = [column for column in object_columns if column != label_column]
    return dataframe.drop(columns=columns_to_drop)

Now a function to reduce the number of dimensions (columns)

In [6]:
def reduce_dimension_number_from(dataframe: pd.DataFrame, dimension_width: int = 9) -> pd.DataFrame:
    """Keep a sample of columns from the start, middle and end of ``dataframe``.

    Each of the three samples holds ``dimension_width // 3`` columns. The
    start sample begins at position 1, so column 0 is skipped there.

    Args:
        dataframe: The frame to sample columns from; it is not modified.
        dimension_width: Total number of columns wanted; split evenly across
            the three samples.

    Returns:
        A frame with the sampled columns in start, middle, end order. A column
        reached by more than one sample appears only once, and no columns are
        returned when ``dimension_width`` is smaller than 3.
    """
    # number of columns in the dataframe
    n_columns: int = len(dataframe.columns)
    # columns per sample; clamped so a negative width cannot build odd slices
    sample_width: int = max(dimension_width // 3, 0)
    middle: int = n_columns // 2

    # Start sample: begins at 1 and spans a full ``sample_width`` columns
    # (the previous upper bound left this sample one column short).
    head_positions = range(1, min(1 + sample_width, n_columns))
    middle_positions = range(middle, min(middle + sample_width, n_columns))
    # Explicit lower bound: a ``-0:`` slice would select every column.
    tail_positions = range(max(n_columns - sample_width, 0), n_columns)

    # dict.fromkeys removes positions shared by overlapping samples while
    # preserving the start, middle, end ordering.
    positions = list(dict.fromkeys([*head_positions, *middle_positions, *tail_positions]))
    return dataframe.iloc[:, positions]

A function that checks if the 1st column is the label one (object type)

In [7]:
def is_label_spot_correct(dataframe: pd.DataFrame) -> bool:
    """Tell whether the first column of ``dataframe`` has the object dtype.

    Args:
        dataframe: The frame to inspect.

    Returns:
        True when the column at position 0 is object-typed; False otherwise,
        including when the frame has no columns.
    """
    if dataframe.shape[1] == 0:
        return False
    # ``.iloc`` makes the lookup positional; ``dtypes[0]`` is a label lookup
    # and would pick a column literally named 0 instead of the first one.
    return bool(dataframe.dtypes.iloc[0] == "object")

In [8]:
# def normalize_data_from(dataframe: pd.DataFrame) -> pd.DataFrame:
#     numeric_columns = dataframe.select_dtypes(include=['float64', 'int64']).columns
#     return (dataframe[numeric_columns] - dataframe[numeric_columns].mean()) / dataframe[numeric_columns].std()

In [9]:
# df_02: pd.DataFrame = pd.read_csv("datasets\climate_change_indicators.csv")
# aux_dataframe = normalize_data_from(df_02)
# aux_dataframe.head()

In [14]:
def normalize_data_from(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Min-max scale the ``float64``/``int64`` columns of ``dataframe``.

    Each selected column is rescaled with ``(x - min) / (max - min)``; every
    other column is left as it is. Every ``int64`` column is rescaled,
    identifier-like ones included.

    Args:
        dataframe: The frame to normalize; it is not modified.

    Returns:
        A copy of ``dataframe`` with the selected columns rescaled. A column
        whose values are all equal becomes 0.0 instead of NaN.
    """
    normalized = dataframe.copy()  # work on a copy of the original DataFrame
    numeric_columns = normalized.select_dtypes(include=['float64', 'int64']).columns
    if numeric_columns.empty:
        return normalized

    numeric_part = normalized[numeric_columns]
    column_min = numeric_part.min()
    value_range = numeric_part.max() - column_min
    # A constant column has a zero range, which would turn 0 / 0 into NaN;
    # dividing by 1 instead maps such a column to 0.0.
    safe_range = value_range.where(value_range != 0, 1)
    normalized[numeric_columns] = (numeric_part - column_min) / safe_range
    return normalized

In [15]:
# Reload the raw data and preview its min-max normalized version. A forward
# slash keeps the path free of the invalid "\c" escape and portable across OSes.
df_03: pd.DataFrame = pd.read_csv("datasets/climate_change_indicators.csv")
aux_dataframe = normalize_data_from(df_03)
aux_dataframe.head()

Unnamed: 0,ObjectId,Country,ISO2,ISO3,Indicator,Unit,Source,CTS_Code,CTS_Name,CTS_Full_Descriptor,...,F2013,F2014,F2015,F2016,F2017,F2018,F2019,F2020,F2021,F2022
0,0.0,"Afghanistan, Islamic Rep. of",AF,AFG,Temperature change with respect to a baseline ...,Degree Celsius,Food and Agriculture Organization of the Unite...,ECCS,Surface Temperature Change,"Environment, Climate Change, Climate Indicator...",...,0.762623,0.195994,0.500493,0.590765,0.615105,0.515391,0.325881,0.077701,0.564979,0.729332
1,0.004464,Albania,AL,ALB,Temperature change with respect to a baseline ...,Degree Celsius,Food and Agriculture Organization of the Unite...,ECCS,Surface Temperature Change,"Environment, Climate Change, Climate Indicator...",...,0.796721,0.461373,0.656918,0.54957,0.44588,0.706393,0.615764,0.366551,0.632377,0.620712
2,0.008929,Algeria,DZ,DZA,Temperature change with respect to a baseline ...,Degree Celsius,Food and Agriculture Organization of the Unite...,ECCS,Surface Temperature Change,"Environment, Climate Change, Climate Indicator...",...,0.704262,0.637339,0.509694,0.682209,0.603796,0.383583,0.403562,0.490179,0.888423,0.658091
3,0.013393,American Samoa,AS,ASM,Temperature change with respect to a baseline ...,Degree Celsius,Food and Agriculture Organization of the Unite...,ECCS,Surface Temperature Change,"Environment, Climate Change, Climate Indicator...",...,0.746885,0.451359,0.472889,0.583522,0.572698,0.375296,0.564229,0.346909,0.545953,0.563105
4,0.017857,"Andorra, Principality of",AD,AND,Temperature change with respect to a baseline ...,Degree Celsius,Food and Agriculture Organization of the Unite...,ECCS,Surface Temperature Change,"Environment, Climate Change, Climate Indicator...",...,0.467541,0.728898,0.696681,0.787687,0.770598,0.663378,0.725275,0.673888,0.631409,1.0
