# Creating some functions for data filtering/manipulation

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
from pandas.plotting import parallel_coordinates
from typing import List

In [3]:
# Load the data set.
# A forward slash is used as the path separator: "\c" is not a valid escape
# sequence inside a normal string literal, and a backslash is only treated as a
# separator on Windows, whereas "/" is accepted on Windows and POSIX alike.
df_01: pd.DataFrame = pd.read_csv("datasets/climate_change_indicators.csv")

A function that generates a matrix of missing data (NaN)

In [4]:
def missing_data_from(dataframe: pd.DataFrame) -> "plt.Axes":
    """Draw the missingno nullity matrix for ``dataframe``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame whose missing values (NaN) should be visualised.

    Returns
    -------
    plt.Axes
        The object returned by ``msno.matrix`` for this frame, i.e. the
        matplotlib Axes the matrix was drawn on.
    """
    # The annotation names the type of the returned object; ``msno.matrix`` is
    # the plotting function itself and is not usable as a type.
    return msno.matrix(dataframe)

A function to remove the instances with missing data

In [5]:
def remove_nan_rows_from(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame holding only the rows that have no missing value.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame to filter; it is not modified.

    Returns
    -------
    pd.DataFrame
        The rows of ``dataframe`` in which every column is populated.
    """
    # Spell out the dropna defaults: operate on rows, drop when any value is missing.
    complete_rows: pd.DataFrame = dataframe.dropna(axis=0, how="any")
    return complete_rows

A function to remove every object-type column except the label column

In [6]:
def remove_object_columns_from(dataframe: pd.DataFrame, label_column: str) -> pd.DataFrame:
    """Drop every object-dtype column of ``dataframe`` except ``label_column``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame to filter; it is not modified.
    label_column : str
        Name of the column to keep even if its dtype is ``object``.

    Returns
    -------
    pd.DataFrame
        A new frame without the object-dtype columns other than
        ``label_column``. Columns of any other dtype are kept as they are.
    """
    # Filter instead of list.remove(): remove() raises ValueError when the label
    # column is not object-typed (or is absent), although in that case there is
    # simply nothing to protect from the drop.
    columns_to_drop: List = [
        column
        for column in dataframe.select_dtypes(include="object").columns
        if column != label_column
    ]
    return dataframe.drop(columns=columns_to_drop)

Now a function to reduce the number of dimensions (columns)

In [7]:
def reduce_dimension_number_from(dataframe: pd.DataFrame, dimension_width: int = 9) -> pd.DataFrame:
    """Keep a sample of columns taken from the start, middle and end of a frame.

    ``dimension_width // 3`` columns are taken from each of three places:

    * the beginning, starting at position 1 (position 0 is skipped; the
      original note says this is to avoid the label column - TODO confirm
      that the label should be absent from the result);
    * the middle, starting at position ``n_columns // 2``;
    * the end of the frame.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame to sample columns from; it is not modified.
    dimension_width : int, default 9
        Target number of columns. Each group contributes
        ``dimension_width // 3`` columns, so values below 3 select nothing.

    Returns
    -------
    pd.DataFrame
        A new frame with the selected columns in their original order. On
        frames too narrow for three disjoint groups each column appears only
        once, so fewer than ``dimension_width`` columns are returned.
    """
    # number of columns in the dataframe
    n_columns: int = len(dataframe.columns)
    # columns contributed by each group; never negative
    group_size: int = max(dimension_width // 3, 0)

    # The leading group ends at 1 + group_size so that, like the other two
    # groups, it holds group_size columns (ending at group_size gave one fewer).
    first_positions = range(1, min(1 + group_size, n_columns))
    middle_start: int = n_columns // 2
    middle_positions = range(middle_start, min(middle_start + group_size, n_columns))
    # Built from explicit bounds: a slice of the form ``-group_size:`` would
    # select every column when group_size is 0.
    last_positions = range(max(n_columns - group_size, 0), n_columns)

    # dict.fromkeys de-duplicates while keeping order, so overlapping groups on
    # narrow frames do not yield repeated columns.
    selected_positions = list(
        dict.fromkeys([*first_positions, *middle_positions, *last_positions])
    )

    return dataframe.iloc[:, selected_positions]

A function that checks if the 1st column is the label one (object type)

In [8]:
def is_label_spot_correct(dataframe: pd.DataFrame) -> bool:
    """Tell whether the first column of ``dataframe`` has the ``object`` dtype.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame whose first column (position 0) is inspected.

    Returns
    -------
    bool
        ``True`` when the dtype of the first column compares equal to
        ``'object'``; ``False`` otherwise, including when the frame has no
        columns at all.
    """
    # Nothing to inspect on a frame without columns.
    if dataframe.shape[1] == 0:
        return False

    # ``dtypes`` is indexed by column label, so a plain ``[0]`` is a label
    # lookup (deprecated positional fallback, KeyError with integer labels).
    # ``.iloc[0]`` always addresses the first column by position.
    return bool(dataframe.dtypes.iloc[0] == "object")

In [9]:
# df_01 = df_01.drop(columns='ObjectId')
# print(is_label_spot_correct(df_01))