In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split

Load data

In [None]:
# Location of the training CSV: directory and file name are kept separate so
# either can be changed on its own; DATA_PATH joins them with a forward slash.
MAIN_DATA_DIR = "data"
MAIN_DATA_FILE = "CW1_train.csv"
DATA_PATH = "/".join((MAIN_DATA_DIR, MAIN_DATA_FILE))

In [None]:
# Small positive constant. NOTE(review): presumably meant to keep divisions
# finite when a denominator is zero (the standardisation cell adds the same
# value, 1e-8, to the standard deviation) — confirm.
EPS = 1e-8
# Seed passed as `random_state` to train_test_split so the splits are repeatable.
REPRODUCIBILITY_SEED = 42

In [None]:
# Read the CSV at DATA_PATH into a DataFrame; every later cell works from `data`.
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Show the column labels of the loaded frame.
data.columns

In [None]:
# Column inventory: every column name first, then the numeric ones that will
# be treated as features.
all_columns = list(data.columns)
print(all_columns)

numeric_columns = list(data.select_dtypes(include=["number"]).columns)
# "outcome" is the target, not a feature; .remove raises ValueError if it is
# not among the numeric columns.
numeric_columns.remove("outcome")
print(numeric_columns)

Checking for missing values

In [None]:
# Report the number of missing entries in every column.
# pandas counts None and NaN alike as missing, and `isnull()` is only an alias
# of `isna()`, so a single count per column is the complete picture; printing
# separate "NaN" and "None" figures would just repeat the same number.
for column in all_columns:
    num_missing = data[column].isna().sum()
    print(f"{column} | Number of missing values (NaN or None): {num_missing}")

Data statistics

In [None]:
def print_statistics(data: "pd.Series | pd.DataFrame", column: str) -> None:
    """
    Prints the mean, median, standard deviation, minimum and maximum
    of a single column.

    Args:
        data (pd.Series | pd.DataFrame): Either the column's values as a
            Series, or a DataFrame that contains `column`. When a DataFrame
            is given, only `column` is summarised.
        column (str): The name of the column. Used as the label in the
            printed report and, for DataFrame input, to select the values.

    Raises:
        KeyError: If `data` is a DataFrame that has no column named `column`.
    """
    # Callers in this notebook pass a Series (data[column]); a DataFrame is
    # narrowed to the requested column so the report always covers one column.
    series = data[column] if isinstance(data, pd.DataFrame) else data

    print(f"Statistics for column: {column}")
    print(f"Mean: {series.mean()}")
    print(f"Median: {series.median()}")
    print(f"Standard Deviation: {series.std()}")
    print(f"Minimum Value: {series.min()}")
    print(f"Maximum Value: {series.max()}")
    print()

In [None]:
# for column in numeric_columns:
#     print_statistics(data[column], column)

Data visualisation

In [None]:
def plot_distribution(data: "pd.Series | pd.DataFrame", column: str, title: str) -> None:
    """
    Plots a histogram of the values of a single column.

    Args:
        data (pd.Series | pd.DataFrame): Either the column's values as a
            Series, or a DataFrame that contains `column`. When a DataFrame
            is given, only `column` is plotted.
        column (str): The name of the column. Used as the x-axis label and,
            for DataFrame input, to select the values.
        title (str): The title of the plot.

    Raises:
        KeyError: If `data` is a DataFrame that has no column named `column`.
    """
    # Callers in this notebook pass a Series (data[column]); a DataFrame is
    # narrowed to the requested column so the histogram shows one column only.
    series = data[column] if isinstance(data, pd.DataFrame) else data

    series.plot.hist(title=title)
    plt.xlabel(column)
    plt.ylabel("Frequency")
    plt.show()

In [None]:
# for column in numeric_columns:
#     plot_distribution(data[column], column, f"Distribution of {column}")

In [None]:
# Histogram of the target variable over the full data set.
plot_distribution(
    data=data["outcome"],
    column="outcome",
    title="Distribution of outcome (target variable)",
)

Data splitting

In [None]:
# Two-stage split, both stages seeded for repeatability:
#   1. hold out 20% of all rows as the test set;
#   2. hold out 20% of the remaining rows as the validation set.
# Overall this is roughly 64% training / 16% validation / 20% test.
training_and_val_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=REPRODUCIBILITY_SEED,
)
training_data, val_data = train_test_split(
    training_and_val_data,
    test_size=0.2,
    random_state=REPRODUCIBILITY_SEED,
)

Data normalisation (standardisation to zero mean and unit standard deviation, using the training-set mean and standard deviation)

In [None]:
# Work on an independent copy: the frame returned by train_test_split is
# derived from `data`, and assigning columns to it directly can trigger
# pandas' SettingWithCopyWarning.
training_data = training_data.copy()

# NOTE(review): only training_data is standardised in the visible part of this
# cell — confirm that val_data and test_data are transformed with the same
# training mean/std before they are used.
for column in numeric_columns:
    # Statistics are taken from the training split only.
    mean = training_data[column].mean()
    std = training_data[column].std()
    print("Before standardization:")
    plot_distribution(training_data[column], column, f"Distribution of column '{column}'")
    print_statistics(training_data[column], column)

    print("After standardization:")
    # EPS (the notebook-level constant, 1e-8) keeps the division finite when a
    # column is constant and its standard deviation is zero.
    training_data[column] = (training_data[column] - mean) / (std + EPS)
    plot_distribution(training_data[column], column, f"Standardized distribution of column '{column}'")
    print_statistics(training_data[column], column)