# Imports and library functions

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from dataclasses import dataclass
from functools import reduce

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pandas import DataFrame
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import plot_tree
from ydata_profiling import ProfileReport

from lib.chartSpecificData import survivalFrame
from lib.clean import cleanAge, cleanEmbarked, cleanFare, dropIrrelevant
from lib.featureEngineering import addFamilyCountData, addIsAdult
from lib.numericConversion import (SexConversion, embarkedConverter,
                                   sexConverter)

### matplotlib options

This cell is only needed because of my local setup: it selects the `ipympl` backend and turns off interactive plotting.

In [None]:
# Select the "ipympl" backend for rendering figures.
matplotlib.use("ipympl")
# Turn off pyplot's interactive mode so figures are not drawn automatically
# as they are created; they have to be shown explicitly.
plt.ioff()

## Compose
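A helper that enables functional composition: it chains any number of single-argument functions into one function, applying them from left to right.

`compose :: function, function, ... -> function`

`compose(f, g, h, i)(x)` is equivalent to `i(h(g(f(x))))`.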

In [None]:
def compose(*funcs):
    """Chain ``funcs`` into one single-argument function, applied left to right.

    ``compose(f, g, h)(x)`` evaluates ``h(g(f(x)))``. Called with no
    functions, the result is the identity function.
    """

    def identity(x):
        return x

    def chain(first, second):
        # Build a function that feeds the output of ``first`` into ``second``.
        def piped(x):
            return second(first(x))

        return piped

    # Fold the functions pairwise, starting from the identity so that an
    # empty argument list still yields a usable callable.
    return reduce(chain, funcs, identity)

## Import The Data

In [None]:

# Location of the Titanic CSV in the datasciencedojo/datasets GitHub repository.
TITANIC_CSV_URL = (
    "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"
)

# Download the CSV and parse it into the raw, unprocessed frame.
data: DataFrame = pd.read_csv(TITANIC_CSV_URL)

# Data Exploration Part I

### Get the first few rows

In [None]:
data.head()

### Get the last few rows

In [None]:
data.tail()

### Get summary, data types, that sort of thing

In [None]:
data.info()

### Get descriptive statistics

In [None]:
data.describe()

### Get number of nulls

In [None]:
data.isnull().sum()

### ydata-profiling report

In [None]:
# ProfileReport(data, title="Titanic Profiling Report")

# Data Preprocessing

Here we are just getting rid of null values and dropping irrelevant data that we don't need.

In [None]:
# Cleaning pipeline; the steps run in the order listed.
cleanData = compose(
    cleanAge,
    cleanFare,
    cleanEmbarked,
    dropIrrelevant,
)

# Numeric-conversion pipeline: sexConverter runs first, then embarkedConverter.
convertDataToNumeric = compose(sexConverter, embarkedConverter)

# Clean the raw frame first, then convert the cleaned result to numeric form.
processedData = convertDataToNumeric(cleanData(data))
processedDataRows = len(processedData)

# Data Exploration Part II and Data Visualization