# Homework 1

## Dependencies

In [None]:
import csv
import typing as t
import warnings
from collections import Counter

import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats

warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
class Person:
    """One record (row) of the obesity dataset.

    The attribute names are the keyword names accepted by ``__init__``, so a
    mapping with exactly these keys can be expanded into the constructor with
    ``Person(**row)``. The constructor stores every value unchanged; it does
    not convert or validate types.
    """

    Gender: str
    Age: int
    Height: float
    Weight: float
    family_history_with_overweight: str
    FAVC: str
    FCVC: int
    NCP: int
    CAEC: str
    SMOKE: str
    CH2O: int
    SCC: str
    FAF: int  # numeric, in agreement with the ``__init__`` annotation
    TUE: int
    CALC: str
    MTRANS: str
    NObeyesdad: str

    def __init__(
        self,
        Gender: str,
        Age: int,
        Height: float,
        Weight: float,
        family_history_with_overweight: str,
        FAVC: str,
        FCVC: int,
        NCP: int,
        CAEC: str,
        SMOKE: str,
        CH2O: int,
        SCC: str,
        FAF: int,
        TUE: int,
        CALC: str,
        MTRANS: str,
        NObeyesdad: str,
    ):
        """Store every field as given (no conversion, no validation)."""
        self.Gender = Gender
        self.Age = Age
        self.Height = Height
        self.Weight = Weight
        self.family_history_with_overweight = family_history_with_overweight
        self.FAVC = FAVC
        self.FCVC = FCVC
        self.NCP = NCP
        self.CAEC = CAEC
        self.SMOKE = SMOKE
        self.CH2O = CH2O
        self.SCC = SCC
        self.FAF = FAF
        self.TUE = TUE
        self.CALC = CALC
        self.MTRANS = MTRANS
        self.NObeyesdad = NObeyesdad

    def __str__(self):
        """Return a compact JSON-like rendering of the record.

        Fields annotated as ``str`` are wrapped in double quotes; fields
        annotated as numbers are emitted bare. Values are interpolated as-is
        (no escaping is applied).
        """
        return (
            "{"
            + f'"Gender": "{self.Gender}",'
            + f'"Age": {self.Age},'
            + f'"Height": {self.Height},'
            + f'"Weight": {self.Weight},'
            + f'"family_history_with_overweight": "{self.family_history_with_overweight}",'
            + f'"FAVC": "{self.FAVC}",'
            + f'"FCVC": {self.FCVC},'
            + f'"NCP": {self.NCP},'
            + f'"CAEC": "{self.CAEC}",'
            + f'"SMOKE": "{self.SMOKE}",'
            + f'"CH2O": {self.CH2O},'
            # SCC is a string field, so it must be quoted like the others.
            + f'"SCC": "{self.SCC}",'
            # FAF is a numeric field, so it is emitted without quotes.
            + f'"FAF": {self.FAF},'
            + f'"TUE": {self.TUE},'
            + f'"CALC": "{self.CALC}",'
            + f'"MTRANS": "{self.MTRANS}",'
            + f'"NObeyesdad": "{self.NObeyesdad}"'
            + "}"
        )

    def __len__(self):
        """Return the number of fields a record carries (always 17)."""
        return 17

    # The debugging representation is the same JSON-like text as ``str()``.
    __repr__ = __str__

In [None]:
# Names of the Person attributes that are analysed as numeric series.
numeric_variables = [
    "Age",
    "Height",
    "Weight",
    "FCVC",
    "NCP",
    "CH2O",
    "FAF",
    "TUE",
]

# Names of the Person attributes that are analysed as categorical series.
categorical_values = [
    "FAVC",
    "CAEC",
    "CALC",
    "SCC",
    "MTRANS",
    "Gender",
    "family_history_with_overweight",
    "SMOKE",
    "NObeyesdad",
]

In [None]:
class DatasetManager:
    """Loads the dataset CSV into ``Person`` objects and reshapes them.

    The three ``obj_list_to_*`` helpers are static: they only read the named
    attributes from the records they are given.
    """

    def __init__(self, path_to_csv: str):
        """Remember the path of the CSV file; nothing is read yet."""
        self.path_to_csv = path_to_csv

    def load_as_obj_list(self) -> list[Person]:
        """Read the CSV file and return one ``Person`` per data row.

        Each row dictionary produced by ``csv.DictReader`` is expanded into
        the ``Person`` constructor, so the CSV header names must match the
        constructor's parameter names. Values are passed through exactly as
        the reader yields them.
        """
        # The csv module asks for files to be opened with newline="" so that
        # it can handle line endings (including newlines embedded in quoted
        # fields) itself.
        with open(self.path_to_csv, newline="") as csv_file:
            csv_reader = csv.DictReader(csv_file)
            return [Person(**row) for row in csv_reader]

    @staticmethod
    def obj_list_to_np_array(data: list[Person], attrs_list: list[str] = numeric_variables + categorical_values) -> np.ndarray:
        """Return a NumPy array with one row per record and one column per attribute.

        Columns follow the order of ``attrs_list``; by default that is every
        numeric variable followed by every categorical one.
        """
        return np.array([[getattr(entry, field) for field in attrs_list] for entry in data])

    @staticmethod
    def obj_list_to_list(data: t.List[Person], attrs_list: t.List[str] = categorical_values) -> t.List[t.List[str]]:
        """Return a list of rows, each row holding the requested attributes of one record."""
        return [[getattr(entry, field) for field in attrs_list] for entry in data]

    @staticmethod
    def obj_list_to_flat_list(data: t.List[Person], attrs_list: t.List[str] = categorical_values) -> t.List[str]:
        """Return the requested attributes of all records as one flat list.

        The result is grouped by attribute: all values of the first attribute
        (in record order) come first, then all values of the second, and so
        on. With a single attribute this is simply that column.
        """
        return [getattr(entry, field) for field in attrs_list for entry in data]


In [None]:
# Point the manager at the dataset file, then load every row as a Person.
dataset_manager = DatasetManager(path_to_csv="data/ObesityDataSet.csv")
dataset_obj_list = dataset_manager.load_as_obj_list()


## Univariate analysis

The following will be applied:
1. central tendency
1. spread
1. distribution form (skewness, kurtosis)
1. frequency of categorical data
1. graphs
    1. histograms
    1. density
    1. boxplots

### 1. Central tendency

Calculates mean, median and mode for each data series.

In [None]:
def calculate_central_tendency_numerical(np_dataset: np.ndarray) -> t.Tuple[float, float, float]:
    """Return ``(mean, median, mode)`` of a numeric data series.

    The input is flattened first, so all three statistics describe the same
    set of values. Each result is a plain Python ``float``.

    Args:
        np_dataset: numeric values (any array-like accepted by ``np.asarray``).

    Returns:
        A ``(mean, median, mode)`` tuple. For empty input all three are NaN.
    """
    values = np.asarray(np_dataset).ravel()
    if values.size == 0:
        # No data: report "undefined" uniformly rather than depending on how
        # each library call happens to treat empty input.
        nan = float("nan")
        return nan, nan, nan

    mean = float(np.mean(values))
    median = float(np.median(values))
    # Depending on the SciPy version, ``.mode`` is either a scalar or a
    # length-1 array; ``atleast_1d(...)[0]`` yields the scalar in both cases.
    mode = float(np.atleast_1d(stats.mode(values).mode)[0])

    return mean, median, mode


In [None]:
# Print mean, median and mode for every numeric variable.
for numerical_var in numeric_variables:
    # Extract the single requested column, then turn it into a flat float64 vector.
    single_column = DatasetManager.obj_list_to_np_array(dataset_obj_list, [numerical_var])
    dataset_for_numerical_val = single_column.astype(np.float64).reshape(-1)

    mean, median, mode = calculate_central_tendency_numerical(dataset_for_numerical_val)

    # NOTE: not sure the mode is meaningful for numerical variables as well.
    print(
        f"On numerical var {numerical_var}",
        f"Mean: {mean}",
        f"Median: {median}",
        f"Mode: {mode}\n{['-' * 10]}",
        sep="\n",
    )

### 2. Spread

Calculates the spread of data for each data series. This is useful for knowing whether the data clusters around a "central tendency" or is widely dispersed.

In [None]:
def calculate_spread_numerical(np_dataset: np.array) -> t.Tuple[float, float, float]:
    """Return ``(range, variance, standard deviation)`` of a data series.

    The range is the peak-to-peak distance (max minus min); variance and
    standard deviation are computed with NumPy's default settings.
    """
    return np.ptp(np_dataset), np.var(np_dataset), np.std(np_dataset)

In [None]:
# Print range, variance and standard deviation for every numeric variable.
for numerical_var in numeric_variables:
    # Extract the single requested column, then turn it into a flat float64 vector.
    single_column = DatasetManager.obj_list_to_np_array(dataset_obj_list, [numerical_var])
    dataset_for_numerical_val = single_column.astype(np.float64).reshape(-1)

    (
        dataset_range,
        dataset_variance,
        dataset_standard_deviation,
    ) = calculate_spread_numerical(dataset_for_numerical_val)

    print(
        f"On numerical var {numerical_var}",
        f"Range: {dataset_range}",
        f"Variance: {dataset_variance}",
        f"Standard deviation: {dataset_standard_deviation}",
        f"{['-' * 10]}",
        sep="\n",
    )

### 3. Skewness, kurtosis

Calculates skewness and kurtosis of the dataset.

Meaning for skewness:
* Positively skewed (right-skewed):
    * The distribution is positively skewed **if the distribution's tail on the right side is longer or "fatter" than the left side**. This means that there are more data points on the left side, and the distribution has a longer right tail.
    * Values: `> 1`
* Negatively skewed (left-skewed):
    * The distribution is negatively skewed **if the distribution's tail on the left side is longer or "fatter" than the right side**. This means that there are more data points on the right side, and the distribution has a longer left tail.
    * Values: `< -1`
* Symmetric: 
    * If the distribution is _roughly_ the same on both sides, it is symmetric, and the skewness is close to 0.
    * Values: `~ 0`

Meaning of kurtosis:
* Mesokurtic (Normal distribution):
    * A distribution with kurtosis similar to that of a normal distribution
    * Values: `~ 0`
* Leptokurtic:
    * A distribution with positive kurtosis, indicating heavier tails and a more peaked central region compared to a normal distribution
    * Values: `> 1`
* Platykurtic:
    * A distribution with negative kurtosis, indicating lighter tails and a flatter central region compared to a normal distribution
    * Values: `< -1`

In [None]:
def calculate_skewness_kurtosis_numerical(np_dataset: np.array) -> t.Tuple[float, float]:
    """Return ``(skewness, kurtosis)`` of a data series.

    Both values come from ``scipy.stats`` called with its default settings.
    """
    return stats.skew(np_dataset), stats.kurtosis(np_dataset)

In [None]:
# Print skewness and kurtosis for every numeric variable.
for numerical_var in numeric_variables:
    # Extract the single requested column, then turn it into a flat float64 vector.
    single_column = DatasetManager.obj_list_to_np_array(dataset_obj_list, [numerical_var])
    dataset_for_numerical_val = single_column.astype(np.float64).reshape(-1)

    dataset_skewness, dataset_kurtosis = calculate_skewness_kurtosis_numerical(
        dataset_for_numerical_val
    )

    print(
        f"On numerical var {numerical_var}",
        f"Skewness: {dataset_skewness}",
        f"Kurtosis: {dataset_kurtosis}",
        f"{['-' * 10]}\n",
        sep="\n",
    )

### 4. Frequency of categorical data

Here we count how often each category value occurs in a data series.

In [None]:
def calculate_frequency_of_data_categorial(dataset: t.List[str]) -> t.Dict[str, int]:
    """Count how many times each distinct value occurs in ``dataset``.

    Args:
        dataset: the values of one categorical series.

    Returns:
        A plain ``dict`` mapping each distinct value to its number of
        occurrences, with keys in order of first appearance. Empty input
        yields an empty dict.
    """
    # Counter does the tallying; converting back keeps the return type a
    # plain dict, exactly as callers have always received.
    return dict(Counter(dataset))


    

In [None]:
# Print the occurrence count of every distinct value, per categorical variable.
for categorial_var in categorical_values:
    # Requesting a single attribute makes the flat list exactly that column.
    dataset_for_categorial_val = DatasetManager.obj_list_to_flat_list(
        dataset_obj_list,
        [categorial_var],
    )
    dataset_frequency = calculate_frequency_of_data_categorial(
        dataset_for_categorial_val
    )

    print(f"On categorial var {categorial_var}")
    for entry in dataset_frequency:
        print(f"Frequency of value \"{entry}\": {dataset_frequency[entry]}")
    print(f"{['-' * 10]}")

    

### 5. Graphs

Here we can find histograms, density charts and boxplots.

#### 5.1. Histograms

Histograms plot how frequently we meet a data entry from the dataset.

In [None]:
# Draw one 30-bin histogram per numeric variable, each in its own figure.
for numerical_var in numeric_variables:
    # Extract the single requested column, then turn it into a flat float64 vector.
    single_column = DatasetManager.obj_list_to_np_array(dataset_obj_list, [numerical_var])
    dataset_for_numerical_val = single_column.astype(np.float64).reshape(-1)

    plt.figure()
    plt.hist(
        dataset_for_numerical_val,
        bins=30,
        color='lightblue',
        edgecolor='black',
        alpha=0.7,
    )
    plt.title(f"Histogram for numerical variable \"{numerical_var}\"")

In [None]:
# Draw one bar chart of value counts per categorical variable.
for categorial_var in categorical_values:
    # Requesting a single attribute makes the flat list exactly that column.
    dataset_for_categorial_val = DatasetManager.obj_list_to_flat_list(
        dataset_obj_list,
        [categorial_var],
    )
    dataset_frequency = calculate_frequency_of_data_categorial(dataset_for_categorial_val)

    # Bar labels and bar heights, in the same (dictionary) order.
    dataset_keys = list(dataset_frequency)
    dataset_values = list(dataset_frequency.values())

    plt.figure()
    plt.bar(
        dataset_keys,
        dataset_values,
        color='lightblue',
        edgecolor='black',
        alpha=0.7,
    )
    plt.title(f"Histogram for categorial variable \"{categorial_var}\"")

#### 5.2. Density charts

Density charts plot how frequently we meet a data entry from the dataset and what distribution they follow.

In [None]:
# Draw a kernel density estimate over a normalised histogram for every
# numeric variable, each in its own figure.
for numerical_var in numeric_variables:
    dataset_for_numerical_val = DatasetManager.obj_list_to_np_array(dataset_obj_list, [numerical_var]).astype(np.float64).reshape(-1)

    plt.figure()
    # `bw` is deprecated in seaborn in favour of `bw_method`; the deprecation
    # warning was being hidden by the global warnings filter. Passing the same
    # value as `bw_method` is the replacement seaborn itself applies.
    sns.kdeplot(dataset_for_numerical_val, bw_method=0.1)
    # density=True puts the histogram on the same scale as the KDE curve.
    plt.hist(dataset_for_numerical_val, bins=30, density=True, color='lightblue', edgecolor='black', alpha=0.4)

    plt.xlabel('Values')
    plt.ylabel('Density')
    plt.title(f"Density Chart for \"{numerical_var}\"")

#### 5.3. Boxplots

Boxplot charts show how the data "behaves":
* min
* max
* quantiles
* outliers
* median
* inter-quartile range (contains 50% of the data)
* Skewness of data
* Robustness to extreme values
* etc.

In [None]:
# Draw one boxplot per numeric variable, each in its own figure.
for numerical_var in numeric_variables:
    # Extract the single requested column, then turn it into a flat float64 vector.
    single_column = DatasetManager.obj_list_to_np_array(dataset_obj_list, [numerical_var])
    dataset_for_numerical_val = single_column.astype(np.float64).reshape(-1)

    plt.figure()
    plt.boxplot(dataset_for_numerical_val, labels=[numerical_var])
    plt.title(f"Boxplot for \"{numerical_var}\"")
    plt.xlabel("Group")
    plt.ylabel("Values")

## Bivariate/multivariate analysis

The following will be applied:
1. correlations between data series
1. independence test
1. medium-test between populations
1. some visualisations:
    1. scatter-plots
    1. 3D graphs
    1. scatter-plots on main components
    1. non-linear mappings in 2d space: Sammon, t-SNE, uMap
    1. "projection pursuit" methodologies
    1. conditional boxplots
    1. overlaid histograms
    1. corrgrams 