# PART 1: DATA EXPLORATION

## Imports

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import Counter
from pprint import pprint
from matplotlib import pyplot as plt
import seaborn as sns
from enum import Enum
from pathlib import Path

# Apply seaborn's "whitegrid" theme to every plot drawn in this notebook.
# set_theme() is the preferred spelling; sns.set() is only kept as an alias.
sns.set_theme(style="whitegrid")

## Load Data

In [None]:
# Folder that holds the input data, given as a path relative to the working directory.
data_dir = Path('../data')

In [None]:
# Read Dataset1.csv from data_dir into `df`, the DataFrame the rest of the notebook works on.
data_path = data_dir / 'Dataset1.csv'
df = pd.read_csv(data_path)

## Explore the data

In [None]:
# Show the first 50 rows of the raw data.
df.head(50)

In [None]:
# Print pandas' per-column overview (column names, non-null counts, dtypes).
df.info()

In [None]:
def check_weird_values(data):
    """
    Report every column of a DataFrame that cannot be cast to float.

    Each column is cast with ``astype(float)``. A column that casts cleanly is
    overwritten in ``data`` with its float version, so the frame passed in is
    modified. A column that raises ``ValueError`` is left untouched and a
    message naming the column and the error is printed.

    Parameters:
    data (pandas.DataFrame): The frame to inspect; converted in place.

    Returns:
    None
    """
    for col in data.columns:
        try:
            as_float = data[col].astype(float)
        except ValueError as err:
            print(f'could not convert data on column "{col}" with error {err}')
            continue
        # Only reached when the cast succeeded: store the float column back.
        data[col] = as_float

In [None]:
# Report columns of the raw data that cannot be cast to float
# (columns that can be cast are converted to float in df by this call).
check_weird_values(df)

In [None]:
def check_duplicated(data: pd.DataFrame) -> None:
    """
    Print whether a DataFrame contains fully duplicated rows.

    A row counts as duplicated when ``DataFrame.duplicated`` flags it, i.e.
    when it repeats an earlier row in every column.

    Parameters:
    data (pandas.DataFrame): The frame to inspect.

    Returns:
    None
    """
    num_duplicated = int(data.duplicated().sum())
    if not num_duplicated:
        print('data frame has no duplicated rows')
        return
    print(f'df has {num_duplicated} duplicated rows')

In [None]:
# Report whether the raw data contains fully duplicated rows.
check_duplicated(df)

In [None]:
# Report how many missing entries each column of df contains.
print("\nNumber of empty values in each column:", df.isna().sum(), sep="\n")

# Data Cleaning

In [None]:
def remove_rows_with_errors(input_df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop the rows whose values cannot be interpreted as numbers.

    Every column is cast to float. When a column cannot be cast as a whole, a
    message is printed and the offending rows are located with
    ``pd.to_numeric(..., errors='coerce')``: every row that ends up as NaN in
    such a column (unparseable text, but also an entry that was already
    missing) is marked for removal.

    The work is done on a copy, so ``input_df`` itself is not modified.

    Parameters:
    input_df (pandas.DataFrame): The frame to clean.

    Returns:
    pandas.DataFrame: A new frame without the marked rows, in which every
    column is float. The original index labels of the kept rows are preserved.
    """
    cleaned = input_df.copy()
    # Positional mask of the rows to drop; positional so that it also works
    # when the index contains repeated labels.
    bad_rows = np.zeros(len(cleaned), dtype=bool)

    for col in cleaned.columns:
        try:
            cleaned[col] = cleaned[col].astype(float)
        except ValueError as e:
            print(f'could not convert data on column "{col}" with error {e}')
            coerced = pd.to_numeric(cleaned[col], errors='coerce').astype(float)
            bad_rows |= coerced.isna().to_numpy()
            # Keep the numeric version so the surviving rows are float, like
            # the columns that converted without errors.
            cleaned[col] = coerced

    error_rows = np.unique(cleaned.index[bad_rows])
    print(f'removed rows are : {error_rows}')

    # Filter the frame that was passed in (via its copy), not a global one.
    return cleaned.loc[~bad_rows]

In [None]:
def remove_duplicates_from_dataframe(input_df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of a DataFrame without its fully duplicated rows.

    The first occurrence of each row is kept, together with its original
    index label. Uses ``DataFrame.drop_duplicates``, so the result agrees with
    what ``DataFrame.duplicated`` reports, column dtypes are preserved (also
    for an empty result), and rows that are identical including their missing
    values count as duplicates of each other.

    Parameters:
    input_df (pandas.DataFrame): The frame to de-duplicate; not modified.

    Returns:
    pandas.DataFrame: The rows of ``input_df`` with later repeats removed.
    """
    return input_df.drop_duplicates(keep='first')

In [None]:
def remove_rows_with_missing_values(input_df: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the rows in which every column holds a value.

    Parameters:
    input_df (pandas.DataFrame): The frame to filter; not modified.

    Returns:
    pandas.DataFrame: The rows of ``input_df`` that contain no missing value.
    """
    is_complete_row = input_df.notna().all(axis=1)
    return input_df[is_complete_row]

In [None]:
def clean_df(input_df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the full cleaning pipeline on a DataFrame.

    The steps are applied in this order, each one receiving the output of the
    previous one: remove rows with conversion errors, remove duplicated rows,
    remove rows with missing values.

    Parameters:
    input_df (pandas.DataFrame): The frame to clean.

    Returns:
    pandas.DataFrame: The frame returned by the last cleaning step.
    """
    cleaning_steps = (
        remove_rows_with_errors,
        remove_duplicates_from_dataframe,
        remove_rows_with_missing_values,
    )
    cleaned = input_df
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned

In [None]:
# Replace df with the cleaned frame returned by clean_df.
df = clean_df(df)

In [None]:
# Re-run both checks on the cleaned data. check_weird_values also converts
# every column that can be cast to float in place.
check_duplicated(df)
check_weird_values(df)

# Data Description

In [None]:
def global_describe(input_data):
    """
    Summarise the overall shape and column types of a pandas DataFrame.

    Parameters:
    input_data (pandas.DataFrame): The input DataFrame.

    Returns:
    dict: A dictionary with the keys 'num_rows' (number of rows),
    'num_columns' (number of columns) and 'column_types' (a dict mapping each
    column name to its dtype).
    """
    row_count, column_count = input_data.shape
    return {
        'num_rows': row_count,
        'num_columns': column_count,
        'column_types': input_data.dtypes.to_dict(),
    }

In [None]:
# Pretty-print the row count, column count and per-column dtypes of the cleaned data.
pprint(global_describe(df))

In [None]:
def custom_describe(input_df: pd.DataFrame):
    """
    Compute basic descriptive statistics for every column, without pandas helpers.

    For each column the values are taken as a plain Python list and the
    following are computed by hand:

    - 'max' / 'min': largest and smallest value.
    - 'mean': sum of the values divided by their count.
    - 'mode': the most frequent value (the first one encountered on ties).
    - 'median': middle value of the sorted list, or the average of the two
      middle values when the count is even.
    - 'std': square root of the mean squared deviation from the mean
      (the sum is divided by the number of values, not by that number minus one).
    - 'quantiles': a dict with '25%' and '75%' read from the sorted list at
      positions int(0.25 * n) and int(0.75 * n), and '50%' equal to the median.

    Parameters:
    input_df (pandas.DataFrame): The frame to describe.

    Returns:
    dict: Maps each column name to a dict with the keys listed above.
    """
    summary = {}

    for column in input_df.columns:
        values = input_df[column].tolist()
        ordered = sorted(values)

        largest = max(values)
        smallest = min(values)
        mean = sum(values) / len(values)

        # Most frequent value; Counter keeps first-seen order among ties.
        mode = Counter(values).most_common(1)[0][0]

        n = len(ordered)
        middle = n // 2
        if n % 2:
            median = ordered[middle]
        else:
            median = (ordered[middle - 1] + ordered[middle]) / 2

        # Mean squared deviation over all n values, then its square root.
        variance = sum((x - mean) ** 2 for x in values) / len(values)

        summary[column] = {
            'max': largest,
            'min': smallest,
            'mean': mean,
            'mode': mode,
            'median': median,
            'std': variance ** 0.5,
            'quantiles': {
                '25%': ordered[int(0.25 * n)],
                '50%': median,
                '75%': ordered[int(0.75 * n)],
            },
        }

    return summary

In [None]:
# Pretty-print the hand-computed per-column statistics of the cleaned data.
pprint(custom_describe(df))

In [None]:
# pandas' built-in summary of the same frame. Note: describe() reports the
# sample standard deviation (ddof=1), whereas custom_describe divides by n,
# so the two 'std' values are not expected to match exactly.
df.describe()

# Data Visualization

In [None]:
# Grid size for the plots below: two plots per row, rounding up when the
# number of columns is odd.
num_cols = df.shape[1]
num_rows = (num_cols + 1) // 2

In [None]:
# Draw one box plot per column on a two-column grid of subplots.
fig, axes = plt.subplots(nrows=num_rows, ncols=2, figsize=(12, 5 * num_rows))
fig.suptitle('Box plots of Data')
axes = axes.flatten()

for i, column in enumerate(df.columns):
    sns.boxplot(y=df[column], ax=axes[i])
    axes[i].set_title(column)

# The grid always has an even number of axes; hide those no column was drawn on.
for unused_ax in axes[len(df.columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Draw one histogram (with a KDE curve) per column on a two-column grid of subplots.
fig, axes = plt.subplots(nrows=num_rows, ncols=2, figsize=(15, 4 * num_rows))
fig.suptitle('Histograms of Data', y=1.02)

# Turn the 2-D grid of axes into a flat sequence so it can be indexed by column position.
axes = axes.flatten()

for i, col in enumerate(df.columns):
    sns.histplot(data=df[col], kde=True, ax=axes[i])
    axes[i].set_title(f'Histogram for {col}')

plt.tight_layout()
plt.show()

In [None]:
# Draw one scatter plot per column (value against row index) on a two-column grid.
fig, axes = plt.subplots(num_rows, 2, figsize=(15, num_rows * 4))
fig.suptitle('Scatter Plots of Data', y=1.02)

axes = axes.flatten()

for i, col in enumerate(df.columns):
    sns.scatterplot(data=df[col], ax=axes[i])
    # These are scatter plots, so label them as such (not as histograms).
    axes[i].set_title(f'Scatter plot for {col}')

# The grid always has an even number of axes; hide those no column was drawn on.
for unused_ax in axes[len(df.columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()