# Initial Data Analysis

---

This notebook presents the Initial Data Analysis of the following dataset:

In [None]:
import os
# Name of the dataset under analysis, taken from the DATASET environment
# variable (None when the variable is not set). The bare expression below
# echoes the value as the cell output.
DATASET = os.getenv("DATASET")
DATASET

### Imports

In [None]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
%matplotlib inline

In [None]:
# Switch the working directory to the hard-coded '/code' path before the
# project-local imports in the next cell run.
# NOTE(review): presumably this is what makes `constants`, `params`,
# `randomness` and `ida` importable -- confirm. The absolute path ties the
# notebook to one specific filesystem layout.
os.chdir('/code')

In [None]:
from constants import PROCESSED_DATA_DIR
from params import params
from randomness import get_random_seed
from ida import describe_column

In [None]:
# Entry of the project's `params` object looked up by the dataset name.
PARAMS = params[DATASET]
# Seed returned by the project's `get_random_seed` helper for this dataset.
# NOTE(review): presumably used to make later stochastic steps reproducible -- confirm.
SEED = get_random_seed(DATASET)

### Data loading

We load the **training part** of the dataset.

In [None]:
# Read "<DATASET>_train.csv" from the processed-data directory, then split the
# frame into the feature columns (X) and the "class_" label column (y).
data = pd.read_csv(
    os.path.join(PROCESSED_DATA_DIR, DATASET + "_train.csv")
)
X = data.drop(columns="class_")
y = data["class_"]

In [None]:
# Preview the first 10 rows of the full training frame (features plus "class_").
data.head(10)

In [None]:
# Preview the first 10 rows of the feature frame X (data without "class_").
X.head(10)

In [None]:
# Preview the first 10 labels of the "class_" column.
y.head(10)

### General information

In [None]:
# Basic dimensions of the loaded split: rows/columns of the feature frame and
# the number of distinct values found in the label column.
n_rows, n_cols = X.shape
n_classes = len(y.unique())

print(
    f"Number of samples in analyzed subset: {n_rows}.",
    f"Number of features: {n_cols}.",
    f"Number of classes: {n_classes}.",
    sep="\n",
)

In [None]:
# Bar chart with one bar per value of "class_", showing how many training
# rows carry each label. The title lets the figure stand alone when the
# notebook is skimmed; the trailing ';' keeps the returned object's repr out
# of the cell output.
sns.countplot(data=data, x="class_").set_title("Class distribution (training set)");

### Columns description

In [None]:
# Build one text report covering every feature column: a "COLUMN: <name>"
# header, the text returned by the project's `describe_column` helper, and a
# dashed divider. Pieces are collected in a list and joined once at the end.
divider = '\n' + '-'*10 + '\n'
pieces = []

for column_name, column in X.items():
    pieces += [
        f'COLUMN: {column_name}',
        '\n',
        describe_column(column=column, classes=y, dataset_name=DATASET),
        divider,
    ]

desc = ''.join(pieces)
print(desc)