# 01 - Import


## Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml

# Notebook-wide pandas display options: never truncate columns and use a wide
# console. The commented entry can be enabled to stop truncating rows as well.
for option, value in (
    ('display.max_columns', None),
    ('display.width', 1000),
    # ('display.max_rows', None),
):
    pd.set_option(option, value)

sns.set_style("darkgrid")

from IPython.display import display, Markdown
from pprint import pprint

# Global switches available to the rest of the notebook.
DEBUG, SEED = True, 666

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# File name of the training data; on Colab it also names the per-dataset
# folder on Google Drive (spaces replaced by underscores).
DATASET = "train.csv"

import os, sys
# True when the notebook is running inside Google Colab.
COLAB = 'google.colab' in sys.modules
# Base directory that every path in this notebook is built from.
ROOT = "./"

if COLAB:
  from google.colab import drive
  # Mount Google Drive only if it is not mounted yet.
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  # os.makedirs creates every missing parent (including the `datasets` folder),
  # so no separate step is needed for it. The previous code joined "./" with an
  # absolute path and created a stray relative directory tree instead.
  os.makedirs(ROOT, exist_ok=True)


def makedirs(d):
  """Create directory `d` below ROOT, including missing parents, unless it already exists."""
  # 0o777 is also the default mode of os.makedirs, and exist_ok=True makes the
  # call a no-op for an existing directory, so a single call serves both the
  # Colab and the local case.
  os.makedirs(ROOT + d, mode=0o777, exist_ok=True)

# Make sure the three working folders exist below ROOT.
for subdir in ('orig', 'data', 'output'):
  makedirs(subdir)

## Import Data

In [None]:
# Read the training data from the `orig` folder below ROOT, the same base
# directory the notebook creates its folders in and writes its outputs to.
# With the local default ROOT ("./") this is the same path as before.
df = pd.read_csv(ROOT + "orig/" + DATASET)
print(df.shape)
df.head()

In [None]:
# Read the scoring data from the `orig` folder below ROOT, the same base
# directory the notebook creates its folders in and writes its outputs to.
# With the local default ROOT ("./") this is the same path as before.
df_score = pd.read_csv(ROOT + "orig/test.csv")
print(df_score.shape)
df_score.head()

## Targets and Features Preprocessing

In [None]:
# Name of the label column in the training frame.
target = 'Target'
# Every column of `df` except the target, kept as a pandas Index of labels.
# Index.drop raises KeyError if the target column is not present.
features = df.columns.drop(target)

### Target

Drop all rows where the target is missing and set the target column as Category type.

In [None]:
# Keep only rows that have a target value, then store the target as a
# categorical column.
df = (
    df
    .dropna(subset=[target])
    .astype({target: 'category'})
)

### Object type features

Features of type `object` are text-based. On inspection, each value is a number in the format "{Column_Name}_{Number}", so we can strip the column-name prefix and keep only the numeric value.

In [None]:
# Feature columns that pandas loaded with dtype `object`.
object_features = [col for col in features if df[col].dtype == 'object' and col != target]

for col in object_features:
    prefix = f"{col}_"
    # Apply the same conversion to the train and the score dataset.
    for frame in (df, df_score):
        # regex=False strips the prefix as literal text; without it, pandas
        # versions before 2.0 interpret the column name as a regular expression.
        stripped = frame[col].str.replace(prefix, "", regex=False)
        frame[col] = pd.to_numeric(stripped)

### Categorical Features

Set columns that have fewer than 10 unique values as categorical features.

In [None]:
# Feature columns with fewer than 10 distinct values in the training data.
categorical_features = [
    name
    for name in features
    if name != target and df[name].nunique() < 10
]

# Convert those columns in the train dataset first, then in the score dataset.
# NOTE(review): categories are inferred from each frame separately, so the two
# frames may end up with different category sets for the same column.
for frame in (df, df_score):
    for name in categorical_features:
        frame[name] = pd.Categorical(frame[name])

### Numerical Features

Set all the other columns that are not categorical as numerical features.

In [None]:
# Every feature that was not classified as categorical.
numerical_features = [
    name
    for name in features
    if not (name in categorical_features or name == target)
]

# Coerce those columns to a numeric dtype in the train dataset first, then in
# the score dataset.
for frame in (df, df_score):
    for name in numerical_features:
        frame[name] = pd.to_numeric(frame[name])

## Save Features and Target

In [None]:
# Write the target name and the feature groupings to data/features.yaml below ROOT.
with open(ROOT + 'data/features.yaml', 'w') as file:
    # When given a stream, yaml.dump writes to it and returns None, so the
    # result is not bound to a name.
    yaml.dump({
        "target": target,
        # `features` is a pandas Index; convert it to a plain list for YAML.
        "features": features.tolist(),
        "categorical_features": categorical_features,
        "numerical_features": numerical_features,
    }, file)

## Save Data

Save this raw df with the data as a pickle file for later use (EDA, Preprocessing, feature engineering, modeling).

In [None]:
# Write the training frame `df` as a pickle to data/df_orig.pkl below ROOT.
df.to_pickle(ROOT + "data/df_orig.pkl")

## Scoring Data

Save to a pickle file for later use.

In [None]:
# Write the scoring frame `df_score` as a pickle to data/df_score.pkl below ROOT.
df_score.to_pickle(ROOT + "data/df_score.pkl")

# Notes