In [None]:
from ..features import build_features 
from ..encoders import encoders 
import seaborn as sns
from sklearn.model_selection import train_test_split
import pandas
import pickle
import os

Defining constants

In [None]:
# Root data directory (relative path); the load and save cells join
# sub-paths onto it with os.path.join.
DATA_PATH = "../data/"

Loading raw dataset

In [None]:
# Load the pickled raw dataset. Pickle streams are bytes, so the file must be
# opened in binary mode; the context manager closes the handle afterwards.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open(os.path.join(DATA_PATH, "compressed_data/category_dataset.pkl"), "rb") as dataset_file:
    applications = pickle.load(dataset_file)

Correlation Matrix

In [None]:
# Heatmap of the pairwise correlations between numeric columns.
# numeric_only=True is explicit because pandas >= 2.0 no longer drops
# non-numeric columns silently and raises on them instead.
# The trailing ';' hides the Axes repr so only the figure is shown.
sns.heatmap(
    applications.corr(numeric_only=True)
);

Splitting dataset

In [None]:
# Separate the predictors from the 'category' target column.
X, Y = applications.drop(columns=['category']), applications['category']

# Fixed seed so that Restart & Run All reproduces the same partitions.
RANDOM_STATE = 42

# First cut: 70% training, 30% held out.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    X, Y, test_size=0.3, random_state=RANDOM_STATE
)

# Second cut: divide the held-out 30% into test (60%) and validation (40%).
# The held-out rows - not the training rows - are split here, so the three
# sets are disjoint and no training sample leaks into evaluation.
x_test, x_validation, y_test, y_validation = train_test_split(
    x_holdout, y_holdout, test_size=0.4, random_state=RANDOM_STATE
)

# Re-attach the target column to each partition.
training_set = pandas.concat([x_train, y_train], axis=1)
validation_set = pandas.concat([x_validation, y_validation], axis=1)
testing_set = pandas.concat([x_test, y_test], axis=1)

# Sanity check: the three partitions together account for every input row.
assert len(training_set) + len(validation_set) + len(testing_set) == len(applications)

Detecting anomalies

In [None]:
# Number of missing values in each column of the training split.
training_set.isna().sum(axis=0)

In [None]:
# Number of missing values in each column of the validation split.
validation_set.isna().sum(axis=0)

In [None]:
# Number of missing values in each column of the testing split.
testing_set.isna().sum(axis=0)

Building Features

In [None]:
# Apply the shared feature-construction step to every split, calling it in
# training, validation, testing order.
training_set, validation_set, testing_set = (
    build_features.construct_features(split)
    for split in (training_set, validation_set, testing_set)
)

Encoding datasets

In [None]:
# Single DatasetEncoder instance, reused to encode all three splits.
encoder = encoders.DatasetEncoder()

In [None]:
# Encode every split with the same encoder instance. Calls are made in
# training, testing, validation order.
# NOTE(review): the encoder may keep state between calls, so the order is
# deliberately left as is - confirm against DatasetEncoder.
training_set, testing_set, validation_set = (
    encoder.encode_dataset(split)
    for split in (training_set, testing_set, validation_set)
)

Saving datasets

In [None]:
# Write each processed split to its CSV file under DATA_PATH.
for frame, relative_path in (
    (training_set, 'processed_data/training_set.csv'),
    (validation_set, 'processed_data/validation_set.csv'),
    (testing_set, 'processed_data/testing_set.csv'),
):
    frame.to_csv(os.path.join(DATA_PATH, relative_path))