In [1]:
import sys
sys.path.append("../")
sys.path.append("../../../")

In [2]:
from features import build_features
from sklearn.model_selection import train_test_split
import pandas
import pickle
import os
from text_classification import text_encoding

loading datasets..
encoding datasets..
saving datasets..


Defining constants

In [3]:
# Root data directory, expressed as a relative path. The raw pickle
# ("compressed_data/...") and the processed CSVs ("processed_data/...") are
# both resolved beneath it.
DATA_PATH = "../data"

Loading raw dataset

In [4]:
# Load the raw category dataset from its pickle file.
# The context manager guarantees the file handle is closed once loading
# finishes (the bare open(...) call previously left it open).
# NOTE: unpickling can execute arbitrary code -- only load files that come
# from a trusted source.
with open(os.path.join(DATA_PATH, "compressed_data/category_dataset.pkl"), mode='rb') as dataset_file:
    applications = pickle.load(dataset_file)

In [None]:
# Preview only the first rows rather than rendering the whole frame.
applications.head()

Splitting dataset

In [None]:
# Separate the feature columns from the 'category' target column.
X, Y = applications.drop(columns=['category']), applications['category']

# Fixed seed so that Restart & Run All reproduces the same partitions.
SPLIT_SEED = 42

# First split: 70% training, 30% held out.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    X, Y, test_size=0.3, random_state=SPLIT_SEED
)
# Second split: divide the HELD-OUT rows (not the training rows) into testing
# (60%) and validation (40%). Splitting the training rows here would make the
# testing and validation sets subsets of the training data.
x_test, x_validation, y_test, y_validation = train_test_split(
    x_holdout, y_holdout, test_size=0.4, random_state=SPLIT_SEED
)

# Re-attach the target column to each partition.
training_set = pandas.concat([x_train, y_train], axis=1)
validation_set = pandas.concat([x_validation, y_validation], axis=1)
testing_set = pandas.concat([x_test, y_test], axis=1)

# The three partitions must account for every row exactly once.
assert len(training_set) + len(validation_set) + len(testing_set) == len(applications)

Detecting anomalies (missing values per column)

In [None]:
# Count the missing values in each column of the training partition.
training_set.isna().sum()

In [None]:
# Count the missing values in each column of the validation partition.
validation_set.isna().sum()

In [None]:
# Count the missing values in each column of the testing partition.
testing_set.isna().sum()

Building Features

In [None]:
# Pass every partition through the project's feature-building step, in the
# order training -> validation -> testing, and rebind each name to the result.
training_set, validation_set, testing_set = [
    build_features.build_features(partition)
    for partition in (training_set, validation_set, testing_set)
]

Encoding training set using TF/IDF Vectors

In [None]:
# Collect the distinct category labels present in the training partition.
training_categories = training_set['category'].unique()
# Wrap the training partition in the project's TF-IDF dataset encoder.
encoder = text_encoding.TFIDFVectorizedDataset(text_data=training_set)

# Encode the documents one category at a time.
for category in training_categories:
    encoder.encode_categorical_documents(category=category)
# Replace the training partition with the encoder's resulting dataframe.
training_set = encoder.get_dataframe()

Encoding testing set using TF/IDF Vectors

In [None]:
# Collect the distinct category labels present in the testing partition.
testing_categories = testing_set['category'].unique()
# Wrap the testing partition in the project's TF-IDF dataset encoder.
# NOTE(review): a fresh encoder is constructed from the testing data here; if
# TFIDFVectorizedDataset fits its vocabulary on text_data, the encoded columns
# may not line up with those of other partitions -- confirm.
encoder = text_encoding.TFIDFVectorizedDataset(text_data=testing_set)

# Encode the documents one category at a time.
for category in testing_categories:
    encoder.encode_categorical_documents(category=category)
# Replace the testing partition with the encoder's resulting dataframe.
testing_set = encoder.get_dataframe()

Encoding validation set using TF/IDF Vectors

In [None]:
# Collect the distinct category labels present in the validation partition.
validation_categories = validation_set['category'].unique()
# Wrap the validation partition in the project's TF-IDF dataset encoder.
encoder = text_encoding.TFIDFVectorizedDataset(text_data=validation_set)

# Iterate over the validation partition's own categories; looping over the
# testing categories could request labels this encoder's data does not have
# and skip labels that it does.
for category in validation_categories:
    encoder.encode_categorical_documents(category=category)
# Store the result in validation_set. Assigning it to testing_set would leave
# the validation partition unencoded and overwrite the encoded testing data.
validation_set = encoder.get_dataframe()

Saving datasets

In [None]:
# Write each partition to <DATA_PATH>/processed_data as CSV.
processed_dir = os.path.join(DATA_PATH, 'processed_data')
# Create the output directory up front so to_csv does not fail on a fresh
# checkout where it does not exist yet.
os.makedirs(processed_dir, exist_ok=True)

training_set.to_csv(os.path.join(processed_dir, 'training_set.csv'))
validation_set.to_csv(os.path.join(processed_dir, 'validation_set.csv'))
testing_set.to_csv(os.path.join(processed_dir, 'testing_set.csv'))