In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from feature_extraction import ConventionalFeatures, MurugaiahFeatures
import utils, os

In [None]:
# Load the project settings and pull out the directories this notebook uses.
settings = utils.load_settings()
cleaned_data_dir = settings["cleaned_data_dir"]  # where cleaned_dataset.parquet is read from
plots_dir = settings["plots_dir"]
features_dir = settings["features_dir"]  # destination of every feature Parquet file

# pq.write_table does not create missing parent directories, so make sure the
# output directory exists before any feature cell tries to write into it.
# An empty value is skipped: os.path.join then resolves to the working directory.
if features_dir:
    os.makedirs(features_dir, exist_ok=True)

In [None]:
# Read the cleaned dataset; the feature cells below use its 'sequence' column.
cleaned_dataset_path = os.path.join(cleaned_data_dir, "cleaned_dataset.parquet")
df = pd.read_parquet(cleaned_dataset_path)

# k-mer counting

In [None]:
# k-mer features for k=6 over every sequence, requested with normalize=False.
kmer_6_extractor = ConventionalFeatures()
kmer_features_6 = kmer_6_extractor.extract_kmers_features(
    df['sequence'],
    k=6,
    normalize=False,
)

In [None]:
# Wrap the 6-mer features in a DataFrame, convert to an Arrow table and
# write it to the features directory as Parquet.
kmer_6_frame = pd.DataFrame(kmer_features_6)
kmer_6_path = os.path.join(features_dir, "kmer_features_6.parquet")
table = pa.table(kmer_6_frame)
pq.write_table(table, kmer_6_path)

In [None]:
# k-mer features for k=5 over every sequence, requested with normalize=False.
kmer_5_extractor = ConventionalFeatures()
kmer_features_5 = kmer_5_extractor.extract_kmers_features(
    df['sequence'],
    k=5,
    normalize=False,
)

In [None]:
# Wrap the 5-mer features in a DataFrame, convert to an Arrow table and
# write it to the features directory as Parquet.
kmer_5_frame = pd.DataFrame(kmer_features_5)
kmer_5_path = os.path.join(features_dir, "kmer_features_5.parquet")
table = pa.table(kmer_5_frame)
pq.write_table(table, kmer_5_path)

# Frequency Chaos Game Representation

In [None]:
# FCGR features for every sequence, requested at resolution=128.
fcgr_extractor = ConventionalFeatures()
fcgr_features = fcgr_extractor.extract_fcgr_features(
    df['sequence'],
    resolution=128,
)

In [None]:
# Flatten the FCGR output to two dimensions (one row per sequence) so it can be
# stored as a table. The row count is derived from the dataset instead of the
# previously hard-coded 42037, so this cell keeps working when the cleaned
# dataset changes size.
# NOTE(review): assumes extract_fcgr_features yields one entry per input
# sequence, in order — confirm against feature_extraction.
n_sequences = len(df['sequence'])
fcgr_flat = fcgr_features.reshape((n_sequences, -1))

table = pa.table(pd.DataFrame(fcgr_flat))
pq.write_table(table, os.path.join(features_dir, "fcgr_features.parquet"))

# Murugaiah and Ganesan Features

In [None]:
# Run the MurugaiahFeatures extractor over every sequence in the dataset.
murugaiah_extractor = MurugaiahFeatures()
murugaiah_features = murugaiah_extractor.extract(df['sequence'])

In [None]:
# Wrap the Murugaiah features in a DataFrame, convert to an Arrow table and
# write it to the features directory as Parquet.
murugaiah_frame = pd.DataFrame(murugaiah_features)
murugaiah_path = os.path.join(features_dir, "murugaiah_features.parquet")
table = pa.table(murugaiah_frame)
pq.write_table(table, murugaiah_path)