In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from feature_extraction import ConventionalFeatures, MurugaiahFeatures
import utils, os

# IPython magic: set the inline plotting backend's figure format to SVG.
%config InlineBackend.figure_format = 'svg'

In [2]:
# Load the project settings once, then pull out the three directories this
# notebook works with in a single assignment.
settings = utils.load_settings()
cleaned_data_dir, plots_dir, features_dir = (
    settings["cleaned_data_dir"],
    settings["plots_dir"],
    settings["features_dir"],
)

In [3]:
# The cleaned dataset lives in the configured cleaned-data directory.
cleaned_dataset_path = os.path.join(cleaned_data_dir, "cleaned_dataset.parquet")
df = pd.read_parquet(cleaned_dataset_path)

# k-mer counting

In [12]:
# k-mer features with k=6 and normalize=False for the 'sequence' column;
# the shape is printed as a quick sanity check.
kmer_extractor_6 = ConventionalFeatures()
kmer_features_6 = kmer_extractor_6.extract_kmers_features(
    df['sequence'],
    k=6,
    normalize=False,
)
print(kmer_features_6.shape)

(1000, 4096)


In [10]:
# Persist the k=6 features: wrap them in a DataFrame, convert to an Arrow
# table, and write it as Parquet into the features directory.
kmer6_frame = pd.DataFrame(kmer_features_6)
table = pa.table(kmer6_frame)
kmer6_path = os.path.join(features_dir, "kmer_features_6.parquet")
pq.write_table(table, kmer6_path)

In [11]:
# k-mer features with k=5 and normalize=False for the 'sequence' column;
# the shape is printed as a quick sanity check.
kmer_extractor_5 = ConventionalFeatures()
kmer_features_5 = kmer_extractor_5.extract_kmers_features(
    df['sequence'],
    k=5,
    normalize=False,
)
print(kmer_features_5.shape)

(43167, 1024)


In [12]:
# Persist the k=5 features: wrap them in a DataFrame, convert to an Arrow
# table, and write it as Parquet into the features directory.
kmer5_frame = pd.DataFrame(kmer_features_5)
table = pa.table(kmer5_frame)
kmer5_path = os.path.join(features_dir, "kmer_features_5.parquet")
pq.write_table(table, kmer5_path)

# Frequency Chaos Game Representation

In [4]:
# FCGR features at resolution=128 for the 'sequence' column; the shape is
# printed as a quick sanity check.
fcgr_extractor = ConventionalFeatures()
fcgr_features = fcgr_extractor.extract_fcgr_features(
    df['sequence'],
    resolution=128,
)
print(fcgr_features.shape)

(1000, 128, 128)


In [None]:
# fcgr_features is a stack of 2-D matrices (its printed shape is 3-D), and
# pd.DataFrame only accepts 1-D/2-D input, so passing it directly raises
# "ValueError: Must pass 2-d input". Flatten every matrix into a single row
# before building the frame.
# NOTE(review): assumes fcgr_features is a NumPy-style array exposing
# .shape/.reshape — confirm against ConventionalFeatures.extract_fcgr_features.
fcgr_count, fcgr_rows, fcgr_cols = fcgr_features.shape
fcgr_flat = fcgr_features.reshape(fcgr_count, fcgr_rows * fcgr_cols)

# Each Parquet row holds one flattened matrix; after loading, restore the
# original layout with `.reshape(-1, fcgr_rows, fcgr_cols)`.
table = pa.table(pd.DataFrame(fcgr_flat))
pq.write_table(table, os.path.join(features_dir, "fcgr_features.parquet"))

# Murugaiah and Ganesan Features

In [11]:
# Run the MurugaiahFeatures extractor over the 'sequence' column.
murugaiah_extractor = MurugaiahFeatures()
murugaiah_features = murugaiah_extractor.extract(df['sequence'])

In [23]:
# Persist the Murugaiah features: wrap them in a DataFrame, convert to an
# Arrow table, and write it as Parquet into the features directory.
murugaiah_frame = pd.DataFrame(murugaiah_features)
table = pa.table(murugaiah_frame)
murugaiah_path = os.path.join(features_dir, "murugaiah_features.parquet")
pq.write_table(table, murugaiah_path)