In [None]:
import os
import sys
# Put the parent of the current working directory on the import path.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate entries in sys.path.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from feature_extraction import ConventionalFeatures, MurugaiahFeatures
import utils, os

In [2]:
# Load the settings file and pull out the three directory entries it provides.
settings = utils.load_settings(path="settings_sample.json")
cleaned_data_dir, plots_dir, features_dir = (
    settings["cleaned_data_dir"],
    settings["plots_dir"],
    settings["features_dir"],
)

In [3]:
# Read the cleaned dataset (Parquet) from the configured cleaned-data directory.
cleaned_dataset_path = os.path.join(cleaned_data_dir, "cleaned_dataset.parquet")
df = pd.read_parquet(cleaned_dataset_path)

In [4]:
# Show the loaded frame as this cell's output (sanity check of columns and rows).
df

Unnamed: 0,Accession ID,lineage,col_date,country,sub_date,date,sequence,full_lineage,lineage_hierarchy
0,EPI_ISL_1628772,B.1.1.353,2021-02-02,Reunion,2021-04-17,2021-02-02,AGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTC...,B.1.1.353,"[B, B.1, B.1.1, B.1.1.353]"
1,EPI_ISL_1166942,B.1.160.18,2021-01-18,Mayotte,2021-03-05,2021-01-18,AGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTC...,B.1.160.18,"[B, B.1, B.1.160, B.1.160.18]"
2,EPI_ISL_1018072,R.1,2021-01-10,Ghana,2021-02-12,2021-01-10,ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,B.1.1.316.1,"[B, B.1, B.1.1, B.1.1.316, B.1.1.316.1]"
3,EPI_ISL_729972,B.1.1,2020-07-07,Nigeria,2020-12-23,2020-07-07,ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...,B.1.1,"[B, B.1, B.1.1]"
4,EPI_ISL_1700676,B.1.351,2021-01-12,Equatorial Guinea,2021-04-23,2021-01-12,AGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTC...,B.1.351,"[B, B.1, B.1.351]"
...,...,...,...,...,...,...,...,...,...
40557,EPI_ISL_15755653,B.1.351,2020-12-28,South Africa,2022-11-13,2020-12-28,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,B.1.351,"[B, B.1, B.1.351]"
40558,EPI_ISL_15722311,BA.5.11,2022-10-17,South Africa,2022-11-09,2022-10-17,TTGTAGATCTGTTCTCTAAACGAACTAGGTTCGCGACGTGCTCGTA...,B.1.1.529.5.11,"[B, B.1, B.1.1, B.1.1.529, B.1.1.529.5, B.1.1...."
40559,EPI_ISL_15722308,BA.5.11,2022-10-25,South Africa,2022-11-09,2022-10-25,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,B.1.1.529.5.11,"[B, B.1, B.1.1, B.1.1.529, B.1.1.529.5, B.1.1...."
40560,EPI_ISL_15722305,BA.5.11,2022-10-26,South Africa,2022-11-09,2022-10-26,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...,B.1.1.529.5.11,"[B, B.1, B.1.1, B.1.1.529, B.1.1.529.5, B.1.1...."


# k-mer counting

In [11]:
# Extract 6-mer features from every sequence with normalize=False.
# k must agree with the "6" in this variable's name and in the
# "kmer_features_6.parquet" file the result is saved to; passing a smaller k
# here would store mislabeled features.
# NOTE(review): this cell calls `extract_kmers_features_` (trailing underscore)
# while the k=5 cell calls `extract_kmers_features` — confirm which variant is intended.
kmer_features_6 = ConventionalFeatures().extract_kmers_features_(df['sequence'], k=6, normalize=False)

Processing sequences: 100%|██████████| 42037/42037 [00:00<00:00, 369269.73it/s]


In [None]:
# Save the 6-mer features as a Parquet file inside the features directory.
kmer6_path = os.path.join(features_dir, "kmer_features_6.parquet")
table = pa.table(pd.DataFrame(kmer_features_6))
pq.write_table(table, kmer6_path)

In [None]:
# Extract 5-mer features from every sequence with normalize=False.
kmer5_extractor = ConventionalFeatures()
kmer_features_5 = kmer5_extractor.extract_kmers_features(df['sequence'], k=5, normalize=False)

In [None]:
# Wrap the 5-mer features in a DataFrame, convert it to an Arrow table,
# and write that table to the features directory as Parquet.
kmer5_frame = pd.DataFrame(kmer_features_5)
table = pa.table(kmer5_frame)
pq.write_table(table, where=os.path.join(features_dir, "kmer_features_5.parquet"))

# Frequency Chaos Game Representation

In [None]:
# Compute FCGR features for every sequence at resolution=128.
fcgr_extractor = ConventionalFeatures()
fcgr_features = fcgr_extractor.extract_fcgr_features(df['sequence'], resolution=128)

In [None]:
# Flatten the FCGR output to one row per input sequence and save it as Parquet.
# The row count comes from the DataFrame the features were extracted from,
# not a hard-coded dataset size, so this cell keeps working when the
# cleaned dataset changes.
n_sequences = len(df)
fcgr_flat = fcgr_features.reshape((n_sequences, -1))
table = pa.table(pd.DataFrame(fcgr_flat))
pq.write_table(table, os.path.join(features_dir, "fcgr_features.parquet"))

# Murugaiah and Ganesan Features

In [5]:
# Run the Murugaiah feature extractor over every sequence.
murugaiah_extractor = MurugaiahFeatures()
murugaiah_features = murugaiah_extractor.extract_features(df['sequence'])

In [6]:
# Convert the Murugaiah features to an Arrow table and store them as Parquet
# in the features directory.
murugaiah_path = os.path.join(features_dir, "murugaiah_features.parquet")
murugaiah_frame = pd.DataFrame(murugaiah_features)
table = pa.table(murugaiah_frame)
pq.write_table(table, murugaiah_path)