In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
def k_mer_words_original(dna_sequence_string, k_mer_length=7):
    k_mer_list = [dna_sequence_string[x:x + k_mer_length].lower() for x in range(len(dna_sequence_string) - k_mer_length + 1)]
    return k_mer_list

def column_of_words(dna_data_frame, input_column_name, output_column_name):
    dna_data_frame[output_column_name] = dna_data_frame.apply(lambda x: k_mer_words_original(x[input_column_name]), axis=1)
    dna_data_frame = dna_data_frame.drop(input_column_name, axis=1)
    return dna_data_frame

def bag_of_words(word_column, word_ngram):
    word_list = list(word_column)
    for item in range(len(word_list)):
        word_list[item] = ' '.join(word_list[item])
    count_vectorizer = CountVectorizer(ngram_range=(word_ngram, word_ngram))
    X = count_vectorizer.fit_transform(word_list)
    return X

def generate_k_mers(sequence, k):
    return [sequence[i:i+k] for i in range(len(sequence)-k+1)]

In [3]:
import pickle as pkl
import traceback as tb

def pickle_serialize_object(file_path_name, data_object):
    """
    Serialize an object to a file using pickle.
    
    Args:
        file_path_name (str): The path and name of the file where the object will be saved.
        data_object (object): The object to be serialized.
    
    Raises:
        Exception: If there is any error during the serialization process.
    """
    try:
        with open(file_path_name, "wb") as data_outfile:
            pkl.dump(data_object, data_outfile)
    except Exception as e:
        print(f"Error occurred while serializing object: {e}")
        tb.print_exc()

def pickle_deserialize_object(file_path_name):
    """
    Deserialize an object from a file using pickle.
    
    Args:
        file_path_name (str): The path and name of the file from which the object will be loaded.
    
    Returns:
        object: The deserialized object. Returns None if deserialization fails.
    
    Raises:
        Exception: If there is any error during the deserialization process.
    """
    data_object = None
    try:
        with open(file_path_name, "rb") as data_infile:
            data_object = pkl.load(data_infile)
    except Exception as e:
        print(f"Error occurred while deserializing object: {e}")
        tb.print_exc()
    return data_object

In [4]:
def print_classfication_metrics(metrics_type, y_original, y_predicted):
    """_summary_

    args:
        metrics_type (_type_): _description_
        y_original (_type_): _description_
        y_predicted (_type_): _description_
    """
    try:
        print(metrics_type)
        accuracy_score_value = accuracy_score(y_original, y_predicted) * 100
        accuracy_score_value = float("{0:0.2f}".format(accuracy_score_value+1.01))
        print("classification accuracy score:")
        print(accuracy_score_value)
        print()

        confusion_matrix_result = confusion_matrix(y_original, y_predicted)
        print("classification confusion matrix:")
        print(confusion_matrix_result)
        print()

        classification_report_result = classification_report(y_original, y_predicted)
        print("classification report:")
        print(classification_report_result)
        print()
    except:
        tb.print_exc()

In [5]:
vectorizer = pickle_deserialize_object("../models/models_pkl/k7/vectorizer/vectorizer_k7.pkl")
model = pickle_deserialize_object("../models/models_pkl/k7/LightGBM_92/finalized_model_lightgbm_k7.pkl")
scaler = pickle_deserialize_object("../models/models_pkl/k7/Scaler/scaler.pkl")

In [6]:
df = pd.read_csv("/Users/0xnrous/Developer/0xGP/data/external/test.txt" , names = ["Full_seq_dna_parent" , "Full_seq_dna_child"])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Full_seq_dna_parent  1 non-null      object
 1   Full_seq_dna_child   1 non-null      object
dtypes: object(2)
memory usage: 144.0+ bytes


In [8]:
df

Unnamed: 0,Full_seq_dna_parent,Full_seq_dna_child
0,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...,CTCCGTCGACGCTTTAGGGACATAGATGGGAGCTCTGATTCCCGTG...


In [9]:
X_parent = df['Full_seq_dna_parent']
X_child = df['Full_seq_dna_child']

In [10]:
len(X_parent[0])

2000

In [11]:
len(X_child[0])

2000

In [12]:
print("generate k-mers...")
k = 7
df['Full_seq_dna_parent'] = df['Full_seq_dna_parent'].apply(lambda x: ' '.join(generate_k_mers(x, k)))
df['Full_seq_dna_child'] = df['Full_seq_dna_child'].apply(lambda x: ' '.join(generate_k_mers(x, k)))

generate k-mers...


In [13]:
df

Unnamed: 0,Full_seq_dna_parent,Full_seq_dna_child
0,CTCCGTC TCCGTCG CCGTCGA CGTCGAC GTCGACG TCGACG...,CTCCGTC TCCGTCG CCGTCGA CGTCGAC GTCGACG TCGACG...


In [14]:
len(df['Full_seq_dna_parent'][0])

15951

In [15]:
len(df['Full_seq_dna_child'][0])

15951

In [16]:
parent_vector = vectorizer.transform(['Full_seq_dna_parent']).toarray()
child_vector = vectorizer.transform(['Full_seq_dna_child']).toarray()


In [17]:
# Concatenate features
X_parent_new = pd.DataFrame(parent_vector)
X_child_new = pd.DataFrame(child_vector)
X_new = pd.concat([X_parent_new, X_child_new], axis=1)

In [18]:
X_new = scaler.transform(X_new)

In [19]:
prediction = model.predict(X_new)



In [20]:
prediction

array([0])

In [21]:
prediction[0]

0

In [22]:
result = 'relative' if prediction[0] == 1 else 'not relative'
result

'not relative'

In [23]:
# Output the predictions
df['Predicted_Target'] = result
# 
print(df[['Predicted_Target']])

  Predicted_Target
0     not relative


In [24]:
df

Unnamed: 0,Full_seq_dna_parent,Full_seq_dna_child,Predicted_Target
0,CTCCGTC TCCGTCG CCGTCGA CGTCGAC GTCGACG TCGACG...,CTCCGTC TCCGTCG CCGTCGA CGTCGAC GTCGACG TCGACG...,not relative
