In [1]:
# Source of the DS: https://archive.ics.uci.edu/dataset/577/codon+usage

In [2]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix


from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load the DS
df = pd.read_csv('datasets/codon_usage.csv')
df.head()

  df = pd.read_csv('datasets/codon_usage.csv')


Unnamed: 0,Kingdom,DNAtype,SpeciesID,Ncodons,SpeciesName,UUU,UUC,UUA,UUG,CUU,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,vrl,0,100217,1995,Epizootic haematopoietic necrosis virus,0.01654,0.01203,0.0005,0.00351,0.01203,...,0.00451,0.01303,0.03559,0.01003,0.04612,0.01203,0.04361,0.00251,0.0005,0.0
1,vrl,0,100220,1474,Bohle iridovirus,0.02714,0.01357,0.00068,0.00678,0.00407,...,0.00136,0.01696,0.03596,0.01221,0.04545,0.0156,0.0441,0.00271,0.00068,0.0
2,vrl,0,100755,4862,Sweet potato leaf curl virus,0.01974,0.0218,0.01357,0.01543,0.00782,...,0.00596,0.01974,0.02489,0.03126,0.02036,0.02242,0.02468,0.00391,0.0,0.00144
3,vrl,0,100880,1915,Northern cereal mosaic virus,0.01775,0.02245,0.01619,0.00992,0.01567,...,0.00366,0.0141,0.01671,0.0376,0.01932,0.03029,0.03446,0.00261,0.00157,0.0
4,vrl,0,100887,22831,Soil-borne cereal mosaic virus,0.02816,0.01371,0.00767,0.03679,0.0138,...,0.00604,0.01494,0.01734,0.04148,0.02483,0.03359,0.03679,0.0,0.00044,0.00131


In [4]:
df.info()
# that is a huge ds

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13028 entries, 0 to 13027
Data columns (total 69 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Kingdom      13028 non-null  object 
 1   DNAtype      13028 non-null  int64  
 2   SpeciesID    13028 non-null  int64  
 3   Ncodons      13028 non-null  int64  
 4   SpeciesName  13028 non-null  object 
 5   UUU          13028 non-null  object 
 6   UUC          13028 non-null  object 
 7   UUA          13028 non-null  float64
 8   UUG          13028 non-null  float64
 9   CUU          13028 non-null  float64
 10  CUC          13028 non-null  float64
 11  CUA          13028 non-null  float64
 12  CUG          13028 non-null  float64
 13  AUU          13028 non-null  float64
 14  AUC          13028 non-null  float64
 15  AUA          13028 non-null  float64
 16  AUG          13028 non-null  float64
 17  GUU          13028 non-null  float64
 18  GUC          13028 non-null  float64
 19  GUA 

In [5]:
df.columns

Index(['Kingdom', 'DNAtype', 'SpeciesID', 'Ncodons', 'SpeciesName', 'UUU',
       'UUC', 'UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG', 'AUU', 'AUC', 'AUA',
       'AUG', 'GUU', 'GUC', 'GUA', 'GUG', 'GCU', 'GCC', 'GCA', 'GCG', 'CCU',
       'CCC', 'CCA', 'CCG', 'UGG', 'GGU', 'GGC', 'GGA', 'GGG', 'UCU', 'UCC',
       'UCA', 'UCG', 'AGU', 'AGC', 'ACU', 'ACC', 'ACA', 'ACG', 'UAU', 'UAC',
       'CAA', 'CAG', 'AAU', 'AAC', 'UGU', 'UGC', 'CAU', 'CAC', 'AAA', 'AAG',
       'CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG', 'GAU', 'GAC', 'GAA', 'GAG',
       'UAA', 'UAG', 'UGA'],
      dtype='object')

In [6]:
# Let's check the duplicated rows
print(f'DS shape:',df.shape)
number_of_duplicated_rows =df.duplicated().sum()
print(f'Number of duplicated rows:{number_of_duplicated_rows}')
# I dont have duplicated rows

DS shape: (13028, 69)
Number of duplicated rows:0


In [7]:
# let's check do I have columns with only one unique value
unique_values = df.columns[df.nunique() == 1]
unique_values

Index([], dtype='object')

In [8]:
df["Kingdom"].value_counts().sort_index()

Kingdom
arc     126
bct    2920
inv    1345
mam     572
phg     220
plm      18
pln    2523
pri     180
rod     215
vrl    2832
vrt    2077
Name: count, dtype: int64

In [9]:
# Kingdom colum shows biological groups,
# bct = bacteria, vrl = virus
# pln = plant
# inv = invertebrate, mam = mammal 
# To understand better those abbreviations, I used QWEN 3MAX.

In [10]:
#  I am going to drop those classes which have few samples.
df= df[~df["Kingdom"].isin(["arc", "inv", "mam", "phg", "plm",  "pri", "rod"])]
df.shape

(10352, 69)

In [11]:
# Basic description of the DS:
# SpeciesID is kid of ID number for organism and SpeciesName is the name of the entery organism
duplicate_species_id= int(df["SpeciesID"].duplicated().sum())
duplicate_species_name =int(df["SpeciesName"].duplicated().sum())
print("DUPLICATES SpeciesID:",duplicate_species_id)
print("DUPLICATES SpeciesName:",duplicate_species_name)


# Having ID number and Species name is cheat code for model.
df = df.drop(columns=['SpeciesID','SpeciesName' , 'DNAtype'])

DUPLICATES SpeciesID: 408
DUPLICATES SpeciesName: 12


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10352 entries, 0 to 12060
Data columns (total 66 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Kingdom  10352 non-null  object 
 1   Ncodons  10352 non-null  int64  
 2   UUU      10352 non-null  object 
 3   UUC      10352 non-null  object 
 4   UUA      10352 non-null  float64
 5   UUG      10352 non-null  float64
 6   CUU      10352 non-null  float64
 7   CUC      10352 non-null  float64
 8   CUA      10352 non-null  float64
 9   CUG      10352 non-null  float64
 10  AUU      10352 non-null  float64
 11  AUC      10352 non-null  float64
 12  AUA      10352 non-null  float64
 13  AUG      10352 non-null  float64
 14  GUU      10352 non-null  float64
 15  GUC      10352 non-null  float64
 16  GUA      10352 non-null  float64
 17  GUG      10352 non-null  float64
 18  GCU      10352 non-null  float64
 19  GCC      10352 non-null  float64
 20  GCA      10352 non-null  float64
 21  GCG      10352 no

In [13]:
# UUU and UUC are objects, these columns should be float
# I got an error :ValueError: could not convert string to float: 'non-B hepatitis virus'
# It seems UUU and UUC have some string values.
df['UUU']=pd.to_numeric(df['UUU'],errors='coerce')
df['UUC'] = pd.to_numeric(df['UUC'],errors='coerce')

# Let's check how many NaN values are there now
df[['UUU','UUC']].isna().sum()

UUU    2
UUC    1
dtype: int64

In [14]:
# there are not many NaN values, I can drop 
df =df.dropna(subset=['UUU','UUC'])

In [15]:
# Now I have to convert Kingdom column to numerical values
class_encoder =LabelEncoder()
df['Kingdom']= class_encoder.fit_transform(df['Kingdom'])
df['Kingdom'].value_counts()

print("class mapping:",dict(zip(class_encoder.classes_,range(len(class_encoder.classes_)))))
# classes are imbalanced, I will handle it later.

class mapping: {'bct': 0, 'pln': 1, 'vrl': 2, 'vrt': 3}


In [16]:
summary = df.describe().T
summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Kingdom,10350.0,1.392850,1.097290,0.0,0.000000,1.000000,2.000000,3.000000e+00
Ncodons,10350.0,80976.708599,593271.618032,1000.0,1576.000000,2904.000000,9768.500000,3.413228e+07
UUU,10350.0,0.023592,0.014709,0.0,0.013590,0.021820,0.031140,1.298300e-01
UUC,10350.0,0.022631,0.011124,0.0,0.015030,0.021345,0.028047,9.169000e-02
UUA,10350.0,0.019010,0.017759,0.0,0.005258,0.014940,0.028170,1.411300e-01
...,...,...,...,...,...,...,...,...
GAA,10350.0,0.028824,0.014110,0.0,0.018162,0.027100,0.037120,1.448900e-01
GAG,10350.0,0.022366,0.014445,0.0,0.011633,0.021305,0.031238,1.585500e-01
UAA,10350.0,0.001546,0.001441,0.0,0.000520,0.001300,0.002270,1.942000e-02
UAG,10350.0,0.000588,0.000848,0.0,0.000000,0.000430,0.000840,2.561000e-02


In [17]:
# Ncodons is skewed,  what I see from median and mean. 
# UAG, UAA, and UGA are very close to zero, when I asked GPT, the response was: 
# These are condon, which instruction for cell to adds one amino acid for building  protein. 
# When it is very low that means, STOP building the chain. 

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10350 entries, 0 to 12060
Data columns (total 66 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Kingdom  10350 non-null  int32  
 1   Ncodons  10350 non-null  int64  
 2   UUU      10350 non-null  float64
 3   UUC      10350 non-null  float64
 4   UUA      10350 non-null  float64
 5   UUG      10350 non-null  float64
 6   CUU      10350 non-null  float64
 7   CUC      10350 non-null  float64
 8   CUA      10350 non-null  float64
 9   CUG      10350 non-null  float64
 10  AUU      10350 non-null  float64
 11  AUC      10350 non-null  float64
 12  AUA      10350 non-null  float64
 13  AUG      10350 non-null  float64
 14  GUU      10350 non-null  float64
 15  GUC      10350 non-null  float64
 16  GUA      10350 non-null  float64
 17  GUG      10350 non-null  float64
 18  GCU      10350 non-null  float64
 19  GCC      10350 non-null  float64
 20  GCA      10350 non-null  float64
 21  GCG      10350 no

In [None]:
# My aim was to merge redundant columns, 64 columns is too much for analysis.
X = df.drop(columns=['Kingdom'])
y= df['Kingdom']
print(X.shape, y.shape)
print(y.value_counts())

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

(10350, 65) (10350,)
Kingdom
0    2919
2    2831
1    2523
3    2077
Name: count, dtype: int64


In [20]:
# Using  selectKBest to reduce number of features
selector = SelectKBest(score_func=mutual_info_classif, k=15)
selector.fit_transform(x_train, y_train)

# let's print the top features with their scores
top_features= x_train.columns[selector.get_support()].tolist()

score_df= pd.DataFrame({"Feature": x_train.columns,"Score": selector.scores_}).sort_values("Score",ascending=False)
print("Top features:\n", score_df.head(15))

Top features:
    Feature     Score
57     AGG  0.449293
7      CUA  0.443030
56     AGA  0.408259
64     UGA  0.370496
51     AAG  0.357965
46     UGU  0.326524
38     ACA  0.306329
61     GAG  0.304889
58     GAU  0.304501
20     GCG  0.300535
23     CCA  0.290341
43     CAG  0.274587
50     AAA  0.267415
24     CCG  0.262520
27     GGC  0.256738


In [21]:
# Now,I can keep only those 15 features
df= df[score_df['Feature'].head(15).values.tolist()+['Kingdom']]
df.shape

(10350, 16)

In [22]:
df.head()

Unnamed: 0,AGG,CUA,AGA,UGA,AAG,UGU,ACA,GAG,GAU,GCG,CCA,CAG,AAA,CCG,GGC,Kingdom
0,0.03559,0.001,0.01303,0.0,0.0386,0.00251,0.00902,0.04361,0.01003,0.01103,0.01203,0.03108,0.01053,0.00501,0.03158,2
1,0.03596,0.00204,0.01696,0.0,0.03392,0.00271,0.01425,0.0441,0.01221,0.01357,0.01221,0.02374,0.00543,0.00407,0.01967,2
2,0.02489,0.01028,0.01974,0.00144,0.03949,0.01625,0.01419,0.02468,0.03126,0.00514,0.02098,0.02365,0.02077,0.0107,0.00864,2
3,0.01671,0.0094,0.0141,0.0,0.04282,0.00992,0.02089,0.03446,0.0376,0.00522,0.0141,0.01253,0.03133,0.00574,0.00366,2
4,0.01734,0.00473,0.01494,0.00131,0.03964,0.01082,0.012,0.03679,0.04148,0.01577,0.00604,0.01809,0.03408,0.00679,0.00775,2


In [23]:
# Handling the imbalanced classes:
# I am aiming to create a pipeline to try different method like smote, random oversampling, ADASYN and compare them 
x = df.drop(columns=['Kingdom'])
y = df['Kingdom']
print(y.value_counts())
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.25, random_state=42, stratify=y)

Kingdom
0    2919
2    2831
1    2523
3    2077
Name: count, dtype: int64


In [24]:
# Handling the imbalanced classes based on macro f1, which means all classes are important equally.
# The highest average f1 score across all classes is the best.
# I am gonna use 3 different methods: RandomOverSampler, SMOTE, BorderlineSMOTE 

# Training with  original data
# RandomOverSampler, this method copies the miniority 
# smote, creating new samples of minority 
# Border, that is similar to smote but it focuses on borderline samples.

oversampling_setup ={"Base":None,"ROS":RandomOverSampler(random_state=42),  
                    "SMOTE":SMOTE(random_state=42),"BORDER":BorderlineSMOTE(random_state=42)}



best_f1_average= ("", float("-inf"), float("-inf"))
for name, method in oversampling_setup.items():
    # First, I am going to use the original data without oversampling.
    if method is None:
        x_train_oversampled, y_train_oversampled = x_train, y_train
    else :
        x_train_oversampled, y_train_oversampled= method.fit_resample(x_train, y_train)

    print(f"\n{name} train counts after:",Counter(y_train_oversampled))
    model = RandomForestClassifier(n_estimators=500, random_state=42,n_jobs=-1).fit(x_train_oversampled, y_train_oversampled)
    predicted_class = model.predict(X_test)
    # Calculating the f1 score for each class 
    f_score_per_class =f1_score(y_test, predicted_class, average=None)
    macro= f1_score(y_test, predicted_class, average="macro")
    smallest_value =float(f_score_per_class.min())

    print("macro_f1:",round(macro,4),"per_class_f1:",[round(score,4) for score in f_score_per_class])
    if (macro>best_f1_average[1])or(macro==best_f1_average[1]and smallest_value>best_f1_average[2]):best_f1_average=(name, macro, smallest_value)

print(f"\nBest F1 average: {best_f1_average[0]}. macro={best_f1_average[1]:.4f}, min_class_f1={best_f1_average[2]:.4f}")
# ROS model has the best macro f1 score.
# For debuuging the tuple initialization with sentinel values and best score tracking, I used QWEN 3MAX.


Base train counts after: Counter({0: 2189, 2: 2123, 1: 1892, 3: 1558})
macro_f1: 0.9646 per_class_f1: [0.9726, 0.9473, 0.9591, 0.9795]

ROS train counts after: Counter({1: 2189, 0: 2189, 3: 2189, 2: 2189})
macro_f1: 0.9656 per_class_f1: [0.9726, 0.9497, 0.9576, 0.9825]

SMOTE train counts after: Counter({1: 2189, 0: 2189, 3: 2189, 2: 2189})


found 0 physical cores < 1
  File "d:\ML_DE\Advanced-Data-Analytics-\.venv\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


macro_f1: 0.9649 per_class_f1: [0.9677, 0.9476, 0.9617, 0.9825]

BORDER train counts after: Counter({1: 2189, 0: 2189, 3: 2189, 2: 2189})
macro_f1: 0.9653 per_class_f1: [0.9684, 0.9482, 0.9602, 0.9845]

Best F1 average: ROS. macro=0.9656, min_class_f1=0.9497
