In [17]:
#Importing data
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')

#Importing data
metadata = pd.read_csv('metadata_labeled.csv')
metadata =metadata.drop_duplicates(subset='Run', keep='first')
metadata = metadata.drop('Label',axis=1)
multiqc = pd.read_csv('multiqc_data.csv')
label= pd.read_csv('labels_splitted.csv',sep='\t')
#Merge metad and execution dataframe on Run
df = pd.merge(metadata, multiqc, on='Run', how='inner')
df = pd.merge(df, label, on='Run', how='inner')
df.index = df['Run']
#Removing columns with no data
df = df.dropna(axis=1, how='all')
#Removing columns with only one unique value
df = df.loc[:,df.apply(pd.Series.nunique) != 1]
#Removing columns with more than 40% missing values
df = df.loc[:, df.isnull().mean() < .4]
#Removing columns with more than 40% zeros
df = df.loc[:, (df == 0).mean() < .4]
#Remove non-numeric columns
df = df.select_dtypes(include=[np.number])
df = df.loc[:,~df.columns.duplicated()]
df = df.drop(['None_mqc-generalstats-fastqc-percent_duplicates', 'None_mqc-generalstats-fastqc-percent_gc', 'None_mqc-generalstats-fastqc-avg_sequence_length', 'None_mqc-generalstats-fastqc-median_sequence_length', 'None_mqc-generalstats-fastqc-percent_fails', 'None_mqc-generalstats-fastqc-total_sequences'], axis=1)
#shuffle df
test_data = pd.read_csv('test_data.csv')
df_train = df.loc[~df.index.isin(test_data['Run'])]
df_test = df.loc[df.index.isin(test_data['Run'])]
# df = df.sample(frac=1)
X_train = df_train.iloc[:,0:-5] #features
y_train = df_train.iloc[:,-10:] #labels
X_test = df_test.iloc[:,0:-5] #features
y_test = df_test.iloc[:,-10:] #labels
sc=StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
del df, label, metadata, multiqc, sc

In [18]:
# Stage-I Fq2bam
# Train the model with M1 machine
machine_name = 'M1'  # Enter your machine name here
stage_name = 'fq2bam'  # Enter the stage name here
column_name = machine_name + '_' + stage_name
y_train_machine = y_train[column_name]
y_test_machine = y_test[column_name]

# Initialize and train the model
model = RandomForestRegressor()
model.fit(X_train, y_train_machine)

# Predict the values
y_pred = model.predict(X_test)

# Create a DataFrame for the actual and predicted values
df = pd.DataFrame({'Actual': y_test_machine, 'Predicted': y_pred})

# Save the DataFrame to a CSV file
df.to_csv('actual_vs_predicted_stage-1.csv', index=True)

# Calculate and print evaluation metrics
print("Results are for",machine_name,"machine and",stage_name,"stage")
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_machine, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test_machine, y_pred))
print('R2 Score:', metrics.r2_score(y_test_machine, y_pred))
print('Correlation:', df['Actual'].corr(df['Predicted']))
print("actual_vs_predicted.csv file is created in the current working directory")

# Display the first few rows of the DataFrame
df.head()

Results are for M5 machine and fq2bam stage
Mean Absolute Error: 24.40888888888888
Mean Squared Error: 1009.5521999999991
R2 Score: 0.983658827338013
Correlation: 0.9922895071672418
actual_vs_predicted.csv file is created in the current working directory


Unnamed: 0_level_0,Actual,Predicted
Run,Unnamed: 1_level_1,Unnamed: 2_level_1
ERR020241,1001,1026.34
ERR018491,1131,1059.13
ERR022463,801,845.09
ERR018454,692,743.95
ERR018442,658,722.92


In [19]:
# Stage-II haplotype_caller
# Train the model with M1 machine
stage_name = 'htvc'  # Enter the stage name here
column_name = machine_name + '_' + stage_name
y_train_machine = y_train[column_name]
y_test_machine = y_test[column_name]


# Initialize and train the model
model = RandomForestRegressor()
model.fit(X_train, y_train_machine)

# Predict the values
y_pred = model.predict(X_test)

# Create a DataFrame for the actual and predicted values
df = pd.DataFrame({'Actual': y_test_machine, 'Predicted': y_pred})

# Save the DataFrame to a CSV file
df.to_csv('actual_vs_predicted_stage-2.csv', index=True)

# Calculate and print evaluation metrics
print("Results are for",machine_name,"machine and",stage_name,"stage")
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_machine, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test_machine, y_pred))
print('R2 Score:', metrics.r2_score(y_test_machine, y_pred))
print('Correlation:', df['Actual'].corr(df['Predicted']))
print("actual_vs_predicted.csv file is created in the current working directory")

# Display the first few rows of the DataFrame
df.head()

Results are for M5 machine and htvc stage
Mean Absolute Error: 14.898333333333337
Mean Squared Error: 441.12981666666684
R2 Score: 0.9946753408659983
Correlation: 0.9973555431131532
actual_vs_predicted.csv file is created in the current working directory


Unnamed: 0_level_0,Actual,Predicted
Run,Unnamed: 1_level_1,Unnamed: 2_level_1
ERR020241,1151,1119.29
ERR018491,1106,1167.02
ERR022463,850,840.44
ERR018454,713,672.53
ERR018442,592,610.19
