In [1]:
import pandas as pd

import joblib
import os

In [2]:
# Loading the pretrained Random Forest Model
dirname = os.path.dirname(__name__) # Path to SGNexRF.ipynb
filename = os.path.join(dirname, '../model/RF_updated.joblib') # Relative path to model folder
rf = joblib.load(filename)

# Loading scaler used from training
filename2 = os.path.join(dirname, '../model/RF_scaler.joblib') # Relative path to model folder
scaler = joblib.load(filename2)


In [4]:
# Specifying file locations
data_folder = "../SGNexData"
output_folder = "../output"

# Running model predictions
for file in os.listdir(data_folder):
    if file.endswith(".csv"):
        df_file_path = os.path.join(data_folder,file)
        print(df_file_path)
        df = pd.read_csv(df_file_path)

        #Rename columns:
        df = df.rename(columns={
            'ENST_ID': 'transcript_id',
            'Position': 'transcript_position',
            'ENST_ID_encoded': 'transcript_id_encoded'
        })

        #Drop features that are not needed
        X = df.drop(columns=['transcript_id','Key'])

        # Scale new input data
        X_scaled = scaler.transform(X)

        # Run model predictions
        predicted_probabilities = rf.predict_proba(X_scaled)[:, 1]

        # Create output submission file in required format
        # transcript_id, transcript_position, score
        output = df[['transcript_id', 'transcript_position']]

        # Add in prediction scores as 'score' column
        output['score'] = pd.Series(predicted_probabilities)

        # Obtain the average of the scores and print it out
        avg_pred_score = output['score'].mean()
        print(f"Average Predicted Score for {file}: {avg_pred_score}")

        # Saving final output file for comparison across cell lines
        output_file_name = file.replace("_processed", "_RF_predict")
        csv_file_path = os.path.join(output_folder, output_file_name)
        output.to_csv(csv_file_path, index=False)

../SGNexData/HepG2R6R1_processed.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['score'] = pd.Series(predicted_probabilities)


Average Predicted Score for HepG2R6R1_processed.csv: 0.14836255583459543
../SGNexData/K562R5R1_processed.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['score'] = pd.Series(predicted_probabilities)


Average Predicted Score for K562R5R1_processed.csv: 0.1442863630188457
../SGNexData/Hct116R3R4_processed.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['score'] = pd.Series(predicted_probabilities)


Average Predicted Score for Hct116R3R4_processed.csv: 0.1527599856892133
../SGNexData/Hct116R4R3_processed.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['score'] = pd.Series(predicted_probabilities)


Average Predicted Score for Hct116R4R3_processed.csv: 0.15716524550444538
../SGNexData/K562R4R1_processed.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['score'] = pd.Series(predicted_probabilities)


Average Predicted Score for K562R4R1_processed.csv: 0.14330856068754957
../SGNexData/A549R5R1_processed.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['score'] = pd.Series(predicted_probabilities)


Average Predicted Score for A549R5R1_processed.csv: 0.14677226337130414
../SGNexData/HepG2R5R2_processed.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['score'] = pd.Series(predicted_probabilities)


Average Predicted Score for HepG2R5R2_processed.csv: 0.13451557891234478
../SGNexData/K562R6R1_processed.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['score'] = pd.Series(predicted_probabilities)


Average Predicted Score for K562R6R1_processed.csv: 0.14869921929736762
../SGNexData/Hct116R3R1_processed.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['score'] = pd.Series(predicted_probabilities)


Average Predicted Score for Hct116R3R1_processed.csv: 0.14298592612292718
../SGNexData/MCF7R4R1_processed.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['score'] = pd.Series(predicted_probabilities)


Average Predicted Score for MCF7R4R1_processed.csv: 0.14368808358379065
../SGNexData/MCF7R3R1_processed.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['score'] = pd.Series(predicted_probabilities)


Average Predicted Score for MCF7R3R1_processed.csv: 0.14452192489803353
../SGNexData/A549R6R1_processed.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output['score'] = pd.Series(predicted_probabilities)


Average Predicted Score for A549R6R1_processed.csv: 0.14816239598181766
