## Task 2

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import json
import gzip

### Helper functions

In [2]:
# parse the features

def parse_features(feature_file_path):
    """Flatten a gzip-compressed JSON-lines feature file into one row per read.

    Every non-blank line is decoded as a JSON object and walked as
    ``{transcript_id: {position: {flanking_nucleotide: [features, ...]}}}``.
    The first nine values of each ``features`` entry are stored, in order,
    under the column names listed in ``feature_names`` below.

    Parameters
    ----------
    feature_file_path : str or path-like
        Path to the gzip-compressed file; it is read as UTF-8 text.

    Returns
    -------
    pandas.DataFrame
        One row per ``features`` entry with the columns ``transcript_id``,
        ``position`` (converted with ``int``), ``flanking_nucleotide`` and the
        nine feature columns. The same twelve columns are present even when
        the file contains no reads.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    IndexError
        If a ``features`` entry holds fewer than nine values.
    """
    feature_names = [
        "dwelling_time_(-1)", "std_dev_(-1)", "mean_signal_(-1)",
        "dwelling_time_central", "std_dev_central", "mean_signal_central",
        "dwelling_time_(+1)", "std_dev_(+1)", "mean_signal_(+1)",
    ]
    columns = ["transcript_id", "position", "flanking_nucleotide"] + feature_names

    rows = []

    with gzip.open(feature_file_path, 'rt', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. an extra newline at the end of the file) are
            # not JSON documents; skip them instead of letting json.loads fail.
            if not line.strip():
                continue
            data = json.loads(line)

            for transcript_id, positions in data.items():
                for position, flanking_data in positions.items():
                    for flanking_nucleotide, features_list in flanking_data.items():
                        for features in features_list:
                            row = {
                                "transcript_id": transcript_id,
                                "position": int(position),
                                "flanking_nucleotide": flanking_nucleotide,
                            }
                            # Index explicitly so a short entry still raises
                            # IndexError rather than being silently truncated.
                            for idx, name in enumerate(feature_names):
                                row[name] = features[idx]
                            rows.append(row)

    # Passing `columns` keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

### Preprocessing

In [3]:
# Print the kernel's working directory; the dataset and CSV paths used in the
# cells below are relative, so they are resolved against this directory.
import os
cwd = os.getcwd()
print(cwd)

/Users/felibunbun/Desktop/dsa4262


In [4]:
# Source file for dataset0 (gzip-compressed, one JSON object per line)
dataset = "./dataset0.json.gz"

# Flatten the file into one row per read
df = parse_features(dataset)

# Average all reads that share a transcript, position and flanking sequence
combined_df = (
    df
    .groupby(['transcript_id', 'position', 'flanking_nucleotide'])
    .mean()
    .reset_index()
)

# The nine signal features that get standardized
numeric_columns = [
    'dwelling_time_(-1)', 'std_dev_(-1)', 'mean_signal_(-1)',
    'dwelling_time_central', 'std_dev_central', 'mean_signal_central',
    'dwelling_time_(+1)', 'std_dev_(+1)', 'mean_signal_(+1)',
]

# NOTE(review): the scaler is fitted on this dataset's own averaged features;
# confirm this matches how the features in train.csv were scaled.
scaler = StandardScaler()
scaler.fit(combined_df[numeric_columns])

# Scale into a copy so combined_df keeps the unscaled averages
standardized_df = combined_df.copy()
standardized_df[numeric_columns] = scaler.transform(combined_df[numeric_columns])

In [5]:
# Source file for dataset1 (gzip-compressed, one JSON object per line)
dataset1 = "./dataset1.json.gz"

# Flatten the file into one row per read
df1 = parse_features(dataset1)

# Average all reads that share a transcript, position and flanking sequence
combined_df1 = (
    df1
    .groupby(['transcript_id', 'position', 'flanking_nucleotide'])
    .mean()
    .reset_index()
)

# The nine signal features that get standardized
numeric_columns = [
    'dwelling_time_(-1)', 'std_dev_(-1)', 'mean_signal_(-1)',
    'dwelling_time_central', 'std_dev_central', 'mean_signal_central',
    'dwelling_time_(+1)', 'std_dev_(+1)', 'mean_signal_(+1)',
]

# NOTE(review): the scaler is fitted on this dataset's own averaged features;
# confirm this matches how the features in train.csv were scaled.
scaler = StandardScaler()
scaler.fit(combined_df1[numeric_columns])

# Scale into a copy so combined_df1 keeps the unscaled averages
standardized_df1 = combined_df1.copy()
standardized_df1[numeric_columns] = scaler.transform(combined_df1[numeric_columns])

In [6]:
# Source file for dataset2 (gzip-compressed, one JSON object per line)
dataset2 = "./dataset2.json.gz"

# Flatten the file into one row per read
df2 = parse_features(dataset2)

# Average all reads that share a transcript, position and flanking sequence
combined_df2 = (
    df2
    .groupby(['transcript_id', 'position', 'flanking_nucleotide'])
    .mean()
    .reset_index()
)

# The nine signal features that get standardized
numeric_columns = [
    'dwelling_time_(-1)', 'std_dev_(-1)', 'mean_signal_(-1)',
    'dwelling_time_central', 'std_dev_central', 'mean_signal_central',
    'dwelling_time_(+1)', 'std_dev_(+1)', 'mean_signal_(+1)',
]

# NOTE(review): the scaler is fitted on this dataset's own averaged features;
# confirm this matches how the features in train.csv were scaled.
scaler = StandardScaler()
scaler.fit(combined_df2[numeric_columns])

# Scale into a copy so combined_df2 keeps the unscaled averages
standardized_df2 = combined_df2.copy()
standardized_df2[numeric_columns] = scaler.transform(combined_df2[numeric_columns])

### Random Forest Model

In [7]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

train_df = pd.read_csv('train.csv')

# Positional split: the final column is taken as the label and every column
# before it as a feature (assumption carried over from the original cell).
y_train = train_df.iloc[:, -1]
X_train = train_df.iloc[:, :-1]

# Object-dtype feature columns are the ones treated as categorical
categorical_cols = list(X_train.select_dtypes(include=['object']).columns)

# One-hot encode the categorical columns; categories unseen during fitting are
# ignored at prediction time, and all remaining columns pass through unchanged.
one_hot = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot, categorical_cols)],
    remainder='passthrough',
)

# Encoding followed by a seeded random forest, fitted as a single estimator
rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

# Train on the full training frame
rf_pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [13]:
# Preview the first rows of the standardized dataset2 frame as plain text
print(standardized_df2.head())

  transcript_id  position flanking_nucleotide  dwelling_time_(-1)  \
0       tx_id_0         0             AAAACCT           -0.116189   
1       tx_id_0        10             TGGACCC            1.621134   
2       tx_id_0        20             GGGACTA            2.676789   
3       tx_id_0        30             TGGACCA            0.546794   
4       tx_id_0        40             TAGACTA            0.455421   

   std_dev_(-1)  mean_signal_(-1)  dwelling_time_central  std_dev_central  \
0     -0.719520         -0.077971               0.111295         0.051595   
1     -0.295856          0.552197               1.338123         0.933801   
2     -0.383981          0.903571               0.626746        -0.099822   
3     -0.234535          0.483300              -0.490785        -0.059003   
4     -0.098278          1.009082               1.931992         0.273028   

   mean_signal_central  dwelling_time_(+1)  std_dev_(+1)  mean_signal_(+1)  
0            -0.558002           -0.501535   

### Model on dataset0

In [9]:
# Score dataset0 with the fitted pipeline.
# NOTE(review): the whole frame, including transcript_id, position and
# flanking_nucleotide, is handed to the pipeline; confirm train.csv has the
# same feature columns.
test_df = standardized_df
X_test = test_df

# Column 1 of predict_proba is kept as the score for each row
y_prob = rf_pipeline.predict_proba(X_test)[:, 1]

# Attach the scores by row position: both frames carry a fresh 0..n-1 index
prob_df = pd.DataFrame({'score': y_prob})
results_df = pd.concat([test_df.reset_index(drop=True), prob_df], axis=1)
results_df = results_df.rename(columns={'position': 'transcript_position'})

# Write only the site identifiers and the score
final_df = results_df[['transcript_id', 'transcript_position', 'score']]
final_df.to_csv('rf_output_dataset0.csv', index=False)


### Model on dataset1

In [10]:
# Score dataset1 with the fitted pipeline.
# NOTE(review): the whole frame, including transcript_id, position and
# flanking_nucleotide, is handed to the pipeline; confirm train.csv has the
# same feature columns.
test_df = standardized_df1
X_test = test_df

# Column 1 of predict_proba is kept as the score for each row
y_prob = rf_pipeline.predict_proba(X_test)[:, 1]

# Attach the scores by row position: both frames carry a fresh 0..n-1 index
prob_df = pd.DataFrame({'score': y_prob})
results_df = pd.concat([test_df.reset_index(drop=True), prob_df], axis=1)
results_df = results_df.rename(columns={'position': 'transcript_position'})

# Write only the site identifiers and the score
final_df = results_df[['transcript_id', 'transcript_position', 'score']]
final_df.to_csv('rf_output_dataset1.csv', index=False)


### Model on dataset2

In [11]:
# Score dataset2 with the fitted pipeline.
# NOTE(review): the whole frame, including transcript_id, position and
# flanking_nucleotide, is handed to the pipeline; confirm train.csv has the
# same feature columns.
test_df = standardized_df2
X_test = test_df

# Column 1 of predict_proba is kept as the score for each row
y_prob = rf_pipeline.predict_proba(X_test)[:, 1]

# Attach the scores by row position: both frames carry a fresh 0..n-1 index
prob_df = pd.DataFrame({'score': y_prob})
results_df = pd.concat([test_df.reset_index(drop=True), prob_df], axis=1)
results_df = results_df.rename(columns={'position': 'transcript_position'})

# Write only the site identifiers and the score
final_df = results_df[['transcript_id', 'transcript_position', 'score']]
final_df.to_csv('rf_output_dataset2.csv', index=False)
