In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
data = pd.read_csv('./data/real1/snv-parse-real1.txt', sep='\t', dtype={'Chr': str})
data['Chr'] = data['Chr'].astype('str') 

In [4]:
truth = pd.read_csv('./real1/real1_truth.bed', sep='\t', header=None)

In [5]:
data = pd.read_csv('./data/real1/snv-parse-real1.txt', sep='\t', dtype={'Chr': str})

truth = pd.read_csv('./data/real1/real1_truth.bed', sep='\t', header=None, names=['Chr', 'START_POS_REF', 'END_POS_REF'])

#'Chr' is treated as string in both datasets
data['Chr'] = data['Chr'].astype(str)
truth['Chr'] = truth['Chr'].astype(str)

# Create a set of true SNV positions from the BED file
true_snv_set = set(zip(truth['Chr'], truth['START_POS_REF'], truth['END_POS_REF']))

# Add a new column 'True_SNV' to indicate if the SNV is in ground truth
data['True_SNV'] = data.apply(lambda row: (row['Chr'], row['START_POS_REF'], row['END_POS_REF']) in true_snv_set, axis=1)

# Save the final dataset with the new column
data.to_csv('./data/real1/snv-parse-real1-labeled.txt', sep='\t', index=False)

In [6]:
data = pd.read_csv('./data/real1/snv-parse-real1-labeled.txt', sep='\t', dtype={'Chr': str})
data.head()

Unnamed: 0,Chr,START_POS_REF,END_POS_REF,REF,ALT,REF_MFVdVs,ALT_MFVdVs,Sample_Name,FILTER_Mutect2,FILTER_Freebayes,FILTER_Vardict,FILTER_Varscan,m2_MQ,f_MQMR,vs_SSC,vs_SPV,vd_SSF,vd_MSI,True_SNV
0,1,13110,13110,G,A,G/NA/G/G/,A/NA/A/A/,icgc_cll-T,True,False,False,False,41.91,,2.0,0.52243,0.23427,2.0,False
1,1,15015,15015,G,C,G/NA/NA/G/,C/NA/NA/C/,icgc_cll-T,True,False,False,False,43.42,,5.0,0.30239,,,False
2,1,16949,16949,A,C,NA/NA/NA/A/,NA/NA/NA/C/,icgc_cll-T,False,False,False,True,,,16.0,0.023282,,,False
3,1,40552,40552,T,C,NA/NA/NA/T/,NA/NA/NA/C/,icgc_cll-T,False,False,False,True,,,26.0,0.002231,,,False
4,1,46907,46907,T,C,NA/NA/NA/T/,NA/NA/NA/C/,icgc_cll-T,False,False,False,True,,,17.0,0.01767,,,False


In [7]:
# clean columns
# Drop non-numeric variables
non_numeric_cols = ['REF', 'ALT', 'REF_MFVdVs', 'ALT_MFVdVs', 'Sample_Name']
data = data.drop(columns=non_numeric_cols)
data.head()

Unnamed: 0,Chr,START_POS_REF,END_POS_REF,FILTER_Mutect2,FILTER_Freebayes,FILTER_Vardict,FILTER_Varscan,m2_MQ,f_MQMR,vs_SSC,vs_SPV,vd_SSF,vd_MSI,True_SNV
0,1,13110,13110,True,False,False,False,41.91,,2.0,0.52243,0.23427,2.0,False
1,1,15015,15015,True,False,False,False,43.42,,5.0,0.30239,,,False
2,1,16949,16949,False,False,False,True,,,16.0,0.023282,,,False
3,1,40552,40552,False,False,False,True,,,26.0,0.002231,,,False
4,1,46907,46907,False,False,False,True,,,17.0,0.01767,,,False


In [8]:
# clean columns
# Drop nan variables
non_numeric_cols = ['m2_MQ', 'f_MQMR', 'vs_SSC', 'vs_SPV', 'vd_SSF', 'vd_MSI']
data = data.drop(columns=non_numeric_cols)
data.head()

Unnamed: 0,Chr,START_POS_REF,END_POS_REF,FILTER_Mutect2,FILTER_Freebayes,FILTER_Vardict,FILTER_Varscan,True_SNV
0,1,13110,13110,True,False,False,False,False
1,1,15015,15015,True,False,False,False,False
2,1,16949,16949,False,False,False,True,False
3,1,40552,40552,False,False,False,True,False
4,1,46907,46907,False,False,False,True,False


In [9]:
data.isnull().sum()

Chr                 0
START_POS_REF       0
END_POS_REF         0
FILTER_Mutect2      0
FILTER_Freebayes    0
FILTER_Vardict      0
FILTER_Varscan      0
True_SNV            0
dtype: int64

In [10]:
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
X

array([['1', 13110, 13110, ..., False, False, False],
       ['1', 15015, 15015, ..., False, False, False],
       ['1', 16949, 16949, ..., False, False, True],
       ...,
       ['Y', 59030255, 59030255, ..., False, False, False],
       ['Y', 59031439, 59031439, ..., False, False, False],
       ['Y', 59031799, 59031799, ..., False, False, False]],
      shape=(49320, 7), dtype=object)

In [11]:
# Clean the data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
data = pd.read_csv('./data/real1/snv-parse-real1-labeled.txt', sep='\t', dtype={'Chr': str})
non_numeric_cols = ['REF', 'ALT', 'REF_MFVdVs', 'ALT_MFVdVs', 'Sample_Name', 'm2_MQ', 'f_MQMR', 'vs_SSC', 'vs_SPV', 'vd_SSF', 'vd_MSI']
data = data.drop(columns=non_numeric_cols)
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# One-hot encode 'Chr
print(X.shape)
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = ct.fit_transform(data).toarray() # Convert to numpy array
X

(49320, 7)


array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(49320, 80))

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state = 0, stratisfy=y)

In [20]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

classifier = LinearSVC(random_state=0, dual=False)  # `dual=False` improves performance for large datasets
model = classifier.fit(X_train, y_train)
print(model)

LinearSVC(dual=False, random_state=0)


In [21]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

y_pred = classifier.predict(X_test)

confusion = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [22]:
confusion

array([[24023,     0],
       [  637,     0]])

In [23]:
accuracy

0.974168694241687

In [24]:
f1

0.0