In [1]:
# Module for machine learning for rice genome data

# Import feature extraction functions:
import Feature_Extraction as fe

# Complete rice dataset paths and import:
import pandas as pd

com_rice_path = "/Users/daveistanto/Dropbox/UIUCGraduateSchool/Researches/CROPSR/data_files/complete_rice_data.csv"
com_rice_df = pd.read_csv(com_rice_path)

In [2]:
# Filter df that has type float on "Site_30_bp" column
com_rice_df = com_rice_df[com_rice_df["Site_30_bp"].apply(lambda x: type(x) != float)]

In [3]:
# Get vectorized features from com_rice_df["Site_30_bp"]
vectorized_feats = com_rice_df["Site_30_bp"].apply(fe.ext_sgRNA_feat)

In [61]:
import warnings
warnings.filterwarnings('ignore')

# Get labels from com_rice_df
com_rice_df.sort_values(by="Predicted_target_efficiency", ascending=False, inplace=True)
com_rice_df["Top_20"] = "False"
com_rice_df["Top_20"][0:int(len(com_rice_df)*0.2)] = "True"

com_rice_df_label = com_rice_df["Top_20"]

In [62]:
# Split dataset to 80% Training 20% Test
from sklearn.model_selection import train_test_split 
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(vectorized_feats.values, com_rice_df_label.values, test_size=0.2, random_state=0)



In [63]:
# Make nestedf nd array to 2d np array
X_train = np.array(X_train.tolist())
y_train = np.array(y_train.tolist())
X_test = np.array(X_test.tolist())
y_test = np.array(y_test.tolist())


In [64]:
# SVM Machine learning, with L-1 regularization, training part

from sklearn.svm import LinearSVC

clf = LinearSVC(penalty='l2', loss='squared_hinge', dual=False)

clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=0)

In [65]:
# SVM Accuracies
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

from sklearn.metrics import accuracy_score

#print("Training data accuracy:", accuracy_score(y_train, pred_train))
#print("Test data accuracy:", accuracy_score(y_test, pred_test))
#print("Intercept", clf.intercept_)
#print("Coefficient / Weight Vector:", clf.coef_)

print(clf.score(X_train, y_train))
print(clf.classes_)



0.7994209039548023
['False' 'True']


In [66]:
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier().fit(X_train, y_train)

d_pred_train = dummy_clf.predict(X_train)
d_pred_test = dummy_clf.predict(X_test)

print(accuracy_score(d_pred_train, y_train))
print(accuracy_score(d_pred_test, y_test))



0.6795338983050847
0.681995367493362


In [67]:
print(com_rice_df.tail())




          sg_id                 SG20_PAM           Locus  Chr  Position  \
8675    1728806  ACATTAAGCGTACAAGTCTTCGG  LOC_Os01g57082    1  32994218   
52004  10774678  TCTGGTTTTGGTCCCGTATTGGG  LOC_Os06g38600    6  22870887   
25012   4591837  CGCCCCCATGGGCCTACCGTCGG  LOC_Os03g17380    3   9668629   
18585   3588567  AAAACATCAAGTGTATAGATTGG  LOC_Os02g43780    2  26413685   
58782  12390184  CAGGGTTTTGCTGGACACATAGG  LOC_Os07g36980    7  22140944   

      Strand  Relative_postion_in_locus  Predicted_target_efficiency  \
8675       +                       1758                        0.308   
52004      -                        298                        0.305   
25012      -                        439                        0.304   
18585      +                       2103                        0.304   
58782      -                        672                        0.301   

       Group_id               Oligo sequence for synthesis (5’-3’)  \
8675         76  TGATGTGTGCTCTTCTCTGTTGTCATGAT

In [68]:
# Find score correlated to label
true_scores = []
false_scores = []
for vec_index in range(len(X_train)):
    score = clf.intercept_ + np.dot(clf.coef_, X_train[vec_index])
    if d_pred_train[vec_index] == "True":
        true_scores.append(score)
        
    else:
        false_scores.append(score)
        
print("True score mean", np.sum(np.array(true_scores)))
print("False score mean", np.sum(np.array(false_scores)))


#print(clf.intercept_ + np.dot(clf.coef_,X_train[8]))

True score mean -8508.815394487247
False score mean -33887.25709935353
