# Set up and validate example model

In [1]:
import pandas as pd
import sklearn

In [2]:
# Alex's data.
# Path in network drive: /Volumes/Projects/FAIMS/txt_CV_data/evidence.txt
# The file is read below from the relative path data/evidence.txt.

# Columns to load from the evidence file; every other column is skipped.
import_columns = [
    'Sequence',
    'Length',
    'Charge',
    'm/z',
    'Resolution',
    'Retention time',
    'Retention length',
    'Number of data points',
    'Number of scans',
    'MS/MS count',
    'MS/MS scan number',
    'Score',
    'Intensity',
]

# Tab-separated file; only the columns listed above are parsed.
evidence_df = pd.read_csv(
    "data/evidence.txt",
    sep="\t",
    usecols=import_columns,
    na_values='NaN',
    low_memory=False,
)

In [3]:
# Preview the first rows of the loaded evidence dataframe
evidence_df.head()

Unnamed: 0,Sequence,Length,Charge,m/z,Resolution,Retention time,Retention length,Number of data points,Number of scans,MS/MS count,MS/MS scan number,Score,Intensity
0,AAAALCTLYHEAGQR,15.0,3.0,544.603468,156578.8,28.238,0.14698,28.0,9.0,2.0,46065.0,111.79,5540000.0
1,AAADGDRDCVLQK,13.0,3.0,473.561269,167527.5,15.991,0.066872,6.0,3.0,1.0,15006.0,58.274,1682700.0
2,AAADTLQGPMQAAYR,15.0,3.0,521.924318,162292.0,28.572,0.16761,24.0,9.0,1.0,46828.0,128.21,15624000.0
3,AAADTLQGPMQAAYR,15.0,2.0,782.382839,131258.6,28.557,0.11755,13.0,6.0,1.0,46930.0,85.807,1355500.0
4,AAAGDLGGDHLAFSCDVAK,19.0,3.0,625.62823,,29.692,1.0,,,1.0,49710.0,104.2,


In [4]:
# Drop every row that has a missing value in any column, see docs:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html
# The result is rebound to the same name rather than using inplace=True, so the
# frame object displayed by the earlier cell is not mutated behind its back.
# The original row index is kept (no reset), so dropped rows leave gaps in it.
evidence_df = evidence_df.dropna(how='any', axis=0)
evidence_df.head()

Unnamed: 0,Sequence,Length,Charge,m/z,Resolution,Retention time,Retention length,Number of data points,Number of scans,MS/MS count,MS/MS scan number,Score,Intensity
0,AAAALCTLYHEAGQR,15.0,3.0,544.603468,156578.8,28.238,0.14698,28.0,9.0,2.0,46065.0,111.79,5540000.0
1,AAADGDRDCVLQK,13.0,3.0,473.561269,167527.5,15.991,0.066872,6.0,3.0,1.0,15006.0,58.274,1682700.0
2,AAADTLQGPMQAAYR,15.0,3.0,521.924318,162292.0,28.572,0.16761,24.0,9.0,1.0,46828.0,128.21,15624000.0
3,AAADTLQGPMQAAYR,15.0,2.0,782.382839,131258.6,28.557,0.11755,13.0,6.0,1.0,46930.0,85.807,1355500.0
5,AAAGGLAMLTSMRPTLCSR,19.0,3.0,655.333995,142127.8,38.62,0.36855,75.0,21.0,1.0,72285.0,35.473,75076000.0


In [5]:
# Display all of the dataframe's column labels
evidence_df.columns

Index(['Sequence', 'Length', 'Charge', 'm/z', 'Resolution', 'Retention time',
       'Retention length', 'Number of data points', 'Number of scans',
       'MS/MS count', 'MS/MS scan number', 'Score', 'Intensity'],
      dtype='object')

In [6]:
# Intensity is the value the model is trained to predict; kept as a
# one-element list so it can be used directly as a column selector.
target_value = ['Intensity']

# Columns used as model inputs when subsetting the dataframe
# (everything that was imported except 'Sequence' and the target).
feature_subset1 = [
    'Length',
    'Charge',
    'm/z',
    'Resolution',
    'Retention time',
    'Retention length',
    'Number of data points',
    'Number of scans',
    'MS/MS count',
    'MS/MS scan number',
    'Score',
]

In [16]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the splitter now
# lives in `sklearn.model_selection`. Fall back to the old module only for
# installations that predate `model_selection`.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

# Feature matrix (the selected input columns) and target (the Intensity column).
X = evidence_df[feature_subset1]
y = evidence_df[target_value]

# Split the data with 50% in each set. random_state pins the shuffle so the
# split is reproducible; test_size is given explicitly alongside train_size so
# no version-dependent "complement" default is relied on.
X1, X2, y1, y2 = train_test_split(X, y, random_state=0,
                                  train_size=0.5, test_size=0.5)

# `y` is a one-column DataFrame, so y1/y2 come back 2-D; flatten them to 1-D
# arrays before they are passed to the estimator and the metric functions.
y1 = y1.values.ravel()
y2 = y2.values.ravel()

In [9]:
# Inspect the first rows of X1, the first feature split returned by train_test_split
X1.head()

Unnamed: 0,Length,Charge,m/z,Resolution,Retention time,Retention length,Number of data points,Number of scans,MS/MS count,MS/MS scan number,Score
351208,31.0,5.0,687.772455,139759.5,46.313,0.28219,17.0,16.0,1.0,94251.0,30.576
35581,15.0,2.0,880.41074,120965.1,45.27,0.40661,98.0,31.0,1.0,19723.0,56.225
278941,9.0,3.0,385.916876,185982.9,35.401,0.15066,8.0,8.0,1.0,67081.0,77.058
35925,25.0,2.0,1339.71273,95473.78,49.689,0.23584,46.0,13.0,1.0,40164.0,49.31
373255,8.0,2.0,450.748516,179507.7,16.463,0.11696,10.0,6.0,1.0,17202.0,85.265


In [10]:
# Inspect the target vector y1 (the flattened Intensity values paired with X1)
y1

array([ 2835000.,   867580.,  1913900., ..., 14247000.,   634230.,
        6212500.])

In [None]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# 200 trees. random_state is fixed so that the fitted forest, and therefore the
# error metric and example prediction computed from it, are the same on every
# Restart & Run All; without it each run trains a different forest.
forest = RandomForestRegressor(n_estimators=200, random_state=0)

# fit the model on one set of data
forest.fit(X1, y1)

# evaluate the model on the second set of data
y2_model = forest.predict(X2)

In [88]:
# Root-mean-squared error of the forest's predictions on the second split.
# Worth finding various/better evaluation metrics
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y2, y2_model)
np.sqrt(mse)

86443145.88969207

In [None]:
# Here's how to plot a predicted vs. actual plot
# see: https://scikit-learn.org/0.16/auto_examples/plot_cv_predict.html

# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer
# `sklearn.model_selection` and fall back only on very old installations.
try:
    from sklearn.model_selection import cross_val_predict
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import cross_val_predict
import matplotlib.pyplot as plt

# Fresh, unfitted forest; seeded so the plotted predictions are reproducible.
forest1 = RandomForestRegressor(n_estimators=200, random_state=0)

# cross_val_predict returns an array of the same size as `y2` where each entry
# is a prediction obtained by cross validation (2 folds here).
predicted = cross_val_predict(forest1, X2, y2, cv=2)

fig, ax = plt.subplots()
# Scatter against y2, the targets the predictions were made for. The full
# target `y` covers both splits, so its length does not match `predicted`.
ax.scatter(y2, predicted)
# Dashed diagonal = perfect prediction (predicted == measured).
ax.plot([y2.min(), y2.max()], [y2.min(), y2.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
# plt.show() renders under the notebook's inline backend; fig.show() only
# works with GUI backends and emits a warning otherwise.
plt.show()

In [92]:
# An example of a single row/observation: the row at position 1 of X2, shown as a Series
X2.iloc[1]

Length                       25.000000
Charge                        4.000000
m/z                         713.612534
Resolution               136794.600000
Retention time               36.522000
Retention length              0.185180
Number of data points        32.000000
Number of scans              10.000000
MS/MS count                   1.000000
MS/MS scan number         78485.000000
Score                        58.457000
Name: 131319, dtype: float64

In [97]:
# Turn the single observation into a one-row, 2-D array so that it has the
# same (rows, features) layout as the training data, then display it.
example_row = np.array(X2.iloc[1])
example_peptide_feature_matrix = example_row.reshape(1, -1)
example_peptide_feature_matrix

In [96]:
# How to make a single prediction: pass the one-row feature matrix to the fitted forest
forest.predict(example_peptide_feature_matrix)

array([15849678.])

### What Justin's data actually looks like:

In [99]:
# Load the parsed feature table from CSV and preview its first rows.
parsed_features_path = "Features_MaxCVs.csv"
parsed_df = pd.read_csv(parsed_features_path)
parsed_df.head()

Unnamed: 0,Sequence,Charge,Intensity,Length.x,Max Intensity CV,A.Count,R.Count,N.Count,D.Count,C.Count,...,M.Count,F.Count,P.Count,S.Count,T.Count,W.Count,Y.Count,V.Count,U.Count,O.Count
0,AAAAAAAAAAAAAAAGAGAGAK,2,548850.0,22,25,18,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AAAAAAAAAAAAAAAGAGAGAK,3,7280200.0,22,45,18,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGP...,4,11694000.0,52,35,19,1,1,0,0,...,0,0,4,3,3,0,0,3,0,0
3,AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGP...,5,14383000.0,52,40,19,1,1,0,0,...,0,0,4,3,3,0,0,3,0,0
4,AAAAAAAAAVSR,2,14281000.0,12,60,9,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [100]:
# Available features: the column labels of the parsed dataframe
parsed_df.columns

Index(['Sequence', 'Charge', 'Intensity', 'Length.x', 'Max Intensity CV',
       'A.Count', 'R.Count', 'N.Count', 'D.Count', 'C.Count', 'Q.Count',
       'E.Count', 'G.Count', 'H.Count', 'I.Count', 'L.Count', 'K.Count',
       'M.Count', 'F.Count', 'P.Count', 'S.Count', 'T.Count', 'W.Count',
       'Y.Count', 'V.Count', 'U.Count', 'O.Count'],
      dtype='object')