# Set up and validate example model

In [None]:
import pandas as pd
import sklearn

In [None]:
# Alex's data. The copy on the network drive lives at
# /Volumes/Projects/FAIMS/txt_CV_data/evidence.txt; this cell reads the
# relative path data/evidence.txt.

# Only these columns are pulled out of the tab-separated file.
import_columns = [
    'Sequence',
    'Length',
    'Charge',
    'm/z',
    'Resolution',
    'Retention time',
    'Retention length',
    'Number of data points',
    'Number of scans',
    'MS/MS count',
    'MS/MS scan number',
    'Score',
    'Intensity',
]

evidence_df = pd.read_csv(
    "data/evidence.txt",
    sep="\t",
    usecols=import_columns,
    na_values='NaN',
    low_memory=False,
)

In [None]:
# Preview the first rows of the table exactly as loaded (before any rows are dropped).
evidence_df.head()

In [None]:
# Keep only the rows in which every imported column has a value, then preview
# the result. Rebinding the name (rather than inplace=True) leaves the cell
# safe to re-run.
evidence_df = evidence_df.dropna(axis=0, how='any')
evidence_df.head()

In [None]:
# List the column labels that survived the import, for reference when picking features.
evidence_df.columns

In [None]:
# Columns used as model inputs (every imported column except 'Sequence'
# and the target).
feature_subset1 = [
    'Length',
    'Charge',
    'm/z',
    'Resolution',
    'Retention time',
    'Retention length',
    'Number of data points',
    'Number of scans',
    'MS/MS count',
    'MS/MS scan number',
    'Score',
]

# Kept as a one-element list, so indexing the frame with it yields a
# one-column DataFrame rather than a Series.
target_value = ['Intensity']

In [87]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the same helper
# lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

X = evidence_df[feature_subset1]
y = evidence_df[target_value]

# Split the data with 50% in each set; random_state pins the shuffle so the
# same rows land in each half on every run.
X1, X2, y1, y2 = train_test_split(X, y, random_state=0,
                                  train_size=0.5)

# `y` is a one-column DataFrame; flatten each half to a 1-D NumPy array.
# From here on y1/y2 are ndarrays, so pandas-only methods such as .head()
# are no longer available on them.
y1 = y1.values.ravel()
y2 = y2.values.ravel()

In [None]:
# Peek at the training half. y1 has already been flattened to a NumPy array
# (which has no .head()), so slice its first five entries instead.
# Only the last expression of the cell is rendered.
X1.head()
y1[:5]

In [None]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# 200 trees. A fixed random_state makes the fitted forest, and every number
# derived from it below, reproducible across kernel restarts.
forest = RandomForestRegressor(n_estimators=200, random_state=0)

# fit the model on one set of data
forest.fit(X1, y1)

# evaluate the model on the second set of data
y2_model = forest.predict(X2)

In [None]:
# y1 is a flat NumPy array here, so slice instead of calling the pandas-only .head().
y1[:5]

In [None]:
# First five training targets as a flat array. np.ravel accepts both a NumPy
# array and a pandas object, unlike the .head().values chain.
np.ravel(y1[:5])

In [None]:
# Number of predictions produced for the held-out half.
len(y2_model)

In [None]:
# Number of held-out targets; should match len(y2_model). y2 is already a
# NumPy array (no .values attribute), so flatten it with np.ravel.
len(np.ravel(y2))

In [88]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the forest's predictions on the held-out half,
# in the same units as the target.
holdout_mse = mean_squared_error(y2, y2_model)
np.sqrt(holdout_mse)

86443145.88969207

In [None]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the same helper
# lives in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_predict
import matplotlib.pyplot as plt

# Fresh, unfitted forest for cross-validation; seeded so the plot is reproducible.
forest1 = RandomForestRegressor(n_estimators=200, random_state=0)

# cross_val_predict returns an array of the same size as `y2` where each entry
# is a prediction obtained by 10-fold cross-validation over the second half.
predicted = cross_val_predict(forest1, X2, y2, cv=10)

fig, ax = plt.subplots()
# Plot against y2 (the values that were cross-validated), not the full-data
# `y`, whose length does not match `predicted`.
ax.scatter(y2, predicted)
# Dashed diagonal marks perfect agreement between measured and predicted.
ax.plot([y2.min(), y2.max()], [y2.min(), y2.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
# plt.show() works with the notebook's inline backend; fig.show() expects a GUI backend.
plt.show()

In [None]:
# Predictions for the held-out half; this is the same call that produced y2_model.
forest.predict(X2)

In [91]:
# Preview the held-out feature rows; the index keeps the row labels from evidence_df.
X2.head()

Unnamed: 0,Length,Charge,m/z,Resolution,Retention time,Retention length,Number of data points,Number of scans,MS/MS count,MS/MS scan number,Score
371312,14.0,3.0,609.617443,147275.0,42.445,0.13478,20.0,7.0,1.0,83754.0,59.864
131319,25.0,4.0,713.612534,136794.6,36.522,0.18518,32.0,10.0,1.0,78485.0,58.457
213274,13.0,2.0,762.946464,131336.8,27.053,0.083527,12.0,4.0,1.0,48031.0,79.07
115250,17.0,2.0,956.463404,135959.9,25.032,0.13435,21.0,7.0,1.0,49376.0,36.944
148382,8.0,2.0,513.21428,166812.0,14.345,0.13456,20.0,7.0,1.0,16554.0,124.98


In [92]:
# Select the held-out row at position 1 as a Series; it serves as the example peptide below.
X2.iloc[1]

Length                       25.000000
Charge                        4.000000
m/z                         713.612534
Resolution               136794.600000
Retention time               36.522000
Retention length              0.185180
Number of data points        32.000000
Number of scans              10.000000
MS/MS count                   1.000000
MS/MS scan number         78485.000000
Score                        58.457000
Name: 131319, dtype: float64

In [97]:
# Copy one held-out row into an array and add a leading axis, giving the
# (1, n_features) 2-D shape that predict() expects for a single sample.
example_peptide_feature_matrix = np.array(X2.iloc[1])[np.newaxis, :]

In [98]:
# Show the single-row feature matrix to confirm its 2-D shape and values.
example_peptide_feature_matrix

array([[2.50000000e+01, 4.00000000e+00, 7.13612534e+02, 1.36794600e+05,
        3.65220000e+01, 1.85180000e-01, 3.20000000e+01, 1.00000000e+01,
        1.00000000e+00, 7.84850000e+04, 5.84570000e+01]])

In [96]:
# Predict the intensity of the single example peptide. The method has to be
# called with the (1, n_features) matrix; the bare attribute only displays
# the bound method.
forest.predict(example_peptide_feature_matrix)

array([15849678.])

### What Justin's data actually looks like:

In [99]:
# Load the parsed feature table (comma-separated, default read_csv settings)
# from the working directory and preview its first rows.
parsed_df = pd.read_csv("Features_MaxCVs.csv")
parsed_df.head()

Unnamed: 0,Sequence,Charge,Intensity,Length.x,Max Intensity CV,A.Count,R.Count,N.Count,D.Count,C.Count,...,M.Count,F.Count,P.Count,S.Count,T.Count,W.Count,Y.Count,V.Count,U.Count,O.Count
0,AAAAAAAAAAAAAAAGAGAGAK,2,548850.0,22,25,18,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AAAAAAAAAAAAAAAGAGAGAK,3,7280200.0,22,45,18,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGP...,4,11694000.0,52,35,19,1,1,0,0,...,0,0,4,3,3,0,0,3,0,0
3,AAAAAAAAAAAATGTEAGPGTAGGSENGSEVAAQPAGLSGPAEVGP...,5,14383000.0,52,40,19,1,1,0,0,...,0,0,4,3,3,0,0,3,0,0
4,AAAAAAAAAVSR,2,14281000.0,12,60,9,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [100]:
# List the parsed table's columns to compare against the features the model was trained on.
parsed_df.columns

Index(['Sequence', 'Charge', 'Intensity', 'Length.x', 'Max Intensity CV',
       'A.Count', 'R.Count', 'N.Count', 'D.Count', 'C.Count', 'Q.Count',
       'E.Count', 'G.Count', 'H.Count', 'I.Count', 'L.Count', 'K.Count',
       'M.Count', 'F.Count', 'P.Count', 'S.Count', 'T.Count', 'W.Count',
       'Y.Count', 'V.Count', 'U.Count', 'O.Count'],
      dtype='object')