# Machine Learning on Quantopian

## Overview
1. Define trading universe to use ([Q500US and Q1500US](https://www.quantopian.com/posts/the-q500us-and-q1500us)).
2. Define alphas (implemented in [Pipeline](https://www.quantopian.com/tutorials/pipeline)).
3. Run pipeline.
4. Split into train and test set.
5. Preprocess data (rank alphas, subsample, align alphas with future returns, impute, scale).
6. Train Machine Learning classifier ([Random Forest from Scikit-Learn](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)).
7. Evaluate Machine Learning classifier on test set.


In [None]:
from quantopian.research import run_pipeline
from quantopian.pipeline import Pipeline
from quantopian.pipeline.factors import Latest
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.data import morningstar
from quantopian.pipeline.factors import CustomFactor, DailyReturns, SimpleMovingAverage, EWMA, AverageDollarVolume, Returns, RSI, VWAP
from quantopian.pipeline.classifiers.morningstar import Sector
from quantopian.pipeline.filters import Q500US, Q1500US
from quantopian.pipeline.data.quandl import fred_usdontd156n as libor
from quantopian.pipeline.data.morningstar import Fundamentals

import talib
import pandas as pd
import numpy as np
from time import time
import math  

import alphalens as al
import pyfolio as pf
from scipy import stats
import matplotlib.pyplot as plt
from sklearn import grid_search, linear_model, decomposition, ensemble, preprocessing, isotonic, metrics

## Definition of some commonly used factors
The factors below are a small collection of commonly used alphas that were coded by Gil Wassermann. I will post a separate Notebook with the full collection and more descriptions of them. Ultimately we will put these into a library you can just import to avoid the wall of text. If you want to understand more about pipeline, read the [tutorial](https://www.quantopian.com/tutorials/pipeline).

Also note the `Earnings_Quality` alpha, which uses [Zacks Earnings Surprises](https://www.quantopian.com/data/zacks/earnings_surprises), a [new source from our partners](https://www.quantopian.com/data). It is currently disabled in the code below and is therefore not part of the returned factor set.

The details of these factors are not the focus of this Notebook so feel free to just [skip](#universe) this cell.

In [None]:
# Short aliases for the Morningstar fundamentals namespaces used by the factor
# definitions in this notebook.
bs = morningstar.balance_sheet  # balance sheet fields
cfs = morningstar.cash_flow_statement  # cash flow statement fields
is_ = morningstar.income_statement  # trailing underscore: `is` is a keyword
or_ = morningstar.operation_ratios  # trailing underscore: `or` is a keyword
er = morningstar.earnings_report
v = morningstar.valuation
vr = morningstar.valuation_ratios

def make_factors():
    """Build the catalogue of alpha factors used by this notebook.

    Returns:
        dict mapping a human-readable factor name to a zero-argument
        callable (a ``CustomFactor`` subclass or a small function). Calling
        the value yields a Pipeline term for that alpha.
    """

    class APR(CustomFactor):
        """Mean true range over the window divided by the latest close."""
        inputs = [USEquityPricing.close, USEquityPricing.high, USEquityPricing.low]
        window_length = 252

        def compute(self, today, assets, out, close, high, low):
            hml = high - low
            hmpc = np.abs(high - np.roll(close, 1, axis=0))
            lmpc = np.abs(low - np.roll(close, 1, axis=0))
            tr = np.maximum(hml, np.maximum(hmpc, lmpc))
            # Row 0 is dropped: np.roll wraps the *last* close around to the
            # first row, so the first true range has no valid previous close.
            atr = np.mean(tr[1:], axis=0)
            out[:] = atr / close[-1]

    def Asset_Growth_3M():
        return Returns(inputs=[bs.total_assets], window_length=63)

    def Asset_To_Equity_Ratio():
        return bs.total_assets.latest / bs.common_stock_equity.latest

    def Capex_To_Cashflows():
        return (cfs.capital_expenditure.latest * 4.) / \
            (cfs.free_cash_flow.latest * 4.)

    class Downside_Volatility(CustomFactor):
        """Annualised standard deviation of the non-positive daily returns."""
        inputs = [DailyReturns()]
        window_length = 126

        def compute(self, today, assets, out, returns):
            # Build a masked copy instead of overwriting the input window in
            # place; positive returns are excluded from the deviation.
            downside = np.where(returns > 0, np.nan, returns)
            out[:] = np.nanstd(downside, axis=0) * math.sqrt(252)

    def EBITDA_Yield():
        return (is_.ebitda.latest * 4.) / \
            USEquityPricing.close.latest

    def EBIT_To_Assets():
        return (is_.ebit.latest * 4.) / \
            bs.total_assets.latest

    # Disabled factors, kept for reference as an inert string literal.
    # Earnings_Quality refers to an `EarningsSurprises` dataset that this
    # notebook does not import.
    """
    def Earnings_Quality():
        return morningstar.cash_flow_statement.operating_cash_flow.latest / \
               EarningsSurprises.eps_act.latest


    class Fourier_Extrapolation(CustomFactor):
        inputs = [DailyReturns(window_length=80)]
        window_length = 252

        def compute(self, today, assets, out, rets):
            n = rets.size
            n_predict = 10
            n_harm = 20                     # number of harmonics in model
            t = np.arange(0, n)
            try:
                p = np.polyfit(t, rets, 1)         # find linear trend in x
                x_notrend = rets - p[0] * t        # detrended x
                x_freqdom = np.fft.fft(x_notrend)  # detrended x in frequency domain
                f = np.fft.fftfreq(n)              # frequencies
                indexes = list(range(n))
                # sort indexes by frequency, lower -> higher
                indexes.sort(key = lambda i: np.absolute(f[i]))

                t = np.arange(0, n + n_predict)
                restored_sig = np.zeros(t.size)
                for i in indexes[:1 + n_harm * 2]:
                    ampli = np.absolute(x_freqdom[i]) / n   # amplitude
                    phase = np.angle(x_freqdom[i])          # phase
                    restored_sig += ampli * np.cos(2 * np.pi * f[i] * t + phase)

                extrapolation = restored_sig + p[0] * t
                out[:] = (1+extrapolation[-n_predict:]).cumprod()-1
            except:
                out[:] = np.nan
    """

    def Return_On_Total_Invest_Capital():
        return or_.roic.latest

    def Market_Cap():
        return v.market_cap.latest

    class Mean_Reversion_1M(CustomFactor):
        """Z-score of the latest 21-day return against its one-year history."""
        inputs = [Returns(window_length=21)]
        window_length = 252

        def compute(self, today, assets, out, monthly_rets):
            out[:] = (monthly_rets[-1] - np.nanmean(monthly_rets, axis=0)) / \
                np.nanstd(monthly_rets, axis=0)

    class MACD_Signal_10d(CustomFactor):
        """Latest value of the MACD signal line (12/26 periods, 10-period signal)."""
        inputs = [USEquityPricing.close]
        window_length = 60

        def compute(self, today, assets, out, close):

            sig_lines = []

            # TA-Lib works on one price series at a time, hence the column loop.
            for col in close.T:
                try:
                    _, signal_line, _ = talib.MACD(col, fastperiod=12,
                                                   slowperiod=26, signalperiod=10)
                    sig_lines.append(signal_line[-1])
                # Best effort: an asset whose series cannot be processed gets
                # NaN. KeyboardInterrupt/SystemExit are no longer swallowed.
                except Exception:
                    sig_lines.append(np.nan)
            out[:] = sig_lines

    class Moneyflow_Volume_5d(CustomFactor):
        """Signed dollar volume over five days divided by total dollar volume.

        Each day's price * volume counts positively when the close rose
        versus the previous close and negatively otherwise.
        """
        inputs = [USEquityPricing.close, USEquityPricing.volume]
        # Five signed days need one extra leading row for the first day's
        # previous close. The earlier 5-row version compared day 0 with the
        # *last* day through a negative index (col_c[-1]).
        window_length = 6

        def compute(self, today, assets, out, close, volume):
            flows = close[1:] * volume[1:]
            rose = close[1:] > close[:-1]
            signed_flows = np.where(rose, flows, -flows)
            # Plain sums on purpose: a NaN in the window still yields NaN.
            out[:] = np.sum(signed_flows, axis=0) / np.sum(flows, axis=0)

    def Net_Income_Margin():
        return or_.net_margin.latest

    def Operating_Cashflows_To_Assets():
        return (cfs.operating_cash_flow.latest * 4.) / \
            bs.total_assets.latest

    def Price_Momentum_3M():
        return Returns(window_length=63)

    class Price_Oscillator(CustomFactor):
        """Mean close of the last 20 rows relative to the full-window mean, minus 1."""
        inputs = [USEquityPricing.close]
        window_length = 252

        def compute(self, today, assets, out, close):
            four_week_period = close[-20:]
            out[:] = (np.nanmean(four_week_period, axis=0) /
                      np.nanmean(close, axis=0)) - 1.

    def Returns_39W():
        return Returns(window_length=215)

    class Trendline(CustomFactor):
        """Slope of a least-squares line fitted to the closes in the window."""
        inputs = [USEquityPricing.close]
        window_length = 252

        # using MLE for speed
        def compute(self, today, assets, out, close):

            # prepare X matrix (x_is - x_bar); an explicit array keeps the
            # arithmetic independent of how `range` behaves
            X = np.arange(self.window_length)
            X_bar = np.nanmean(X)
            X_vector = X - X_bar
            X_matrix = np.tile(X_vector, (len(close.T), 1)).T

            # prepare Y matrix (y_is - y_bar)
            Y_bar = np.nanmean(close, axis=0)
            Y_bars = np.tile(Y_bar, (self.window_length, 1))
            Y_matrix = close - Y_bars

            # prepare variance of X
            X_var = np.nanvar(X)

            # multiply X matrix and Y matrix and sum (dot product)
            # then divide by variance of X
            # this gives the MLE of Beta
            out[:] = (np.sum((X_matrix * Y_matrix), axis=0) / X_var) / \
                (self.window_length)

    class Vol_3M(CustomFactor):
        """Standard deviation of one-day returns over 63 rows."""
        inputs = [Returns(window_length=2)]
        window_length = 63

        def compute(self, today, assets, out, rets):
            out[:] = np.nanstd(rets, axis=0)

    def Working_Capital_To_Assets():
        return bs.working_capital.latest / bs.total_assets.latest

    class CCI(CustomFactor):
        """Latest Commodity Channel Index value computed by TA-Lib."""
        inputs = [USEquityPricing.high, USEquityPricing.low, USEquityPricing.close]
        window_length = 20

        def compute(self, today, assets, out, high, low, close):
            # All three series feed the indicator, so a NaN in any of them
            # disqualifies the asset (previously only `high` was checked).
            anynan = (np.isnan(high) | np.isnan(low) | np.isnan(close)).any(axis=0)

            # In general, it's a bad practice to iterate over numpy arrays like this in pure
            # python. Unfortunately, TALib doesn't provide us with an API to vectorize
            # operations over 2D arrays, so we're stuck with doing this.
            for col_ix, have_nans in enumerate(anynan):

                # If we have nans in the input (e.g., because an asset didn't trade for a
                # full day, or because the asset hasn't existed for the whole window),
                # just forward the NaN.
                if have_nans:
                    out[col_ix] = np.nan
                    continue

                results = talib.CCI(
                    high[:, col_ix],
                    low[:, col_ix],
                    close[:, col_ix],
                    self.window_length)

                out[col_ix] = results[-1]

    class MFI(CustomFactor):
        """
        Money Flow Index
        Volume Indicator
        **Default Inputs:**  USEquityPricing.high, USEquityPricing.low, USEquityPricing.close, USEquityPricing.volume
        **Default Window Length:** 15 (14 + 1-day for difference in prices)
        http://www.fmlabs.com/reference/default.htm?url=MoneyFlowIndex.htm
        """

        inputs = [USEquityPricing.high, USEquityPricing.low, USEquityPricing.close, USEquityPricing.volume]
        window_length = 15

        def compute(self, today, assets, out, high, low, close, vol):

            # calculate typical price
            typical_price = (high + low + close) / 3.

            # calculate money flow of typical price
            money_flow = typical_price * vol

            # get differences in daily typical prices; row 0 is dropped
            # because np.roll wraps the last row around to it
            tprice_diff = (typical_price - np.roll(typical_price, 1, axis=0))[1:]

            # create masked arrays for positive and negative money flow
            pos_money_flow = np.ma.masked_array(money_flow[1:], tprice_diff < 0, fill_value = 0.)
            neg_money_flow = np.ma.masked_array(money_flow[1:], tprice_diff > 0, fill_value = 0.)

            # calculate money ratio
            money_ratio = np.sum(pos_money_flow, axis=0) / np.sum(neg_money_flow, axis=0)

            # MFI
            out[:] = 100. - (100. / (1. + money_ratio))

    all_factors = {
        'APR_ratio': APR,
        'Asset Growth 3M': Asset_Growth_3M,
        'Asset to Equity Ratio': Asset_To_Equity_Ratio,
        'Capex to Cashflows': Capex_To_Cashflows,
        'Downside Volatility': Downside_Volatility,
        'EBIT to Assets': EBIT_To_Assets,
        'EBITDA Yield': EBITDA_Yield,
        #'Earnings Quality': Earnings_Quality,
        'CCI': CCI,
        'Marketcap' : Market_Cap,
        'MACD Signal Line': MACD_Signal_10d,
        'Mean Reversion 1M': Mean_Reversion_1M,
        'Moneyflow Volume 5D': Moneyflow_Volume_5d,
        'Net Income Margin': Net_Income_Margin,
        'Operating Cashflows to Assets': Operating_Cashflows_To_Assets,
        'Price Momentum 3M': Price_Momentum_3M,
        'Price Oscillator': Price_Oscillator,
        'Return on Invest Capital': Return_On_Total_Invest_Capital,
        '39 Week Returns': Returns_39W,
        'Trendline': Trendline,
        'Vol 3M': Vol_3M,
        'Working Capital to Assets': Working_Capital_To_Assets,
        'Money Flow Index' : MFI
    }

    return all_factors

<a id="universe"></a>

## Define universe and select factors to use
We will screen our universe using the new [Q1500US](https://www.quantopian.com/posts/the-q500us-and-q1500us) and hand-pick a few alphas from the list above. We encourage you to play around with the factors.

In [None]:
# Filter used to screen the pipeline and to mask the factor ranks.
universe = Q1500US()

# Name -> factor-constructor mapping returned by make_factors().
factors = make_factors()

## Define and build the pipeline
Next we have to build the pipeline. In addition to the factors defined above, we need the forward returns we want to predict. In this Notebook we will predict `n_fwd_days`-day returns (set to 10 days in the next cell; the pipeline function itself defaults to 5) and train our model on daily data. You can also subsample the data, e.g. to weekly, so that return periods do not overlap, but we omit this here.

In [None]:
# Return horizon in rows of daily data. It is passed to the history pipeline
# as the 'Returns' window and reused as the shift that builds 'Future Returns'.
n_fwd_days = 10 # number of days to compute returns over

In [None]:
def make_history_pipeline(factors, universe, n_fwd_days=5):
    """Assemble the research Pipeline: ranked alphas plus auxiliary columns.

    factors    -- dict of column name -> zero-argument factor constructor
    universe   -- filter used as the rank mask, the 'Returns' mask and the
                  pipeline screen
    n_fwd_days -- window length of the 'Returns' column

    Returns the configured Pipeline.
    """
    close_input = [USEquityPricing.close]
    volume_input = [USEquityPricing.volume]

    # Instantiate every alpha and rank it within the universe.
    columns = dict(
        (label, build().rank(mask=universe))
        for label, build in factors.items()
    )

    # Exponentially weighted averages of the CCI rank (same span, two windows).
    for label, n_days in (('CCI EMA 10-day', 10), ('CCI EMA 20-day', 20)):
        columns[label] = EWMA.from_span(
            inputs=[columns['CCI']], window_length=n_days, span=15)

    # Cumulative returns over the last n_fwd_days days; shifted later on.
    columns['Returns'] = Returns(
        inputs=close_input, mask=universe, window_length=n_fwd_days)

    for label, n_days in (('RSI 7-Day', 7), ('RSI 14-Day', 14)):
        columns[label] = RSI(inputs=close_input, window_length=n_days)

    for label, n_days in (('VWAP 10-Day', 10), ('VWAP 30-Day', 30)):
        columns[label] = VWAP(window_length=n_days)

    # Average traded volume over several windows, per share outstanding.
    turnover_windows = (
        ('Turnover 5d', 5),
        ('Turnover 10d', 10),
        ('Turnover 20d', 20),
        ('Turnover 120d', 120),
        ('Turnover 240d', 240),
    )
    for label, n_days in turnover_windows:
        average_volume = SimpleMovingAverage(
            inputs=volume_input, window_length=n_days)
        columns[label] = average_volume / v.shares_outstanding

    return Pipeline(screen=universe, columns=columns)

In [None]:
# Build the research pipeline, using the notebook-level return horizon.
history_pipe = make_history_pipeline(factors, universe, n_fwd_days=n_fwd_days)

## Run the pipeline

In [None]:
# Run the pipeline over a fixed date range and time how long it takes.
start_timer = time()
start = pd.Timestamp("2020-06-01")
end = pd.Timestamp("2020-09-04")
results = run_pipeline(history_pipe, start_date=start, end_date=end)
# Name the two index levels so later cells can refer to 'date' and 'security'.
results.index.names = ['date', 'security']
end_timer = time()

In [None]:
# Parenthesised so the line is valid both as a Python 2 print statement and as
# a Python 3 print() call, like the other print(...) calls in this notebook.
print("Time to run pipeline %.2f secs" % (end_timer - start_timer))

In [None]:
# Per security (index level 1), copy the 'Returns' value found n_fwd_days rows
# later onto the current row. The last n_fwd_days rows of each security have
# nothing to copy and become NaN.
results["Future Returns"] = results.groupby(level=1)['Returns'].shift(-n_fwd_days)
results.head(100)

In [None]:
# Show the 'Returns' column for the rows whose security symbol is AAPL.
results[results.index.get_level_values(1).map(lambda x: x.symbol == 'AAPL')]['Returns']

In [None]:
# Binary target: 1 where the forward return is above 2%, otherwise 0. A NaN
# forward return compares False and is therefore labelled 0.
results['Prediction'] = (results["Future Returns"] > 0.02).astype(int)

In [None]:
# Preview the first rows of the results frame.
results.head()

In [None]:
# Preview the last rows of the results frame.
results.tail()

In [None]:
# Report the frame size, discard every row that holds at least one NaN, then
# report the size again and preview what is left.
print('Before NaN drop we have {} rows and {} columns'.format(*results.shape))

results = results.dropna()

print('After NaN drop we have {} rows and {} columns'.format(*results.shape))

results.head()

In [None]:
# Feature matrix: every column except the label and the two return columns.
X = results.drop(['Future Returns', 'Returns', 'Prediction'], axis=1)
# Target vector: the binary 'Prediction' label.
y = results['Prediction']

In [None]:
from sklearn.cross_validation import train_test_split

In [None]:
# Split chronologically instead of at random. The labels are n_fwd_days-ahead
# returns on daily rows, so a shuffled split puts rows with overlapping return
# windows into both sets and leaks test information into training.
# The earliest ~70% of dates train the model and the latest ~30% test it. The
# last n_fwd_days training dates are dropped so that no training label window
# reaches into the test period.
_row_dates = X.index.get_level_values(0).values
_unique_dates = np.unique(_row_dates)  # sorted ascending
_first_test_pos = int(len(_unique_dates) * 0.7)
_train_end_pos = _first_test_pos - n_fwd_days
if _train_end_pos <= 0 or _first_test_pos >= len(_unique_dates):
    raise ValueError('Not enough dates for a chronological train/test split')

_train_rows = _row_dates < _unique_dates[_train_end_pos]
_test_rows = _row_dates >= _unique_dates[_first_test_pos]

# Copies, so later column assignments do not write into views of X / y.
X_train, X_test = X[_train_rows].copy(), X[_test_rows].copy()
y_train, y_test = y[_train_rows].copy(), y[_test_rows].copy()

In [None]:
# Row/column counts of the training and test feature frames.
print(X_train.shape, X_test.shape)

In [None]:
#set a seed so that the results are reproducible
np.random.seed(415)

#create the baseline random forest (no hyperparameters passed) and fit it on the training split
model = ensemble.RandomForestClassifier()
model.fit(X_train, y_train)

In [None]:
# Class predictions of the baseline model for the test rows.
predictions = model.predict(X_test)

In [None]:
# Attach the baseline predictions to the test features as a 'Results' column.
# NOTE(review): this adds a non-feature column to X_test in place, while later
# cells pass X_test back to the fitted models — confirm that is intended.
X_test['Results'] = predictions
# Most recent date (index level 0) present in the test set.
last_day = X_test.index.get_level_values(level=0).max()
last_day


In [None]:
# Test rows of the last day that the model predicted as positive.
positive_predicted = X_test.query('(date==@last_day) and (Results == 1)')

# Join only the prediction flag onto the full results so that no feature
# column appears twice. Rows without a positive prediction get NaN.
positive_test = pd.concat([results, positive_predicted[['Results']]], axis=1)

# Judge each positive prediction by the *forward* return and the same 2%
# threshold that defines the 'Prediction' label. The trailing 'Returns'
# column describes the past and says nothing about whether the call was right.
_realized = positive_test.loc[positive_test['Results'] == 1, 'Future Returns']
true_positive_test = _realized[_realized > 0.02]
false_positive_test = _realized[_realized <= 0.02]

In [None]:
# Number of last-day positive predictions classified as true positives.
len(true_positive_test)

In [None]:
# Number of last-day positive predictions classified as false positives.
len(false_positive_test)

In [None]:
# scikit-learn's confusion_matrix puts the TRUE classes on the rows and the
# PREDICTED classes on the columns, so the frame is labelled that way round.
# Pinning labels=[0, 1] keeps the matrix 2x2 even if a class is missing from
# the test split.
cmtx = pd.DataFrame(
    metrics.confusion_matrix(y_test, predictions, labels=[0, 1]),
    index=['true:Downtrend', 'true:UpTrend'],
    columns=['predicted:Downtrend', 'predicted:UpTrend']
)

cmtx

In [None]:
# Accuracy of the baseline predictions against the test labels.
metrics.accuracy_score(y_true = y_test, y_pred = predictions)

In [None]:
# Class probabilities for the test rows, one column per class.
# Only the training feature columns are passed: X_test may have gained a
# non-feature 'Results' column, and the fitted model rejects input whose
# column count differs from the training data.
y_score = model.predict_proba(X_test[X.columns])
y_score

In [None]:
# Keep only the second probability column, i.e. the positive class (label 1)
# when both classes were present in training.
# Only the training feature columns are passed: X_test may have gained a
# non-feature 'Results' column that the fitted model would reject.
y_score = model.predict_proba(X_test[X.columns])[:,1]
y_score

In [None]:
#calculate false positive rate, true positive rate, and thresholds from the test labels and y_score
fpr, tpr, thresholds = metrics.roc_curve(y_true = y_test, y_score = y_score)

In [None]:
#calculate the area under the ROC curve from the fpr/tpr points
roc_auc = metrics.auc(fpr, tpr)

In [None]:
# Draw the ROC curve with its AUC in the legend, plus the diagonal of a
# no-skill classifier, on the current axes.
roc_axes = plt.gca()
roc_axes.set_title('Receiver Operating Characteristic')
roc_axes.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
roc_axes.legend(loc='lower right')
roc_axes.plot([0, 1], [0, 1], 'r--')
roc_axes.set_xlim([0, 1])
roc_axes.set_ylim([0, 1])
roc_axes.set_ylabel('True Positive Rate')
roc_axes.set_xlabel('False Positive Rate')
plt.show()

In [None]:
#calculate precision of the baseline predictions against the test labels
metrics.precision_score(y_true=y_test, y_pred=predictions)

In [None]:
#calculate recall of the baseline predictions against the test labels
metrics.recall_score(y_true=y_test, y_pred=predictions)

In [None]:
#calculate F1 score of the baseline predictions against the test labels
metrics.f1_score(y_true=y_test, y_pred=predictions)

In [None]:
#collect the feature names (columns of X) and the feature importances of the baseline model
features = X.columns
importances = list(model.feature_importances_)

In [None]:
# Pair each feature name with its importance (rounded to 2 decimals) and order
# the pairs from most to least important.
feature_importance = sorted(
    zip(features, np.round(importances, 2)),
    key=lambda pair: pair[1],
    reverse=True
)
feature_importance

In [None]:
# Bar chart of the feature importances, most important first.
# Split the (name, importance) pairs into two parallel tuples in one pass.
features_ranked, feat_imp_ranked = zip(*feature_importance)

fig = plt.figure()

x_pos = list(range(len(features_ranked)))

plt.bar(x_pos, feat_imp_ranked);
plt.xlabel('Analysis', fontsize = 18)
plt.ylabel('Feature Importance', fontsize = 18)
plt.yticks(fontsize = 14)

# Set tick positions, labels and label styling in a single call. Styling
# applied before the labels are assigned is not guaranteed to carry over to
# the new labels.
plt.xticks(x_pos, features_ranked, fontsize = 14, rotation=75)

In [None]:
# Candidate hyperparameter values for the randomized search.

# Tree counts: 200 evenly spaced values from 1000 to 3000, truncated to ints.
n_estimators = np.linspace(start=1000, stop=3000, num=200).astype(int).tolist()
# Ways of choosing how many features are considered at every split.
max_features = ['auto', 'sqrt', 'log2']
# Maximum tree depth: 10, 20, ..., 110, plus None for "no limit".
max_depth = np.linspace(10, 110, num=11).astype(int).tolist() + [None]
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Parameter name -> candidate values.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap
)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
# NOTE: this rebinds `model` to a fresh, unfitted classifier.
from sklearn import grid_search
np.random.seed(415)
model = ensemble.RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation,
# sampling 10 combinations from the grid (n_iter = 10), scored by accuracy,
# and using all available cores (n_jobs = -1)
model_rgs = grid_search.RandomizedSearchCV(estimator = model, param_distributions = random_grid, scoring = 'accuracy',
                               n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
model_rgs.fit(X_train, y_train)

In [None]:
#look at the best parameter combination found by the randomized search
model_rgs.best_params_

In [None]:
# Create a model from hyperparameters taken from the randomized search above.
# The values are hard-coded, so they must be updated by hand if the search
# reports a different best combination.
np.random.seed(415)
model_tuned = ensemble.RandomForestClassifier(n_estimators=1603, max_depth=10, max_features='auto',
                               min_samples_leaf=2, min_samples_split=5, bootstrap=True)
model_tuned.fit(X_train, y_train)

# Only the training feature columns are passed: X_test may have gained a
# non-feature 'Results' column, and the fitted model rejects input whose
# column count differs from the training data.
predictions_tuned = model_tuned.predict(X_test[X.columns])

In [None]:
# Precision, recall and accuracy of the tuned model on the test split,
# printed on a single line.
precision_rgs = metrics.precision_score(y_test, predictions_tuned)
recall_rgs = metrics.recall_score(y_test, predictions_tuned)
accuracy_rgs = metrics.accuracy_score(y_test, predictions_tuned)
print(''.join([
    'Randomized Grid Search Metrics: Precision: ', str(precision_rgs),
    ' Recall: ', str(recall_rgs),
    ' Accuracy: ', str(accuracy_rgs)
]))

In [None]:
#collect the feature names (columns of X) and the feature importances of the tuned model
features = X.columns
importances = list(model_tuned.feature_importances_)

In [None]:
# Pair each feature name with its importance (rounded to 2 decimals) and order
# the pairs from most to least important.
feature_importance = sorted(
    zip(features, np.round(importances, 2)),
    key=lambda pair: pair[1],
    reverse=True
)
feature_importance

In [None]:
# Bar chart of the feature importances, most important first.
# Split the (name, importance) pairs into two parallel tuples in one pass.
features_ranked, feat_imp_ranked = zip(*feature_importance)

fig = plt.figure()

x_pos = list(range(len(features_ranked)))

plt.bar(x_pos, feat_imp_ranked);
plt.xlabel('Analysis', fontsize = 18)
plt.ylabel('Feature Importance', fontsize = 18)
plt.yticks(fontsize = 14)

# Set tick positions, labels and label styling in a single call. Styling
# applied before the labels are assigned is not guaranteed to carry over to
# the new labels.
plt.xticks(x_pos, features_ranked, fontsize = 14, rotation=75)

In [None]:
# ROC inputs for the TUNED model. The scores are recomputed from model_tuned;
# reusing the earlier `y_score` would just redraw the baseline model's curve.
# Only the training feature columns are passed, in case X_test carries a
# non-feature 'Results' column.
y_score_tuned = model_tuned.predict_proba(X_test[X.columns])[:, 1]

#calculate false positive rate, true positive rate, and thresholds
fpr, tpr, thresholds = metrics.roc_curve(y_true = y_test, y_score = y_score_tuned)

In [None]:
#calculate the area under the ROC curve from the fpr/tpr points
roc_auc = metrics.auc(fpr, tpr)

In [None]:
# Draw the ROC curve with its AUC in the legend, plus the diagonal of a
# no-skill classifier, on the current axes.
roc_axes = plt.gca()
roc_axes.set_title('Receiver Operating Characteristic')
roc_axes.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
roc_axes.legend(loc='lower right')
roc_axes.plot([0, 1], [0, 1], 'r--')
roc_axes.set_xlim([0, 1])
roc_axes.set_ylim([0, 1])
roc_axes.set_ylabel('True Positive Rate')
roc_axes.set_xlabel('False Positive Rate')
plt.show()

## Using Machine Learning in a Pipeline

In [None]:
class RandomForestPredict(CustomFactor):
    """Pipeline factor that scores every asset with a random-forest classifier.

    The first pipeline input is treated as the returns history and all
    remaining inputs as alpha histories. The classifier is refitted on the
    first call and on every Monday; every call then writes the predicted
    probability of the positive class for the most recent factor values.

    NOTE(review): this class relies on the module-level helpers
    ``shift_mask_data`` and ``get_last_values`` and on the global
    ``n_fwd_days``. The two helpers are not defined in the visible part of
    this notebook -- confirm they are available before using the factor.
    """
    init = False

    def compute(self, today, assets, out, returns, *inputs):
        # inputs is a list of factors, for example, assume we have 2 alpha signals, 3 stocks,
        # and a lookback of 2 days. Each element in the inputs list will be data of
        # one signal, so len(inputs) == 2. Then each element will contain a 2-D array
        # of shape [time x stocks]. For example:
        # inputs[0]:
        # [[1, 3, 2], # factor 1 rankings of day t-1 for 3 stocks
        #  [3, 2, 1]] # factor 1 rankings of day t for 3 stocks
        # inputs[1]:
        # [[2, 3, 1], # factor 2 rankings of day t-1 for 3 stocks
        #  [1, 2, 3]] # factor 2 rankings of day t for 3 stocks

        if (not self.init) or (today.weekday() == 0): # Monday
            # Create the preprocessing objects and the classifier that the
            # rest of this method uses. The model is trained on the pipeline
            # inputs below, not on the notebook's X_train / y_train globals.
            self.imputer = preprocessing.Imputer()
            self.scaler = preprocessing.MinMaxScaler()
            self.clf = ensemble.RandomForestClassifier()

            # Stack factor rankings
            X = np.dstack(inputs) # (time, stocks, factors)
            Y = returns # (time, stocks)

            # Shift data to match with future returns and binarize the returns
            X, Y = shift_mask_data(X, Y, n_fwd_days=n_fwd_days)

            X = self.imputer.fit_transform(X)
            X = self.scaler.fit_transform(X)

            # Fit the classifier
            self.clf.fit(X, Y)

            self.init = True

        # Predict
        # Get most recent factor values (inputs always has the full history)
        last_factor_values = get_last_values(inputs)
        last_factor_values = self.imputer.transform(last_factor_values)
        last_factor_values = self.scaler.transform(last_factor_values)

        # Predict the probability for each stock going up
        # (column 2 of the output of .predict_proba()) and
        # return it via assignment to out.
        out[:] = self.clf.predict_proba(last_factor_values)[:, 1]

In [None]:
def make_pipeline(n_forward_days=10):
    """Build a Pipeline of ranked alphas plus returns, RSI and VWAP columns.

    n_forward_days -- window length of the 'Returns' column. The argument is
                      now honoured; previously the body silently read the
                      notebook-level ``n_fwd_days`` instead.

    Returns the Pipeline, screened by the Q1500US universe.
    """
    universe = Q1500US()
    factors = make_factors()

    # Call .rank() on all factors and mask out the universe
    factor_ranks = {name: f().rank(mask=universe) for name, f in factors.items()}

    # Get cumulative returns over the last n_forward_days days. We will later shift these.
    factor_ranks['Returns'] = Returns(inputs=[USEquityPricing.close],
                                      mask=universe, window_length=n_forward_days)

    factor_ranks['RSI 7-Day'] = RSI(inputs=[USEquityPricing.close], window_length=7)
    factor_ranks['RSI 14-Day'] = RSI(inputs=[USEquityPricing.close], window_length=14)
    factor_ranks['VWAP 10-Day'] = VWAP(window_length=10)
    factor_ranks['VWAP 30-Day'] = VWAP(window_length=30)

    return Pipeline(screen=universe, columns=factor_ranks)
    

<a></a>