In [1]:
from quantopian.research import run_pipeline
from quantopian.pipeline import Pipeline, CustomFilter, CustomFactor
from quantopian.pipeline.data import Fundamentals 
from quantopian.pipeline.data import factset
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.classifiers.fundamentals import Sector  
from quantopian.pipeline.classifiers.morningstar import Sector 
from quantopian.pipeline.filters import QTradableStocksUS, Q1500US, Q500US
from quantopian.pipeline.filters.eventvestor import IsAnnouncedAcqTarget
from quantopian.pipeline.data.psychsignal import stocktwits
from quantopian.pipeline.data.psychsignal import aggregated_twitter_withretweets_stocktwits as st
from quantopian.pipeline.data.zacks import EarningsSurprises
from quantopian.pipeline.data import morningstar
from quantopian.pipeline.factors import Latest
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats.mstats import winsorize
from zipline.utils.numpy_utils import ( repeat_first_axis, repeat_last_axis )

In [2]:
from quantopian.pipeline.factors import ( CustomFactor, BusinessDaysSincePreviousEvent, 
                                         BusinessDaysUntilNextEvent, SimpleMovingAverage, 
                                         AverageDollarVolume, Returns, RSI, 
                                         RollingLinearRegressionOfReturns, RollingSpearmanOfReturns, 
                                         AnnualizedVolatility, Returns, DailyReturns, EWMA)

In [3]:
import talib
import pandas as pd
import numpy as np
from time import time

In [4]:
import alphalens as al
import pyfolio as pf
from scipy import stats
import scipy
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.svm import SVC, OneClassSVM, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model, decomposition, ensemble, preprocessing, isotonic, metrics
from sklearn.metrics import classification_report
from scipy.stats.mstats import gmean
from sklearn.cluster import SpectralClustering
from collections import Counter

In [5]:
# Fraction clipped from each tail by winsorize() inside preprocess().
WIN_LIMIT = 0.0
# Number of lagged snapshots (0 .. N-1 rows back) kept per factor in the pipelines.
N_FACTOR_WINDOW = 5 
# Number of clusters requested from SpectralClustering when grouping factors.
N_CLUSTERS = 5
# Time constant that ALPHA_SMOOTH is derived from.
TAU = 5
# Smoothing weight 1 - exp(-1/TAU), used when blending combined_alpha.
ALPHA_SMOOTH = 1-np.exp(-1.0/TAU)

In [6]:
def preprocess(a):
    """Standardise one cross-section of raw factor values.

    Steps: mask non-finite entries, subtract the NaN-aware mean, replace
    the remaining NaNs with 0, winsorize both tails by ``WIN_LIMIT`` and
    finally scale with ``sklearn.preprocessing.scale``.

    Parameters
    ----------
    a : array-like
        Raw factor values (ndarray, list or pandas Series).  The input is
        copied and never modified.

    Returns
    -------
    The result of ``preprocessing.scale`` on the cleaned values.
    """
    # Work on a float copy so list/int/bool inputs behave like float arrays
    # and the caller's data is left untouched.
    a = np.array(a, dtype=np.float64)

    # +/-inf (e.g. from a ratio with a zero denominator) would make nanmean
    # infinite and wreck every other value, so treat it as missing.
    a[~np.isfinite(a)] = np.nan

    # Demean; entries that were missing end up at the mean (0).
    a = np.nan_to_num(a - np.nanmean(a))

    a = winsorize(a, limits=[WIN_LIMIT, WIN_LIMIT])

    return preprocessing.scale(a)

In [7]:
def normalize(x):
    """Demean ``x`` and rescale it so the absolute values sum to one.

    ``x`` must be a pandas object (it needs ``.mean()`` and ``.abs()``).
    """
    demeaned = x.sub(x.mean())
    return demeaned.div(demeaned.abs().sum())

In [8]:
def _slope(ts):
    x = np.arange(len(ts))  
    log_ts = np.log(ts)  
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, log_ts)  
    annualized_slope = (np.power(np.exp(slope), 250) - 1) 
    return annualized_slope * (r_value ** 2) 

In [9]:
# Integer sector code -> display name.  Used to build the 'Sector_Name'
# column and passed to alphalens as groupby_labels.
MORNINGSTAR_SECTOR_CODES = {
     -1: 'Misc',
    101: 'Basic Materials',
    102: 'Consumer Cyclical',
    103: 'Financial Services',
    104: 'Real Estate',
    205: 'Consumer Defensive',
    206: 'Healthcare',
    207: 'Utilities',
    308: 'Communication Services',
    309: 'Energy',
    310: 'Industrials',
    311: 'Technology' ,    
}

In [10]:
# Short aliases for the Morningstar fundamentals tables and the Zacks
# EarningsSurprises dataset; the factor classes refer to some of these
# (bs, cfs, is_, v).
bs = morningstar.balance_sheet
cfs = morningstar.cash_flow_statement
is_ = morningstar.income_statement
or_ = morningstar.operation_ratios
er = morningstar.earnings_report
v = morningstar.valuation
vr = morningstar.valuation_ratios
es = EarningsSurprises

In [11]:
# Pipeline filter: true only where the latest value of every listed
# FactSet / Morningstar fundamentals field is non-null.
has_data = (factset.Fundamentals.capex_assets_qf.latest.notnull()
            & factset.Fundamentals.zscore_qf.latest.notnull()
            & factset.Fundamentals.assets.latest.notnull()
            & Fundamentals.long_term_debt.latest.notnull()
            & Fundamentals.current_debt.latest.notnull()
            & Fundamentals.cash_and_cash_equivalents.latest.notnull()
            & Fundamentals.growth_score.latest.notnull()
)

In [12]:
# Despite the name this is a boolean filter: latest market cap above 1e8.
market_cap = Fundamentals.market_cap.latest > 1e8 # Market_Cap over 100mil
# Top decile (90th-100th percentile) of 21-day average dollar volume.
is_liquid = AverageDollarVolume(window_length=21).percentile_between(90, 100)    

In [13]:
# Combined universe filter: QTradableStocksUS members that are liquid, have
# the required fundamentals and pass the market-cap threshold.
# NOTE(review): the factor_pipeline() definitions in this notebook screen on
# QTradableStocksUS() directly and do not reference this filter - confirm
# whether it was meant to be used.
is_tradeable = (QTradableStocksUS()
                & is_liquid
                & has_data
                & market_cap)

In [14]:
def make_factors():
    """Build the first batch of alpha-factor classes.

    Every class defined here is a ``CustomFactor`` whose ``compute`` writes
    one ``preprocess``-ed value per asset into ``out``.

    Note: a later cell of this notebook defines another ``make_factors``;
    once that cell has run, this definition is shadowed.

    Returns
    -------
    list
        The factor classes (not instances), in the order in which
        ``factor_pipeline`` enumerates them.
    """

    class MessageSum(CustomFactor):
        """Summed (high-low)/close times summed total*(bear-bull) messages."""
        inputs = [USEquityPricing.high, USEquityPricing.low, USEquityPricing.close,
                  stocktwits.bull_scored_messages, stocktwits.bear_scored_messages,
                  stocktwits.total_scanned_messages]
        window_length = 21
        window_safe = True

        def compute(self, today, assets, out, high, low, close, bull, bear, total):
            # Named price_range rather than `v`, which is also the
            # module-level alias for morningstar.valuation.
            price_range = np.nansum((high - low) / close, axis=0)
            out[:] = preprocess(price_range * np.nansum(total * (bear - bull), axis=0))

    class fcf(CustomFactor):
        """Latest free-cash-flow yield, with NaNs set to 0 before preprocessing."""
        inputs = [Fundamentals.fcf_yield]
        window_length = 1
        window_safe = True

        def compute(self, today, assets, out, fcf_yield):
            out[:] = preprocess(np.nan_to_num(fcf_yield[-1, :]))

    class mean_rev(CustomFactor):
        """Score built from trailing 10..30-row mean prices relative to the latest price."""
        inputs = [USEquityPricing.high, USEquityPricing.low, USEquityPricing.close]
        window_length = 30
        window_safe = True

        def compute(self, today, assets, out, high, low, close):
            # Average of high, low and close per row and asset.
            p = (high + low + close) / 3

            n, m = close.shape  # n rows in the window, m assets

            b = np.zeros(m)
            a = np.zeros(m)

            for k in range(10, n + 1):
                # Trailing k-row mean price relative to the latest price,
                # weighted by its cross-sectional sum ...
                price_rel = np.nanmean(p[-k:, :], axis=0) / p[-1, :]
                wt = np.nansum(price_rel)
                b += wt * price_rel
                # ... and the same for the inverse ratio.
                price_rel = 1.0 / price_rel
                wt = np.nansum(price_rel)
                a += wt * price_rel

            out[:] = preprocess(b - a)

    class volatility(CustomFactor):
        """Negated product of summed volume and summed |high-low|/close."""
        inputs = [USEquityPricing.high, USEquityPricing.low,
                  USEquityPricing.close, USEquityPricing.volume]
        window_length = 5
        window_safe = True

        def compute(self, today, assets, out, high, low, close, volume):
            vol = (np.nansum(volume, axis=0)
                   * np.nansum(np.absolute((high - low) / close), axis=0))
            out[:] = preprocess(-vol)

    class growthscore(CustomFactor):
        """Latest Fundamentals.growth_score."""
        inputs = [Fundamentals.growth_score]
        window_length = 1
        window_safe = True

        def compute(self, today, assets, out, growth_score):
            out[:] = preprocess(growth_score[-1, :])

    class MoneyflowVolume5d(CustomFactor):
        """Negated signed dollar volume over total dollar volume (last 5 rows)."""
        inputs = (USEquityPricing.close, USEquityPricing.volume)
        # One extra row so that each of the 5 rows has a previous close.
        window_length = 6
        window_safe = True

        def compute(self, today, assets, out, close_extra, volume_extra):
            close = close_extra[1:]
            volume = volume_extra[1:]

            dollar_volume = close * volume
            denominator = dollar_volume.sum(axis=0)

            # +1 when the close rose versus the previous row, otherwise -1.
            difference = np.diff(close_extra, axis=0)
            direction = np.where(difference > 0, 1, -1)
            numerator = (direction * dollar_volume).sum(axis=0)

            out[:] = preprocess(-np.divide(numerator, denominator))

    class Trendline(CustomFactor):
        """Negated regression-style slope of close against the row index."""
        inputs = [USEquityPricing.close]
        window_length = 252
        window_safe = True
        _x = np.arange(window_length)
        _x_var = np.var(_x)

        def compute(self, today, assets, out, close):
            # Divide by 2.0 so the midpoint is not truncated by Python 2
            # integer division.
            x_matrix = repeat_last_axis(
                (self.window_length - 1) / 2.0 - self._x,
                len(assets),
            )

            y_bar = np.nanmean(close, axis=0)
            y_bars = repeat_first_axis(y_bar, self.window_length)
            y_matrix = close - y_bars

            out[:] = preprocess(-np.divide(
                (x_matrix * y_matrix).sum(axis=0) / self._x_var,
                self.window_length
            ))

    class SalesGrowth(CustomFactor):
        """Latest sales_gr_qf after scaling each asset's own 2*252-row history."""
        inputs = [factset.Fundamentals.sales_gr_qf]
        window_length = 2 * 252
        window_safe = True

        def compute(self, today, assets, out, sales_growth):
            sales_growth = np.nan_to_num(sales_growth)
            # Standardise down the time axis, per asset column.
            sales_growth = preprocessing.scale(sales_growth, axis=0)
            out[:] = preprocess(sales_growth[-1])

    class GrossMarginChange(CustomFactor):
        """Latest ebit_oper_mgn_qf after scaling each asset's own 2*252-row history."""
        window_length = 2 * 252
        inputs = [factset.Fundamentals.ebit_oper_mgn_qf]
        window_safe = True

        def compute(self, today, assets, out, ebit_oper_mgn):
            ebit_oper_mgn = np.nan_to_num(ebit_oper_mgn)
            ebit_oper_mgn = preprocessing.scale(ebit_oper_mgn, axis=0)
            out[:] = preprocess(ebit_oper_mgn[-1])

    class Gross_Income_Margin(CustomFactor):
        """Negated (revenue - cost of revenue) / revenue."""
        inputs = [Fundamentals.cost_of_revenue, Fundamentals.total_revenue]
        window_length = 1
        window_safe = True

        def compute(self, today, assets, out, cost_of_revenue, sales):
            # sales/sales is kept on purpose: it is NaN where sales is 0 or
            # NaN, which turns the whole expression into NaN instead of inf.
            gross_income_margin = sales[-1] / sales[-1] - cost_of_revenue[-1] / sales[-1]
            out[:] = preprocess(-gross_income_margin)

    class CapEx_Vol(CustomFactor):
        """Negated peak-to-peak range of capex_assets_qf over the window."""
        inputs = [factset.Fundamentals.capex_assets_qf]
        window_length = 2 * 252
        window_safe = True

        def compute(self, today, assets, out, capex_assets):
            out[:] = preprocess(-np.ptp(capex_assets, axis=0))

    class fcf_ev(CustomFactor):
        """fcf_per_share * shares_outstanding / enterprise_value."""
        inputs = [
            Fundamentals.fcf_per_share,
            Fundamentals.shares_outstanding,
            Fundamentals.enterprise_value,
        ]
        window_length = 1
        window_safe = True

        def compute(self, today, assets, out, fcf, shares, ev):
            # Named fcf_to_ev rather than `v` (module alias for
            # morningstar.valuation).
            fcf_to_ev = fcf * shares / ev
            fcf_to_ev[np.isinf(fcf_to_ev)] = np.nan

            out[:] = preprocess(fcf_to_ev[-1])

    class TEM(CustomFactor):
        """Negated std-dev of the last six distinct capex/assets observations."""
        inputs = [factset.Fundamentals.capex_qf_asof_date,
                  factset.Fundamentals.capex_qf,
                  factset.Fundamentals.assets]
        window_length = 390
        window_safe = True

        def compute(self, today, assets, out, asof_date, capex, total_assets):
            values = capex / total_assets
            values[np.isinf(values)] = np.nan
            out_temp = np.zeros_like(values[-1, :])
            for column_ix in range(asof_date.shape[1]):
                # First row index of every distinct as-of date for this asset.
                _, unique_indices = np.unique(asof_date[:, column_ix], return_index=True)
                quarterly_values = values[unique_indices, column_ix]
                if len(quarterly_values) < 6:
                    # Left-pad with NaN so six values are always present;
                    # np.std then yields NaN for such assets.
                    quarterly_values = np.hstack([
                        np.repeat([np.nan], 6 - len(quarterly_values)),
                        quarterly_values,
                    ])

                out_temp[column_ix] = np.std(quarterly_values[-6:])

            out[:] = preprocess(-out_temp)

    class Piotroski(CustomFactor):
        """Sum of nine 0/1 signals comparing the latest and oldest rows of the window."""
        inputs = [
            Fundamentals.roa,
            Fundamentals.operating_cash_flow,
            Fundamentals.cash_flow_from_continuing_operating_activities,
            Fundamentals.long_term_debt_equity_ratio,
            Fundamentals.current_ratio,
            Fundamentals.shares_outstanding,
            Fundamentals.gross_margin,
            Fundamentals.assets_turnover,
        ]

        window_length = 100
        window_safe = True

        def compute(self, today, assets, out, roa, cash_flow, cash_flow_from_ops,
                    long_term_debt_ratio, current_ratio, shares_outstanding,
                    gross_margin, assets_turnover):

            # NOTE(review): the last term compares a cash-flow amount with
            # ROA (a ratio) - confirm whether it should be scaled by assets.
            profit = (
                (roa[-1] > 0).astype(int) +
                (cash_flow[-1] > 0).astype(int) +
                (roa[-1] > roa[0]).astype(int) +
                (cash_flow_from_ops[-1] > roa[-1]).astype(int)
            )

            leverage = (
                (long_term_debt_ratio[-1] < long_term_debt_ratio[0]).astype(int) +
                (current_ratio[-1] > current_ratio[0]).astype(int) +
                (shares_outstanding[-1] <= shares_outstanding[0]).astype(int)
            )

            operating = (
                (gross_margin[-1] > gross_margin[0]).astype(int) +
                (assets_turnover[-1] > assets_turnover[0]).astype(int)
            )

            out[:] = preprocess(profit + leverage + operating)

    class Altman_Z(CustomFactor):
        """Latest factset zscore_qf."""
        inputs = [factset.Fundamentals.zscore_qf]
        window_length = 1
        window_safe = True

        def compute(self, today, assets, out, zscore_qf):
            out[:] = preprocess(zscore_qf[-1])

    class HurstExp(CustomFactor):
        """Negated Hurst-exponent estimate of log close prices."""
        inputs = [USEquityPricing.close]
        window_length = int(252 * 0.5)
        window_safe = True

        def Hurst(self, ts):   #Fast
            """Closed-form least-squares slope of log(tau) on log(lag), times 2."""
            lags = np.arange(2, 20)
            tau = [np.sqrt(np.std(np.subtract(ts[lag:], ts[:-lag]))) for lag in lags]
            n = len(lags)
            x = np.log(lags)
            y = np.log(tau)
            poly = (n * (x * y).sum() - x.sum() * y.sum()) / (n * (x * x).sum() - x.sum() * x.sum())
            hurst_exp = poly * 2.0
            return hurst_exp

        def compute(self, today, assets, out, CLOSE):
            SERIES = np.log(np.nan_to_num(CLOSE))
            # A list (not a bare map()) so numpy always receives a sequence,
            # on Python 3 as well as Python 2.
            hurst_exp_per_asset = [
                self.Hurst(SERIES[:, col_id].flatten())
                for col_id in range(SERIES.shape[1])
            ]
            h = np.nan_to_num(hurst_exp_per_asset)
            out[:] = preprocess(-h)

    class ClenowMomentum(CustomFactor):
        """Per-asset ``_slope`` of the 90-row close series."""
        inputs = [USEquityPricing.close]
        window_length = 90
        window_safe = True

        def compute(self, today, assets, out, close):
            res = []
            for i in range(close.shape[1]):
                res.append(_slope(close[:, i]))
            out[:] = preprocess(res)

    class ItoA(CustomFactor):
        """Negated change in ppe_gross plus inventory, over the oldest assets value."""
        inputs = [factset.Fundamentals.ppe_gross,
                  factset.Fundamentals.inven,
                  factset.Fundamentals.assets]
        window_length = 270
        window_safe = True

        def compute(self, today, assets, out, ppe, inv, ta):
            ppe = np.nan_to_num(ppe)
            inv = np.nan_to_num(inv)
            out[:] = preprocess(-(ppe[-1] - ppe[0] + inv[-1] - inv[0]) / ta[0])

    factors = [
        MessageSum,
        ItoA,
        HurstExp,
        ClenowMomentum,
        fcf,
        mean_rev,
        volatility,
        growthscore,
        MoneyflowVolume5d,
        Trendline,
        SalesGrowth,
        GrossMarginChange,
        Gross_Income_Margin,
        CapEx_Vol,
        fcf_ev,
        TEM,
        Piotroski,
        Altman_Z,
    ]

    return factors

In [15]:
class Factor_N_Days_Ago(CustomFactor):
    """Emit the first (oldest) row of the input factor's lookback window.

    With ``window_length=n`` the output is the input as it was n-1 rows
    before the current one.
    """

    def compute(self, today, assets, out, input_factor):
        oldest_row = input_factor[0]
        out[:] = oldest_row

In [16]:
def factor_pipeline():
    """Build the pipeline of lagged 'alpha1_<k>_<lag>' factor columns.

    For every class returned by ``make_factors`` and every lag in
    ``range(N_FACTOR_WINDOW)`` one ``Factor_N_Days_Ago`` column is added.
    Columns and screen are both restricted to ``QTradableStocksUS``.

    Note: a later cell of this notebook defines another ``factor_pipeline``.
    """
    tradable = QTradableStocksUS()

    lagged_columns = {}
    for factor_idx, factor_cls in enumerate(make_factors()):
        for lag in range(N_FACTOR_WINDOW):
            column_name = 'alpha1_' + str(factor_idx) + '_' + str(lag)
            lagged_columns[column_name] = Factor_N_Days_Ago(
                [factor_cls(mask=tradable)],
                window_length=lag + 1,
                mask=tradable,
            )

    return Pipeline(columns=lagged_columns, screen=tradable)

In [None]:
# Run the first factor pipeline over the sample period and time it.
# `start` and `end` are reused by later cells.
start_timer = time()
start = pd.Timestamp("2010-03-01") 
end = pd.Timestamp("2014-03-01")
data_1 = run_pipeline(factor_pipeline(), start_date=start, end_date=end,chunksize=252)
end_timer = time()

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


In [None]:
# Parenthesised form prints the same text on Python 2 and also parses on Python 3.
print("Time to run pipeline %.2f secs" % (end_timer - start_timer))

In [None]:
# Level 1 of the pipeline result's index; later passed to get_pricing and
# get_factor_loadings as the asset list.
asset_list = data_1.index.levels[1]
num_stocks = len(asset_list)
# Drops rows with any missing factor value, in place.
data_1.dropna(inplace=True)
# Parenthesised form prints the same text on Python 2 and also parses on Python 3.
print('Number of stocks: %d' % num_stocks)
data_1.head()

In [None]:
def make_factors():
    """Build the second batch of alpha-factor classes.

    Every class defined here is a ``CustomFactor`` whose ``compute`` writes
    one ``preprocess``-ed value per asset into ``out``.

    Note: this definition replaces the ``make_factors`` defined earlier in
    this notebook.

    Returns
    -------
    list
        The factor classes (not instances), in the order in which
        ``factor_pipeline`` enumerates them.  ``GM_GROWTH_8YR`` is defined
        below but is not part of the returned list.
    """

    class Quick_Ratio(CustomFactor):
        """Latest factset quick_ratio_qf."""
        inputs = [factset.Fundamentals.quick_ratio_qf]
        window_length = 1
        window_safe = True

        def compute(self, today, assets, out, quick_ratio_qf):
            out[:] = preprocess(quick_ratio_qf[-1])

    class AdvancedMomentum(CustomFactor):
        """Negated (return from row -252 to -21 minus return from -21 to -1) / return std."""
        inputs = (USEquityPricing.close, Returns(window_length=126))
        window_length = 252
        window_safe = True

        def compute(self, today, assets, out, prices, returns):
            # Both terms are simple returns.  The original expression lacked
            # the parentheses around (prices[-1] - prices[-21]), so raw
            # prices were subtracted from a return instead.
            long_term_return = (prices[-21] - prices[-252]) / prices[-252]
            recent_return = (prices[-1] - prices[-21]) / prices[-21]
            am = np.divide(
                long_term_return - recent_return,
                np.nanstd(returns, axis=0)
            )

            out[:] = preprocess(-am)

    class ROA(CustomFactor):
        """1 where the latest ROA is positive, else 0."""
        inputs = [Fundamentals.roa]
        window_length = 1
        window_safe = True

        def compute(self, today, assets, out, roa):
            out[:] = preprocess(np.where(roa[-1] > 0, 1, 0))

    class FCFTA(CustomFactor):
        """1 where free cash flow / total assets is positive, else 0."""
        inputs = [Fundamentals.free_cash_flow,
                  Fundamentals.total_assets]
        window_length = 1
        window_safe = True

        def compute(self, today, assets, out, fcf, ta):
            out[:] = preprocess(np.where(fcf[-1] / ta[-1] > 0, 1, 0))

    class ROA_GROWTH(CustomFactor):
        """1 where ROA is above its value 252 rows earlier, else 0."""
        inputs = [Fundamentals.roa]
        window_length = 252
        window_safe = True

        def compute(self, today, assets, out, roa):
            # Run through preprocess like every other factor in this batch.
            out[:] = preprocess(np.where(roa[-1] > roa[-252], 1, 0))

    class FCFTA_ROA(CustomFactor):
        """1 where free cash flow / total assets exceeds ROA, else 0."""
        inputs = [Fundamentals.free_cash_flow,
                  Fundamentals.total_assets,
                  Fundamentals.roa]
        window_length = 1
        window_safe = True

        def compute(self, today, assets, out, fcf, ta, roa):
            out[:] = preprocess(np.where(fcf[-1] / ta[-1] > roa[-1], 1, 0))

    class FCFTA_GROWTH(CustomFactor):
        """1 where FCF / total assets is above its value 252 rows earlier, else 0."""
        inputs = [Fundamentals.free_cash_flow,
                  Fundamentals.total_assets]
        window_length = 252
        window_safe = True

        def compute(self, today, assets, out, fcf, ta):
            out[:] = preprocess(np.where(fcf[-1] / ta[-1] > fcf[-252] / ta[-252], 1, 0))

    class LTD_GROWTH(CustomFactor):
        """1 where long-term debt / total assets is below its value 252 rows earlier, else 0."""
        inputs = [Fundamentals.total_assets,
                  Fundamentals.long_term_debt]
        window_length = 252
        window_safe = True

        def compute(self, today, assets, out, ta, ltd):
            out[:] = preprocess(np.where(ltd[-1] / ta[-1] < ltd[-252] / ta[-252], 1, 0))

    class CR_GROWTH(CustomFactor):
        """1 where the current ratio is above its value 252 rows earlier, else 0."""
        inputs = [Fundamentals.current_ratio]
        window_length = 252
        window_safe = True

        def compute(self, today, assets, out, cr):
            out[:] = preprocess(np.where(cr[-1] > cr[-252], 1, 0))

    class GM_GROWTH(CustomFactor):
        """1 where gross margin is above its value 252 rows earlier, else 0."""
        inputs = [Fundamentals.gross_margin]
        window_length = 252
        window_safe = True

        def compute(self, today, assets, out, gm):
            out[:] = preprocess(np.where(gm[-1] > gm[-252], 1, 0))

    class ATR_GROWTH(CustomFactor):
        """1 where assets turnover is above its value 252 rows earlier, else 0."""
        inputs = [Fundamentals.assets_turnover]
        window_length = 252
        window_safe = True

        def compute(self, today, assets, out, atr):
            out[:] = preprocess(np.where(atr[-1] > atr[-252], 1, 0))

    class NEQISS(CustomFactor):
        """1 where shares outstanding grew by less than 1 over 252 rows, else 0."""
        inputs = [Fundamentals.shares_outstanding]
        window_length = 252
        window_safe = True

        def compute(self, today, assets, out, so):
            out[:] = preprocess(np.where(so[-1] - so[-252] < 1, 1, 0))

    class GM_GROWTH_2YR(CustomFactor):
        """Geometric mean of (1 + gross margin) at rows -1, -252, -504, minus 1."""
        inputs = [Fundamentals.gross_margin]
        window_length = 504
        window_safe = True

        def compute(self, today, assets, out, gm):
            out[:] = preprocess(gmean([gm[-1] + 1, gm[-252] + 1, gm[-504] + 1]) - 1)

    class ROA_GROWTH_2YR(CustomFactor):
        """Geometric mean of (1 + ROA) at rows -1, -252, -504, minus 1."""
        inputs = [Fundamentals.roa]
        window_length = 504
        window_safe = True

        def compute(self, today, assets, out, roa):
            out[:] = preprocess(gmean([roa[-1] + 1, roa[-252] + 1, roa[-504] + 1]) - 1)

    class ROIC_GROWTH_2YR(CustomFactor):
        """Geometric mean of (1 + ROIC) at rows -1, -252, -504, minus 1."""
        inputs = [Fundamentals.roic]
        window_length = 504
        window_safe = True

        def compute(self, today, assets, out, roic):
            out[:] = preprocess(gmean([roic[-1] + 1, roic[-252] + 1, roic[-504] + 1]) - 1)

    # NOTE(review): the three *_8YR classes average the last 8 consecutive
    # rows of the window, not eight yearly observations - confirm intent.
    class GM_GROWTH_8YR(CustomFactor):
        """Geometric mean of (1 + gross margin) over the last 8 rows, minus 1.

        Defined but not included in the returned ``factors`` list.
        """
        inputs = [Fundamentals.gross_margin]
        window_length = 8
        window_safe = True

        def compute(self, today, assets, out, gm):
            out[:] = preprocess(gmean([gm[-i] + 1 for i in range(1, 9)]) - 1)

    class ROA_GROWTH_8YR(CustomFactor):
        """Geometric mean of (1 + ROA/100) over the last 8 rows, minus 1."""
        inputs = [Fundamentals.roa]
        window_length = 9
        window_safe = True

        def compute(self, today, assets, out, roa):
            out[:] = preprocess(gmean([roa[-i] / 100 + 1 for i in range(1, 9)]) - 1)

    class ROIC_GROWTH_8YR(CustomFactor):
        """Geometric mean of (1 + ROIC/100) over the last 8 rows, minus 1."""
        inputs = [Fundamentals.roic]
        window_length = 9
        window_safe = True

        def compute(self, today, assets, out, roic):
            out[:] = preprocess(gmean([roic[-i] / 100 + 1 for i in range(1, 9)]) - 1)

    class Value(CustomFactor):
        """Latest operating cash flow divided by enterprise value."""
        inputs = [cfs.operating_cash_flow, v.enterprise_value]
        window_length = 1
        window_safe = True

        def compute(self, today, assets, out, ocf, ev):
            factor_df = pd.DataFrame(index=assets)
            factor_df["ocf"] = ocf[-1]
            factor_df["ev"] = ev[-1]
            out[:] = preprocess((factor_df['ocf'] / factor_df['ev']))

    class GP_to_A(CustomFactor):
        """Latest gross profit divided by total assets."""
        inputs = [is_.gross_profit, bs.total_assets]
        window_length = 1
        window_safe = True

        def compute(self, today, assets, out, gross_profit, total_assets):
            out[:] = preprocess(gross_profit[-1] / total_assets[-1])

    class efficiency_ratio(CustomFactor):
        """|close[-1] - close[0]| divided by a sum of price-range terms."""
        inputs = [USEquityPricing.close, USEquityPricing.high, USEquityPricing.low]
        window_length = 126
        window_safe = True

        def compute(self, today, assets, out, close, high, low):
            lb = self.window_length
            # Three range candidates for rows 1..lb-1, stacked on axis 0.
            a = np.array([high[1:(lb):1] - low[1:(lb):1],
                          abs(high[1:(lb):1] - close[0:(lb - 1):1]),
                          abs(low[1:(lb):1] - close[0:(lb - 1):1])])
            # NOTE(review): on the transposed array, axis 1 is the row
            # (time) axis, so this takes each candidate's maximum over time
            # and then sums the three maxima.  If a per-row maximum summed
            # over time was intended, the reduction axes are swapped -
            # confirm before changing.
            b = a.T.max(axis=1)
            c = b.sum(axis=1)
            e_r = abs(close[-1] - close[0]) / c
            out[:] = preprocess(e_r)

    class Price_Oscillator(CustomFactor):
        """Mean of the last 20 closes over the mean of all 252 closes, minus 1."""
        inputs = [USEquityPricing.close]
        window_length = 252
        window_safe = True

        def compute(self, today, assets, out, close):
            four_week_period = close[-20:]
            out[:] = preprocess((np.nanmean(four_week_period, axis=0) /
                                 np.nanmean(close, axis=0)) - 1.)

    factors = [
        Quick_Ratio,
        Price_Oscillator,
        efficiency_ratio,
        GP_to_A,
        Value,
        AdvancedMomentum,
        ROA,
        FCFTA,
        ROA_GROWTH,
        FCFTA_ROA,
        FCFTA_GROWTH,
        LTD_GROWTH,
        CR_GROWTH,
        GM_GROWTH,
        ATR_GROWTH,
        NEQISS,
        GM_GROWTH_2YR,
        ROA_GROWTH_2YR,
        ROIC_GROWTH_2YR,
        ROA_GROWTH_8YR,
        ROIC_GROWTH_8YR,
    ]

    return factors

In [None]:
def factor_pipeline():
    """Build the pipeline of lagged 'alpha2_<k>_<lag>' factor columns.

    For every class returned by ``make_factors`` and every lag in
    ``range(N_FACTOR_WINDOW)`` one ``Factor_N_Days_Ago`` column is added,
    plus a 'Sector' classifier column and a '5D_Returns' column.  Columns
    and screen are restricted to ``QTradableStocksUS``.

    Note: this definition replaces the ``factor_pipeline`` defined earlier
    in this notebook.
    """
    tradable = QTradableStocksUS()

    columns = {}
    for factor_idx, factor_cls in enumerate(make_factors()):
        for lag in range(N_FACTOR_WINDOW):
            column_name = 'alpha2_' + str(factor_idx) + '_' + str(lag)
            columns[column_name] = Factor_N_Days_Ago(
                [factor_cls(mask=tradable)],
                window_length=lag + 1,
                mask=tradable,
            )

    columns['Sector'] = Sector()
    columns['5D_Returns'] = Returns(
        inputs=[USEquityPricing.close],
        mask=tradable,
        window_length=5,
    )

    return Pipeline(columns=columns, screen=tradable)

In [None]:
# Run the second factor pipeline over the same `start`/`end` window (set in
# the cell that produced data_1) and time it.
start_timer = time()
data_2 = run_pipeline(factor_pipeline(), start_date=start, end_date=end,chunksize=252)
end_timer = time()

In [None]:
# Parenthesised form prints the same text on Python 2 and also parses on Python 3.
print("Time to run pipeline %.2f secs" % (end_timer - start_timer))

In [None]:
# Preview the second pipeline's output.
data_2.head()

In [None]:
# Factor columns only: remove the sector and forward-looking return columns
# in a single drop call.
data_20 = data_2.drop(['Sector', '5D_Returns'], axis=1)

In [None]:
# Preview the factor-only frame.
data_20.head()

In [None]:
# Join both factor sets side by side, aligned on the shared index.
df_a = pd.concat([data_20, data_1], axis=1)
df_a.head()

In [None]:
# Keep only rows where every factor column is present.
alphas = df_a.dropna()

In [None]:
# Each factor contributes N_FACTOR_WINDOW lagged columns.  Floor division
# keeps n_factors an int (it is used as an array shape and a range bound)
# on Python 3 as well as Python 2.
n_factors = len(alphas.columns) // N_FACTOR_WINDOW
# Number of rows in `alphas`, not the number of distinct assets.
n_stocks = len(alphas.index)

In [None]:
# One row per factor; each row will hold all of that factor's lagged values.
alphas_flattened = np.zeros((n_factors,n_stocks*N_FACTOR_WINDOW))

In [None]:
# Fill row f with factor f's block of N_FACTOR_WINDOW consecutive columns,
# flattened to 1-D.  This relies on each factor's lag columns being adjacent
# in `alphas`.
for f in range(n_factors):
        a = alphas.iloc[:,f*N_FACTOR_WINDOW:(f+1)*N_FACTOR_WINDOW].values
        alphas_flattened[f,:] = np.ravel(a)

In [None]:
# Cluster the factors (rows of alphas_flattened) into N_CLUSTERS groups;
# random_state is fixed so the labels are reproducible.
clustering = SpectralClustering(n_clusters=N_CLUSTERS,assign_labels="discretize",random_state=0).fit(alphas_flattened)

In [None]:
# weights[k] = size of the cluster that factor k belongs to.  The counts are
# computed once instead of rebuilding the Counter for every factor.
label_counts = Counter(clustering.labels_)
weights = np.zeros(n_factors)
for k,w in enumerate(clustering.labels_):
    weights[k] = label_counts[w]

In [None]:
# Every N_FACTOR_WINDOW-th column, i.e. the first lag column of each factor.
# .iloc performs the same positional slice as the deprecated .ix indexer.
alphas_current = alphas.iloc[:, ::N_FACTOR_WINDOW]

In [None]:
# Sum the current factor values, each divided by the size of its cluster, so
# that factors in large clusters count for less.  Column 1 is used only as a
# template for the zero vector's shape and dtype.
combined_alpha = pd.Series(np.zeros_like(alphas_current.iloc[:,1].values),index=alphas_current.index)
for k in range(n_factors):
    combined_alpha += alphas_current.iloc[:,k]/weights[k]

In [None]:
# Keep a reference to the combined alpha before normalisation.  This is an
# alias, not a copy; it stays unchanged only because later cells rebind
# combined_alpha to new objects rather than mutating it.
combined_alpha_bsc = combined_alpha
combined_alpha_bsc.head()

In [None]:
# Normalise, blend with ALPHA_SMOOTH, then normalise again.
# NOTE(review): both blend terms come from the same series, so the middle
# two lines only rescale it by (1-ALPHA_SMOOTH)*(1+ALPHA_SMOOTH); after the
# final normalize() the result equals normalize(combined_alpha).  No earlier
# alpha is mixed in - confirm whether a previous-period series was intended.
combined_alpha = normalize(combined_alpha)
combined_alpha = (1-ALPHA_SMOOTH)*combined_alpha
combined_alpha = combined_alpha.add(ALPHA_SMOOTH*combined_alpha,fill_value=0).dropna()
combined_alpha = normalize(combined_alpha)

In [None]:
# Preview the normalised combined alpha.
combined_alpha.head()

In [None]:
# Analysis frame: the normalised alpha ('combined_alpha'), the
# pre-normalisation alpha ('combined_alphaz'), the sector code, the 5-day
# return and the sector name mapped from MORNINGSTAR_SECTOR_CODES.
alpha_df = pd.DataFrame({'combined_alpha': combined_alpha,
                         'combined_alphaz': combined_alpha_bsc})
s_1 = data_2['Sector']
alpha_df.loc[:, 'Sector'] = s_1
s_2 = data_2['5D_Returns']
alpha_df.loc[:, '5D_Returns'] = s_2
alpha_df['Sector_Name'] = alpha_df['Sector'].map(MORNINGSTAR_SECTOR_CODES)
alpha_df.head()

In [None]:
# Close prices from `start` to 30 days past `end`, then simple daily returns.
# get_pricing is not imported in this notebook - presumably a built-in of the
# research environment; confirm.
pricing = get_pricing(asset_list, start, end + pd.Timedelta(days=30), fields="close_price")
stock_rets = pricing.pct_change()

In [None]:
# Decile ranks 1..10 (qcut labels are 0-based, hence +1), computed over the
# whole sample rather than per date.
alpha_df['CA_decile']=pd.qcut(alpha_df['combined_alpha'],10,labels=False)+1
alpha_df['RET_decile']=pd.qcut(alpha_df['5D_Returns'],10,labels=False)+1

In [None]:
# Mean 5-day return within each combined-alpha decile, shown as a bar chart.
CA = alpha_df.groupby('CA_decile')['5D_Returns'].mean()
CA.plot(kind='bar', color='blue', position=0, width=0.5, label='combined_alpha')
plt.xlabel('Decile')
plt.ylabel('Average 5D_Returns')
plt.legend(loc='best');

In [None]:
# Mean combined alpha per sector.
# NOTE(review): the legend label says 'combined_alpha_z' but the plotted
# column is 'combined_alpha' - confirm which one is intended.
sector_averagesz = alpha_df.groupby('Sector_Name')['combined_alpha'].mean()
sector_averagesz.plot(kind='bar', color='blue', position=0, width=0.5, label='combined_alpha_z')
plt.ylabel('Alpha Factor Return')
plt.legend(loc='best');

In [None]:
# Mean 5-day return per sector.
sector_averagesb = alpha_df.groupby('Sector_Name')['5D_Returns'].mean()
plt.ylabel('5D_Returns')
sector_averagesb.plot(kind='bar', color='red', position=1, width=0.5, label='5D_Returns')
plt.legend(loc='best');

In [None]:
# Convenience handles on the three series compared in the following cells.
rets0 = alpha_df['5D_Returns']
alpha = alpha_df['combined_alpha']
alphaz = alpha_df['combined_alphaz']

In [None]:
# Summary statistics of the normalised alpha.
print(alpha.describe().loc[['mean', 'std', 'min', 'max']])

In [None]:
import seaborn as sns
# Distribution of the normalised alpha; the trailing ';' suppresses the repr.
sns.distplot(alpha);

In [None]:
# Spearman rank correlation between the alpha and the 5-day returns.
r_s = stats.spearmanr(alpha, rets0)
# Parenthesised form prints the same text on Python 2 and also parses on Python 3.
print('Correlation Coefficient: ' + str(r_s[0]))
print('p-value: ' + str(r_s[1]))

In [None]:
# Frame indexed by the first index level only (index level 1 is dropped);
# the scatter-plot cell reads dates from this index for its colour-bar labels.
df1 = alpha_df.drop('Sector_Name', axis=1)
df1.index = df1.index.droplevel(1)
# dropna() returns a new frame; the result must be assigned or the call
# has no effect.
df1 = df1.dropna()
df1.head(2)

In [None]:
# OLS of the pre-normalisation alpha (dependent variable) on 5-day returns
# plus a constant appended as the last column.
x = sm.add_constant(rets0, prepend=False)
ols = sm.OLS(alphaz, x).fit()
beta = ols.params
# Fitted values at the column-wise min and max of x: the end points of the
# regression line drawn in the scatter plot.
y_fit = [x.min().dot(beta), x.max().dot(beta)]

In [None]:
# Regression summary table.
ols.summary2()

In [None]:
# Scatter of 5-day returns against the pre-normalisation alpha, coloured by
# row position, with the OLS line overlaid.
i = df1
cm = plt.get_cmap('jet')
colors = np.linspace(0.1, 1, len(alpha_df))
sc = plt.scatter(rets0, alphaz, s=50, c=colors, cmap=cm, 
                 edgecolor='k', alpha=0.7, label='Price Data')
plt.plot([x.min()[0], x.max()[0]], y_fit, 'black', linestyle='--', linewidth=1, label='OLS Fit')
plt.legend()
cb = plt.colorbar(sc)
# Label the colour bar with dates sampled at every len(i)//9-th row of df1.
cb.ax.set_yticklabels([str(p.date()) for p in i[::len(i)//9].index])
plt.xlabel('5D_Returns')
plt.ylabel('combined_alpha');

In [None]:
from quantopian.research.experimental import get_factor_returns, get_factor_loadings
import empyrical as ep
import pyfolio as pf

In [None]:
# Alphalens input: the combined alpha with forward returns at 1/5/10/21
# periods, bucketed into 5 quantiles and grouped by sector code.
factor1_data = al.utils.get_clean_factor_and_forward_returns(
    factor=alpha_df["combined_alpha"],
    prices=pricing,
    groupby=alpha_df["Sector"],
    quantiles=5,
    periods=(1, 5, 10, 21)
)

In [None]:
# Sector code -> name mapping taken from the Sector classifier, with -1
# labelled as "Unknown".
sector_labels = dict(Sector.SECTOR_NAMES)
sector_labels[-1] = "Unknown" 

In [None]:
# Risk-model factor loadings for the asset list and the matching factor
# returns over start..end.
factor_loadings = get_factor_loadings(asset_list, start, end)
factor_returns = get_factor_returns(start, end)

In [None]:
def plot_ic_over_time(factor_data, label='', ax=None):
    """Plot the mean information coefficient against the forward period.

    Parameters
    ----------
    factor_data : alphalens factor data frame.
    label : legend label for the plotted line.
    ax : axes to draw on; passed straight through to ``Series.plot``.

    Returns nothing.
    """
    mean_ic = al.performance.mean_information_coefficient(factor_data)
    # Drop the last character of each index label and cast the rest to int
    # (presumably labels such as '5D' - confirm).
    mean_ic.index = mean_ic.index.map(lambda period: int(period[:-1]))
    axes = mean_ic.plot(label=label, ax=ax)
    axes.set(xlabel='Days', ylabel='Mean IC')
    axes.legend()
    axes.axhline(0, ls='--', color='k')

In [None]:
def compute_specific_returns(total_returns, factor_returns=None, factor_loadings=None, assets=None):
    """Subtract risk-factor ("common") returns from total returns.

    Parameters
    ----------
    total_returns : frame of total returns; the common returns are subtracted
        from it.
    factor_returns, factor_loadings : risk-model inputs.  Used as given when
        ``assets`` is None; their index names are overwritten in place.
    assets : when given, loadings and factor returns are fetched for these
        assets over the global ``start`` .. ``end`` + 30 days window and the
        two arguments above are ignored.

    Raises
    ------
    ValueError
        If ``assets`` is None and either risk-model input is missing.
    """
    if assets is None:
        if factor_loadings is None or factor_returns is None:
            raise ValueError('Supply either assets or factor_returns and factor_loadings')
    else:
        factor_loadings = get_factor_loadings(assets, start, end + pd.Timedelta(days=30))
        factor_returns = get_factor_returns(start, end + pd.Timedelta(days=30))

    # Give both frames matching index names before multiplying them.
    factor_returns.index = factor_returns.index.set_names(['dt'])
    factor_loadings.index = factor_loadings.index.set_names(['dt', 'ticker'])

    # Loading times factor return, summed over risk factors, reshaped to one
    # column per ticker.
    exposure_returns = factor_loadings.mul(factor_returns)
    common_returns = exposure_returns.sum(axis='columns').unstack()
    return total_returns - common_returns

In [None]:
# Specific (residual) returns, then their cumulative product starting at 1.
stock_rets_specific = compute_specific_returns(stock_rets, factor_returns, factor_loadings)
cr_specific = ep.cum_returns(stock_rets_specific, starting_value=1)

In [None]:
# Alphalens input built on the cumulative specific-return series (passed in
# the prices position) for forward periods 1..20.
factor_data_specific1 = al.utils.get_clean_factor_and_forward_returns(
    alpha_df["combined_alpha"], 
    cr_specific,
    periods=range(1, 21))

In [None]:
def factor_portfolio_returns(factor, pricing, equal_weight=True, delay=0):
    """Turn a factor into portfolio weights and 1-period factor returns.

    Parameters
    ----------
    factor : factor values; must support ``.unstack()``.
    pricing : price frame; its index supplies the trading calendar.
    equal_weight : if true, only the sign of the factor is used and it is
        binned with edges (-1, 0, 1); otherwise 5 zero-aware quantiles are
        used.
    delay : number of rows by which the positions are shifted.

    Returns
    -------
    tuple
        (the '1D' column of alphalens factor returns, positions frame
        including a 'cash' column).
    """
    if not equal_weight:
        bins, quantiles, zero_aware = None, 5, True
    else:
        factor = np.sign(factor)
        bins, quantiles, zero_aware = (-1, 0, 1), None, False

    positions = factor.unstack().fillna(0)
    # NOTE(review): .sum() without an axis adds down each column, so every
    # column is scaled by its own total over all rows - confirm this is
    # intended rather than a per-row normalisation.
    positions = positions / (positions.abs().sum())
    # The factor may not be daily: align to the pricing index and carry
    # positions forward, then apply the delay.
    positions = positions.reindex(pricing.index).ffill().shift(delay)
    # Fully invested; the short side is recorded in the cash column.
    positions['cash'] = positions[positions < 0].sum(axis='columns')

    nonzero_positions = positions.stack().loc[lambda x: x != 0]
    factor_and_returns = al.utils.get_clean_factor_and_forward_returns(
        nonzero_positions,
        pricing,
        periods=(1,),
        quantiles=quantiles,
        bins=bins,
        zero_aware=zero_aware,
    )

    one_day_returns = al.performance.factor_returns(factor_and_returns)['1D']
    return one_day_returns, positions

# Sign-based (equal-weight) portfolio returns and positions for the combined alpha.
portfolio_returns, portfolio_pos = factor_portfolio_returns(alpha_df["combined_alpha"], pricing, 
                                                             equal_weight=True)

In [None]:
# Name the index levels, then attribute the portfolio returns to the risk
# factors.  Positions are passed as weights (pos_in_dollars=False).
factor_loadings.index = factor_loadings.index.set_names(['dt', 'ticker'])
portfolio_pos.index = portfolio_pos.index.set_names(['dt'])
risk_exposures_portfolio, perf_attribution = pf.perf_attrib.perf_attrib(
    portfolio_returns, 
    portfolio_pos, 
    factor_returns, 
    factor_loadings, 
    pos_in_dollars=False)

In [None]:
# Pyfolio inputs for a long-short, equal-weighted portfolio of quantiles 1
# and 5, using the 5D forward-return period.
factor1_returns, factor1_positions, factor1_benchmark = \
    al.performance.create_pyfolio_input(factor1_data,
                                        period='5D',
                                        capital=10000000,
                                        long_short=True,
                                        group_neutral=False,
                                        equal_weight=True,
                                        quantiles=[1,5],
                                        groups=None,
                                        benchmark_period='1D')

In [None]:
# Re-derive the asset list and date range from the cleaned factor data.
# This rebinds the global asset_list set in an earlier cell.
asset_list = factor1_data.index.levels[1].unique()
start_date = factor1_data.index.levels[0].min()
end_date   = factor1_data.index.levels[0].max()
factor_loadings.index.names = ['dt', 'ticker']

In [None]:
# Mean IC by forward period for the combined alpha.
plot_ic_over_time(factor1_data, label='combined_alpha_n_IC')

In [None]:
# Mean IC on total returns versus on specific returns.
plot_ic_over_time(factor1_data, label='combined_alpha_n Total returns')
plot_ic_over_time(factor_data_specific1, label='combined_alpha_n Specific returns')

In [None]:
def plot_exposures(risk_exposures, ax=None):
    """Horizontal box plot of the exposure values, one box per column.

    Parameters
    ----------
    risk_exposures : frame with one column per risk factor.
    ax : optional axes, forwarded to ``sns.boxplot``.
    """
    # Long format: one (dt, factor, exposure) record per cell.
    long_form = risk_exposures.stack().reset_index()
    long_form.columns = ['dt', 'factor', 'exposure']
    # Boxes are drawn in reversed column order.
    reversed_order = risk_exposures.columns[::-1]
    sns.boxplot(data=long_form, x='exposure', y='factor', orient='h',
                order=reversed_order, ax=ax)
    
# Distribution of the portfolio's risk exposures.
plot_exposures(risk_exposures_portfolio)

In [None]:
# Final cumulative return of each performance-attribution column.
ep.cum_returns_final(perf_attribution).plot.barh()
plt.xlabel('cumulative returns');

In [None]:
# Annualised volatility of each performance-attribution column.
perf_attribution.apply(ep.annual_volatility).plot.barh()
plt.xlabel('Ann. volatility');

In [None]:
def plot_cum_returns_delay(factor, pricing, delay=range(5), ax=None):
    """Plot cumulative portfolio returns for several trading delays.

    Parameters
    ----------
    factor : factor values forwarded to ``factor_portfolio_returns``.
    pricing : price frame forwarded to ``factor_portfolio_returns``.
    delay : iterable of delays; one line is drawn per value.
    ax : axes to draw on; a new figure is created when omitted.
    """
    if ax is None:
        _fig, ax = plt.subplots()
    for d in delay:
        # Use the `factor` argument; the previous code ignored it and always
        # read the global alpha_df["combined_alpha"].
        portfolio_returns, _ = factor_portfolio_returns(factor, pricing, delay=d)
        ep.cum_returns(portfolio_returns).plot(ax=ax, label=d)
    ax.legend()
    ax.set(ylabel='Cumulative returns', title='Cumulative returns if factor is delayed')
    
# Cumulative returns of the combined-alpha portfolio for delays 0..4.
plot_cum_returns_delay(alpha_df["combined_alpha"], pricing)

In [None]:
# Return series of the long-short quantile portfolio.
factor1_returns.plot()
plt.ylabel('Returns')
plt.legend(['Factor1']);

In [None]:
# Performance-attribution tear sheet.  Positions are passed with
# pos_in_dollars=True here, unlike the earlier perf_attrib call (False).
pf.tears.create_perf_attrib_tear_sheet(factor1_returns,
                                       positions=factor1_positions,
                                       factor_returns=factor_returns,
                                       factor_loadings=factor_loadings,      
                                       pos_in_dollars=True)

In [None]:
# Alphalens input for the tear sheets below: combined alpha, grouped by
# sector with readable labels, 5 quantiles, forward periods 1/3/5/10/21.
my_factor1 = alpha_df['combined_alpha']
sectors = alpha_df['Sector']
prices = pricing
periods = (1,3,5,10,21)

factor_data1 = al.utils.get_clean_factor_and_forward_returns(factor=my_factor1,
                                                            prices=prices,
                                                            groupby=sectors,
                                                            groupby_labels=MORNINGSTAR_SECTOR_CODES,
                                                            periods=periods,
                                                            quantiles = 5)

In [None]:
# Full alphalens tear sheet, broken down by sector.
al.tears.create_full_tear_sheet(factor_data1, by_group=True);

In [None]:
# Information-coefficient tear sheet.
al.tears.create_information_tear_sheet(factor_data1)

In [None]:
# Event-study tear sheet with avgretplot=(5, 20), by sector.
al.tears.create_event_returns_tear_sheet(factor_data=factor_data1,
                                                        prices=prices,
                                                        avgretplot=(5, 20),
                                                        long_short=True,
                                                        by_group=True);

In [None]:
# Scratch cell.  The first line is a bare expression that is neither
# assigned nor the last line of the cell, so it has no visible effect.
# my_factor is not used by any other cell visible in this notebook.
factset.Fundamentals.bps_gr_af
my_factor = factset.Fundamentals.earn_yld_af