In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.style as style
import plotly as py
from plotly.offline import init_notebook_mode, iplot
import chart_studio.tools as tls
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
import glob
import linearmodels

# Project folders. `indir` and `clean` live underneath `base`, so they are
# derived from it instead of repeating the full prefix.
base = "C:/Users/Linchen Zhang/Desktop/fall2020/linchen_robintrack/"
indir = base + "robintrack-popularity-history/tmp/popularity_export"
clean = base + "cleanPopularity/"

In [2]:
# Load users_prices.csv from the project folder and preview the first rows.
df = pd.read_csv(f"{base}users_prices.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Close,Volume,Symbol,users_holding
0,0,2018-05-02,65.910004,2240500.0,A,590.0
1,1,2018-05-03,66.339996,2365900.0,A,586.0
2,2,2018-05-04,67.0,1330800.0,A,587.0
3,3,2018-05-07,67.389999,1468700.0,A,588.0
4,4,2018-05-08,67.370003,1916100.0,A,576.0


In [3]:
# Parse the Date column into pandas datetimes so it can serve as the time index.
df['Date'] = df['Date'].pipe(pd.to_datetime)

In [4]:
# Build the panel index: Symbol is the first level and Date the second.
# The linearmodels panel estimators used below read the first index level as
# the entity and the second as time.
df = df.set_index(keys=['Symbol', 'Date'])

# Columns that remain once Symbol and Date have moved into the index.
df.columns

Index(['Unnamed: 0', 'Close', 'Volume', 'users_holding'], dtype='object')

In [5]:
# Display the (Symbol, Date) MultiIndex to confirm the panel structure.
df.index

MultiIndex([(   'A', '2018-05-02'),
            (   'A', '2018-05-03'),
            (   'A', '2018-05-04'),
            (   'A', '2018-05-07'),
            (   'A', '2018-05-08'),
            (   'A', '2018-05-09'),
            (   'A', '2018-05-10'),
            (   'A', '2018-05-11'),
            (   'A', '2018-05-14'),
            (   'A', '2018-05-15'),
            ...
            ('ZYXI', '2020-07-20'),
            ('ZYXI', '2020-07-21'),
            ('ZYXI', '2020-07-22'),
            ('ZYXI', '2020-07-23'),
            ('ZYXI', '2020-07-24'),
            ('ZYXI', '2020-07-27'),
            ('ZYXI', '2020-07-28'),
            ('ZYXI', '2020-07-29'),
            ('ZYXI', '2020-07-30'),
            ('ZYXI', '2020-07-31')],
           names=['Symbol', 'Date'], length=3113966)

In [6]:
# Log-transform the price and the two count-like series used in the regressions.
df['logClose'] = np.log(df['Close'])

# Volume can be 0 and log(0) is -inf, so use log(1 + x). np.log1p computes
# that directly and is at least as accurate as np.log(x + 1).
df['logVolume'] = np.log1p(df['Volume'])

# Same treatment for the number of holders, which can also be 0.
df['logUsers'] = np.log1p(df['users_holding'])

df.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,Close,Volume,users_holding,logClose,logVolume,logUsers
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,2018-05-02,0,65.910004,2240500.0,590.0,4.18829,14.62221,6.381816
A,2018-05-03,1,66.339996,2365900.0,586.0,4.194793,14.676669,6.375025
A,2018-05-04,2,67.0,1330800.0,587.0,4.204693,14.101292,6.376727
A,2018-05-07,3,67.389999,1468700.0,588.0,4.210497,14.199889,6.378426
A,2018-05-08,4,67.370003,1916100.0,576.0,4.2102,14.465803,6.357842


In [10]:
from linearmodels.panel import PanelOLS
from linearmodels.panel import PooledOLS
from linearmodels.panel import RandomEffects
from linearmodels.panel import BetweenOLS

import statsmodels.api as sm

# Regressors shared by every model below: log price and log volume, with an
# intercept column ('const') placed first.
exog_vars = ['logClose', 'logVolume']
exog = sm.add_constant(df.loc[:, exog_vars], prepend=True)



In [11]:
# Pooled OLS: one regression on the stacked panel, with no entity or time effects.
mod = PooledOLS(dependent=df['logUsers'], exog=exog)
pooled_res = mod.fit()

print(pooled_res)

                          PooledOLS Estimation Summary                          
Dep. Variable:               logUsers   R-squared:                        0.4965
Estimator:                  PooledOLS   R-squared (Between):              0.5901
No. Observations:             3113966   R-squared (Within):              -0.3943
Date:                Wed, Sep 23 2020   R-squared (Overall):              0.4965
Time:                        02:45:55   Log-likelihood                -5.682e+06
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   1.535e+06
Entities:                        6780   P-value                           0.0000
Avg Obs:                       459.29   Distribution:               F(2,3113963)
Min Obs:                       1.0000                                           
Max Obs:                       555.00   F-statistic (robust):          1.535e+06
                            

In [12]:
# Random-effects estimator on the same dependent variable and regressors.
mod = RandomEffects(dependent=df['logUsers'], exog=exog)
re_res = mod.fit()

print(re_res)

                        RandomEffects Estimation Summary                        
Dep. Variable:               logUsers   R-squared:                        0.1522
Estimator:              RandomEffects   R-squared (Between):              0.1944
No. Observations:             3113966   R-squared (Within):               0.1521
Date:                Wed, Sep 23 2020   R-squared (Overall):              0.1639
Time:                        02:46:03   Log-likelihood                -2.768e+06
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   2.796e+05
Entities:                        6780   P-value                           0.0000
Avg Obs:                       459.29   Distribution:               F(2,3113963)
Min Obs:                       1.0000                                           
Max Obs:                       555.00   F-statistic (robust):          2.798e+05
                            

In [13]:
# Between estimator: fits on one observation per entity (Symbol), so the
# observation count in the summary equals the number of entities.
mod = BetweenOLS(dependent=df['logUsers'], exog=exog)
be_res = mod.fit()

print(be_res)

                         BetweenOLS Estimation Summary                          
Dep. Variable:               logUsers   R-squared:                        0.6032
Estimator:                 BetweenOLS   R-squared (Between):              0.6032
No. Observations:                6780   R-squared (Within):              -0.6391
Date:                Wed, Sep 23 2020   R-squared (Overall):              0.4832
Time:                        02:46:08   Log-likelihood                -1.168e+04
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      5150.6
Entities:                        6780   P-value                           0.0000
Avg Obs:                       459.29   Distribution:                  F(2,6777)
Min Obs:                       1.0000                                           
Max Obs:                       555.00   F-statistic (robust):             5150.6
                            

In [14]:
# Entity fixed effects: PanelOLS with a separate effect for each Symbol.
mod = PanelOLS(
    dependent=df['logUsers'],
    exog=exog,
    entity_effects=True,
)
fe_res = mod.fit()

print(fe_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:               logUsers   R-squared:                        0.1521
Estimator:                   PanelOLS   R-squared (Between):              0.1816
No. Observations:             3113966   R-squared (Within):               0.1521
Date:                Wed, Sep 23 2020   R-squared (Overall):              0.1724
Time:                        02:46:12   Log-likelihood                -2.761e+06
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   2.788e+05
Entities:                        6780   P-value                           0.0000
Avg Obs:                       459.29   Distribution:               F(2,3107184)
Min Obs:                       1.0000                                           
Max Obs:                       555.00   F-statistic (robust):          2.788e+05
                            

In [15]:
# Time fixed effects: PanelOLS with a separate effect for each Date.
mod = PanelOLS(
    dependent=df['logUsers'],
    exog=exog,
    time_effects=True,
)
te_res = mod.fit()

print(te_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:               logUsers   R-squared:                        0.5023
Estimator:                   PanelOLS   R-squared (Between):              0.5907
No. Observations:             3113966   R-squared (Within):              -0.4006
Date:                Wed, Sep 23 2020   R-squared (Overall):              0.4964
Time:                        02:46:17   Log-likelihood                -5.634e+06
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   1.571e+06
Entities:                        6780   P-value                           0.0000
Avg Obs:                       459.29   Distribution:               F(2,3113409)
Min Obs:                       1.0000                                           
Max Obs:                       555.00   F-statistic (robust):          1.571e+06
                            

In [16]:
# Two-way fixed effects: PanelOLS with both entity (Symbol) and time (Date) effects.
mod = PanelOLS(
    dependent=df['logUsers'],
    exog=exog,
    entity_effects=True,
    time_effects=True,
)
fe_te_res = mod.fit()

print(fe_te_res)

                          PanelOLS Estimation Summary                           
Dep. Variable:               logUsers   R-squared:                        0.0705
Estimator:                   PanelOLS   R-squared (Between):              0.1531
No. Observations:             3113966   R-squared (Within):               0.1237
Date:                Wed, Sep 23 2020   R-squared (Overall):              0.1550
Time:                        02:46:34   Log-likelihood                -2.071e+06
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                   1.179e+05
Entities:                        6780   P-value                           0.0000
Avg Obs:                       459.29   Distribution:               F(2,3106630)
Min Obs:                       1.0000                                           
Max Obs:                       555.00   F-statistic (robust):          1.179e+05
                            

In [17]:
from linearmodels.panel import compare

# Side-by-side table of the six fitted models; the keys become column headers.
print(
    compare(
        {
            'Pooled': pooled_res,
            'BE': be_res,
            'RE': re_res,
            'FE': fe_res,
            'TE': te_res,
            'Panel': fe_te_res,
        }
    )
)

                                                  Model Comparison                                                  
                                Pooled             BE                RE             FE             TE          Panel
--------------------------------------------------------------------------------------------------------------------
Dep. Variable                 logUsers       logUsers          logUsers       logUsers       logUsers       logUsers
Estimator                    PooledOLS     BetweenOLS     RandomEffects       PanelOLS       PanelOLS       PanelOLS
No. Observations               3113966           6780           3113966        3113966        3113966        3113966
Cov. Est.                   Unadjusted     Unadjusted        Unadjusted     Unadjusted     Unadjusted     Unadjusted
R-squared                       0.4965         0.6032            0.1522         0.1521         0.5023         0.0705
R-Squared (Within)             -0.3943        -0.6391           