In [1]:
#==============================================================================
# 04-model-first-stage.R
# Purpose: fitting spatial following model
# Runtime: ~18 hours on NYU HPC (far more in BR)
# Author: Pablo Barbera
# Adaptation: Camila Lainetti de Morais
#==============================================================================

# setwd('your_location')
# Shared helper functions used by the pipeline scripts.
source("auxiliary_functions/functions.R")

# Input: .rdata file restored by load() below. The code that follows expects
# it to provide an object named `y` -- TODO confirm against the script that
# writes this file.
matrixfile <- "data/output/adj-matrix-tcc-filter-br.rdata"

# Outputs: fitted Stan object, extracted posterior samples, and the summary
# table of elite-level estimates.
outputfile <- "data/temp/stan-fit-tcc-filter-br-new.rdata"
samplesfile <- "data/output/samples-tcc-filter-br-new.rdata"
resultsfile <- "data/output/results-elites-tcc-filter-br-new.rdata"

# Label for this run.
country <- "TCC"

# Restore the saved object(s) into the current environment.
load(matrixfile)

In [2]:
## starting values for elites (for identification purposes)
tcc <- read.csv("data/output/elites-data-tcc-2.csv", sep=";")

# Look up the class of each elite in the column order of `y`.
# A plain merge() is not usable here: the key is called `screen_name` on one
# side and `simple_screen_name` on the other, so merge() finds no common
# column and returns a Cartesian product; and even with a shared key,
# sort=FALSE leaves the row order unspecified. match() returns exactly one
# entry per column of `y`, in order, with NA for accounts missing from `tcc`.
parties <- tcc$simple_class[match(colnames(y), tcc$simple_screen_name)]

# Class 'F' starts at -1, class 'A' at +1, everything else (including
# accounts with no class) at 0. %in% treats NA as "no match".
start.phi <- rep(0, length(parties))
start.phi[parties %in% 'F'] <- -1
start.phi[parties %in% 'A'] <- 1

J <- dim(y)[1]

# choosing a sample of 10,000 "informative" users who follow more than 10
# politicians, and then subsetting politicians followed by >200 of these

if (J > 10000) {
  J <- 10000
  inform <- which(rowSums(y) > 10)
  if (length(inform) < J) {
    stop("Only ", length(inform), " users follow more than 10 accounts; ",
         "cannot draw a sample of ", J, ".", call. = FALSE)
  }
  set.seed(12345)
  subset.i <- sample(inform, J)
  y <- y[subset.i, , drop = FALSE]
  # Computed once, after the row subset, so start.phi and the columns of y
  # are filtered with the same index and stay aligned.
  keep.k <- which(colSums(y) > 200)
  start.phi <- start.phi[keep.k]
  y <- y[, keep.k, drop = FALSE]
}

Loading required package: Matrix



In [3]:
# Dump the column labels of y (the influencers being analysed) to CSV.
influencers <- colnames(y)
write.csv(influencers, file = "test_stages/influencers.csv")

# Dump the (subsetted) adjacency matrix itself, converted to a base matrix.
y_matrix <- as(y, "matrix")
write.csv(y_matrix, file = "test_stages/adj_matrix_smaller.csv")

In [4]:
## data for model
# J rows and K columns of y; the model sees one observation per cell.
J <- nrow(y)
K <- ncol(y)
N <- J * K

# Row/column index of every cell, in the same column-major order in which y
# is flattened below: the row index cycles fastest, the column index is
# repeated J times before moving on.
jj <- rep(seq_len(J), times = K)
kk <- rep(seq_len(K), each = J)

stan.data <- list(
  J = J,
  K = K,
  N = N,
  jj = jj,
  kk = kk,
  y = as.vector(as.matrix(y))
)

## rest of starting values
colK <- colSums(y) # column totals of y
rowJ <- rowSums(y) # row totals of y

# Centre x on its mean and scale it by its standard deviation.
normalize <- function(x) {
  centred <- x - mean(x)
  centred / sd(x)
}

# One set of starting values is drawn once and then handed to both chains,
# so the two chains start from identical points.
inits <- local({
  one.chain <- list(
    alpha = normalize(log(colK + 0.0001)), # j popularity
    sigma_alpha = 1,

    beta = normalize(log(rowJ + 0.0001)), # i interest
    mu_beta = 0,
    sigma_beta = 1,

    theta = rnorm(J), # i ideal point

    phi = start.phi, # j ideal point
    mu_phi = 0,
    sigma_phi = 1,

    gamma = abs(rnorm(1)) # normalization constant
  )
  list(one.chain, one.chain)
})

In [5]:
# install.packages('rstan', dependencies = TRUE)

# # Configure the C++ toolchain
# dotR <- file.path(Sys.getenv("HOME"), ".R")
# if (!file.exists(dotR)) dir.create(dotR)
# M <- file.path(dotR, "Makevars")
# if (!file.exists(M)) file.create(M)
# cat("\nCXX14FLAGS=-O3 -march=native -mtune=native",
#     "CXX14 = g++",
#     file = M, sep = "\n", append = TRUE)


In [6]:
# rstan supplies the Stan interface used to compile and fit the model below.
library(rstan)

# Stan program for the following model, kept as a string and compiled later.
# Each of the N = J x K observations y[n] is a 0/1 indicator modelled as
# Bernoulli with logit
#   alpha[kk[n]] + beta[jj[n]] - gamma * (theta[jj[n]] - phi[kk[n]])^2
# where jj/kk give the user and elite account of observation n.
# alpha, beta and phi have normal priors with estimated scale (and, for beta
# and phi, estimated mean); theta has a fixed normal(0, 1) prior; the three
# sigma parameters are bounded below at 0.1; gamma is declared unconstrained.
stan.code <- '
data {
int<lower=1> J; // number of twitter users
int<lower=1> K; // number of elite twitter accounts
int<lower=1> N; // N = J x K
int<lower=1,upper=J> jj[N]; // twitter user for observation n
int<lower=1,upper=K> kk[N]; // elite account for observation n
int<lower=0,upper=1> y[N]; // dummy if user i follows elite j
}
parameters {
vector[K] alpha; // j popularity
vector[K] phi; // j ideal point
vector[J] beta; // i interest
vector[J] theta; // i ideal point
real mu_beta; // avg i interest (0)
real<lower=0.1> sigma_beta; // std deviation i interest(1)
real mu_phi; // avg ideal point j (0)
real<lower=0.1> sigma_phi; // std deviation ideal point j (1)
real<lower=0.1> sigma_alpha; // std deviation popularity j (1)
real gamma;
}
model {
alpha ~ normal(0, sigma_alpha);
beta ~ normal(mu_beta, sigma_beta);
phi ~ normal(mu_phi, sigma_phi);
theta ~ normal(0, 1); 
for (n in 1:N)
y[n] ~ bernoulli_logit( alpha[kk[n]] + beta[jj[n]] - 
gamma * square( theta[jj[n]] - phi[kk[n]] ) );
}
'
# Settings recommended by rstan's startup message: auto_write avoids
# recompiling unchanged Stan programs, mc.cores is set to the number of
# cores detected on this machine.
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

Loading required package: StanHeaders

Loading required package: ggplot2

code for methods in class “Rcpp_model_base” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

code for methods in class “Rcpp_model_base” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

code for methods in class “Rcpp_stan_fit” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

code for methods in class “Rcpp_stan_fit” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

rstan (Version 2.21.5, GitRev: 2e1f913d3ca3)

For execution on a local, multicore CPU with excess RAM we recommend calling
options(mc.cores = parallel::detectCores()).
To avoid recompilation of unchanged Stan programs, we recommend calling
rstan_options(auto_write = TRUE)



In [7]:
# parameters for Stan model
# NOTE: these values are well below the rstan defaults; the run recorded with
# them reported treedepth warnings and a largest R-hat of 1.77 (chains not
# mixed), so treat the resulting estimates as a smoke test only.
n.iter <- 500 # iterations per chain, warmup included (2000 default)
n.warmup <- 100 # warmup iterations per chain (1000 default)
thin <- 1

## compiling model
# stan_model() only translates and compiles the program. The previous
# "compile" step called stan() with the full iter/warmup/chains settings, so
# the whole sampler ran twice and the first run was thrown away.
stan.model <- stan_model(model_code = stan.code)

## running model
# chains must stay equal to length(inits): one list of starting values per
# chain.
stan.fit <- sampling(stan.model,
                     data = stan.data,
                     iter = n.iter,
                     warmup = n.warmup,
                     thin = thin,
                     init = inits,
                     chains = 2)

save(stan.fit, file=outputfile)

## extracting and saving samples
# Namespace-qualified so that an extract() from another attached package
# cannot mask the rstan method.
samples <- rstan::extract(stan.fit, pars=c("alpha", "phi", "gamma", "mu_beta",
                                           "sigma_beta", "sigma_alpha"))
save(samples, file=samplesfile)

## saving estimates
# Posterior mean and standard deviation per elite account; rows follow the
# column order of the matrix passed to Stan.
results <- data.frame(
   phi = apply(samples$phi, 2, mean),
   phi.sd = apply(samples$phi, 2, sd),
   alpha = apply(samples$alpha, 2, mean),
   alpha.sd = apply(samples$alpha, 2, sd),
   stringsAsFactors=FALSE)

save(results, file=resultsfile)

code for methods in class “Rcpp_stan_fit4model3c80645d1ab04_657853a628cf70c31d35851c7f35d712” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

code for methods in class “Rcpp_stan_fit4model3c80645d1ab04_657853a628cf70c31d35851c7f35d712” was not checked for suspicious field assignments (recommended package ‘codetools’ not available?)

“There were 400 transitions after warmup that exceeded the maximum treedepth. Increase max_treedepth above 10. See
“Examine the pairs() plot to diagnose sampling problems
”
“The largest R-hat is 1.77, indicating chains have not mixed.
Running the chains for more iterations may help. See
“Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
Running the chains for more iterations may help. See
“Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
Running the chains for more iterations may help. See