In [None]:
using CSV, DataFrames, FreqTables, RCall, Plots, StatsBase
gr()

###### IL027: Interdisciplinary Computer Modelling

## Lecture 6 - Sequence Analysis

### Philippe Blanchard, Politics and International Studies

References

1. About sequence analysis:
Blanchard P. 2011. "Sequence Analysis for Political Science", Working Papers of the Committee on Concepts and Methods, International Political Science Association. Retrieved from: http://concepts-methods.org/Files/WorkingPaper/PM_32_Blanchard.pdf
and/or
Blanchard P. 2019. "Sequence analysis" in Atkinson, P. A., Williams, R. A. and Cernat, A. (eds.). Encyclopaedia of Research Methods, Sage

2. About the TraMineR package used in R for sequence analysis, use the following guide:
Gabadinho A., Ritschard G., Studer M. and Müller N. 2011. Mining sequence data in R with the TraMineR package: A user's guide, Department of Econometrics and Laboratory of Demography, University of Geneva

3. For any use of original data or syntax, please cite:
Blanchard P. et al. 2013. "Gendered Trade Unions Careers: From Transitions to Turning Points?" National Research Programme on Gender Inequality, Biel, Switzerland


## 1. Preparing the session

In [None]:
## Import R libraries
R"library(TraMineR)"
R"library(foreign)"
R"library(questionr)"
R"library(RColorBrewer)"
R"library(cluster)"
R"library(graphics)"

## 2. Importing data

Use the Excel-made file "Unions.csv", which contains the sequence data in STS format
(See Gabadinho et al. 2011: 29)

In [None]:
# Read the survey export; `header=3` takes the column names from the third line.
# `CSV.File` piped into `DataFrame` replaces the sink-less `CSV.read(path; ...)`,
# which CSV.jl >= 0.7 rejects because a sink argument became mandatory there.
SwUn = DataFrame(CSV.File("Unions.csv"; header=3, delim=","))
# Rename all a.b columns to a_b columns, which is more convenient in Julia
# (the rest of the notebook refers to them as symbols such as :q1_Sex).
rename!(SwUn, replace.(String.(names(SwUn)), "." => "_"))

In [None]:
# Retain only first 902 cases
SwUn = SwUn[1:902,:]

## 3. Exploring data

In [None]:
# Comprehensive summaries
# (number of rows, number of columns) of the data frame
size(SwUn)

In [None]:
# Top-left corner of the table: rows 1 to 5, columns 1 to 6
SwUn[1:5,1:6]

In [None]:
# Names of the first ten columns
names(SwUn)[1:10]

In [None]:
# Fourth column for the first fifty rows.
# The index sits inside the `@show` expression: the previous `@show(SwUn)[1:50,4]`
# applied `@show` to the whole data frame and only indexed the value it returned.
@show SwUn[1:50, 4]

In [None]:
# First two rows of the table
first(SwUn,2)

In [None]:
# Last two rows of the table
last(SwUn,2)

In [None]:
# What is this?
# (Hint: a row filter on q1_Sex, followed by a 5-row by 5-column window.)
let is_w = SwUn[:, :q1_Sex] .== "w"
    SwUn[is_w, :][1:5, 78:82]
end

In [None]:
# Display last 5 years of U trajectory for all respondents born after 1992.

# Birth year as an integer. `tryparse` returns `nothing` for strings that are not
# numbers (e.g. "na" values); `replace` turns those into `missing`, which is more
# convenient to filter on.
Birth = replace(tryparse.(Int, SwUn[:, :q2_Birth]), nothing => missing)

# NOTE(review): this sums the parsed birth years of rows 86 to 90; it looks like a
# leftover check rather than part of the task — confirm before relying on it.
@show sum(Birth[86:90, :])

# Keep the rows whose birth year could be parsed, with column 3 and columns 86 to 90 ...
A = SwUn[findall(!ismissing, Birth), [3; 86:90]]
# ... then, among those rows, the respondents born after 1992.
A[collect(skipmissing(Birth)) .> 1992, :]

In [None]:
# ASSESSMENT 1
# Display years 1985-2010 of U trajectory for first ten male respondents born between 1995 and 2000
# NOTE(review): the code keeps birth years 1985 to 2000 and columns 83:88 (six
# columns), which does not match the wording above — confirm which one is intended.

# One mask for the whole selection. Comparisons against an unparsed (`missing`)
# birth year yield `missing`; `coalesce` maps those to `false`, so the separate
# non-missing filter no longer has to be recomputed for every condition.
B = SwUn[coalesce.(
    (SwUn[:, :q1_Sex] .== "m") .& (Birth .> 1984) .& (Birth .< 2001),
    false), :]
# `first` returns at most ten rows, where `B[1:10, ...]` threw a BoundsError
# whenever fewer than ten respondents matched.
first(B[:, 83:88], 10)

## 4. Defining sequence objects

In [None]:
# Axis labels
# One label per year, 1955 through 2012
X_lab1 = 1955:1:2012

In [None]:
# One label per year, 1950 through 2012
X_lab2 = 1950:1:2012

In [None]:
# What does this function do?
# `seqstatl` runs on the R side over columns 28 to 90 of SwUn; the R"..." macro
# hands the result back as an R object.
U_lab1=R"""seqstatl($SwUn[,28:90])"""

# It retrieves all existing labels in the subset

In [None]:
# Long state labels, listed in the same order as the short codes in U_lab3
U_lab2 = [
    "Not born",
    "Member",
    "Activist",
    "Activist senior",
    "Paid officer",
    "Paid officer senior",
    "Administrative",
    "Administrative senior",
    "no answer",
]

In [None]:
# Short state codes, one per entry of U_lab2
U_lab3 = [
    ".",
    "Mb",
    "Ac", "Ac+",
    "Po", "Po+",
    "Ad", "Ad+",
    "na",
]

In [None]:
# Select colours for sequences: one colour name per state, in the order of U_lab3.
# The same palette could be rebuilt in Julia as RGB triples (R's col2rgb gives the
# values), but the colours are only consumed on the R side, so the names are kept
# as plain strings.
U_col = [
    "white",
    "lightpink", "lightpink2", "lightpink3",
    "red", "red3",
    "chocolate1", "chocolate2",
    "gray80",
]


In [None]:
# Defining a population of sequences
# Columns 28 to 90 of the first 902 rows of SwUn are declared as the sequence
# variables. U_lab3 is passed both as `states` and as `alphabet`, U_lab2 as the long
# labels and U_col as the colour palette.
# NOTE(review): `xlab` does not appear to be a seqdef argument (`cnames` is the one
# that sets column labels) — confirm whether it has any effect here.
U_seq1=R"seqdef($SwUn[1:902,],28:90,states=$(U_lab3),labels=$(U_lab2),xlab=$(X_lab2),cpal=$(U_col),alphabet=$U_lab3)"

In [None]:
# Basic graph
# The trailing `;` keeps the notebook from echoing the value returned by the R call.
R"seqiplot($U_seq1)";

## 5. Summarizing sequence data

In [None]:
# Dimensions of the sequence object
size(U_seq1)

In [None]:
# Sequences in basic format (first five rows, indexed on the R side)
R"$U_seq1[1:5,]"

In [None]:
# A bit more info about the set of sequences
R"summary($U_seq1)"; #Need R macro to get details of sequence. Julia summary just returns type of RObject

In [None]:
# Other formats
# Same five sequences, printed by R once with format "SPS" and once with "STS"
R"""print($U_seq1[1:5,], format = "SPS")""";
R"""print($U_seq1[1:5,], format = "STS")""";

In [None]:
# Converting between formats
# Converts the whole sequence object (not just the first rows) from "STS" to "SPS"
R"""seqformat($U_seq1,from="STS",to="SPS")"""

In [None]:
# Frequencies
R"seqtab($U_seq1)"

In [None]:
# Frequency by year
# Cross-tabulation, done in Julia, of column u1995 against q1_Sex
freqtable(SwUn,:u1995,:q1_Sex)

In [None]:
# Successive states without duration (try it with first 10 individuals)
R"seqdss($U_seq1[1:10,])"

In [None]:
# Successive durations without states (try it with first 10 individuals)
R"seqdur($U_seq1[1:10,])"

In [None]:
# States and durations per individuals
R"seqistatd($U_seq1[1:10,])"

In [None]:
# Mean time spent in each state
R"seqmeant($U_seq1)"

In [None]:
# Same statistic restricted to the rows where q1_Sex equals "w"; the logical mask
# is computed in Julia and interpolated into the R row index.
R"""seqmeant($U_seq1[$(SwUn[:,:q1_Sex].=="w"),])"""

## 6. Visualising sequences

In [None]:
# Plotting ten sequences with legend
# NOTE(review): `l` is assigned here but never read in the visible code — confirm
# whether it is a leftover layout setting.
l=(3,2)
# Index plot (type "i") of sequences 1 to 10, sorted "from.end", with the years of
# X_lab2 as x tick labels; the legend is drawn by the separate seqlegend call.
R"""seqplot(seqdata=$U_seq1,type="i",idxs=1:10,sortv="from.end",
    main=NA,xtlab=$(X_lab2),with.legend=FALSE,ylab=NA,
    xlab="Year",cex.lab=1,cex.axis=1.2)""";
R"""seqlegend($U_seq1,ncol=2,cex=1.4,border=NA,bty="o")""";

In [None]:
# Improved graphical parameters
# Same ten sequences, now with a title, a y-axis label, no bar borders and a
# spacing of 0.1 between bars.
R"""seqplot(seqdata=$U_seq1,type="i",idxs=1:10,sortv="from.end",
    main="My great graph",xtlab=$(X_lab2),
    with.legend=FALSE,border=NA,space=0.1,
    xlab="Year",ylab="ID",cex.lab=1,cex.axis=1.2)""";

In [None]:
# Only women
# NOTE(review): `l` is reassigned here but never read in the visible code.
l=(1,1)
# The row mask (q1_Sex equal to "w") is computed in Julia and interpolated into the
# R index; `idxs=0` is used here in place of an explicit index range.
R"""seqplot(seqdata=$U_seq1[$(SwUn[:,:q1_Sex].=="w"),],type="i",idxs=0,sortv="from.end",
    main=NA,xtlab=$(X_lab2),with.legend=FALSE,border=NA,space=0,ylab=NA,
    xlab="Year",cex.lab=1,cex.axis=1.2)""";

In [None]:
# A random sample
# Fifty distinct row indices are drawn on the R side at every run; no seed is set,
# so the figure differs between runs. The upper bound is taken from the sequence
# object itself instead of a hard-coded 902, and `replace=FALSE` is spelled out
# rather than relying on partial matching of `rep` and on the reassignable `F`.
R"""seqplot(seqdata=$U_seq1[sample(nrow($U_seq1),50,replace=FALSE),],idxs=0,
    type="i",sortv="from.start",
    main="Fifty random cases",xtlab=$(X_lab2),
    with.legend=FALSE,border=NA,space=0.1,
    xlab="Year",ylab="ID",cex.lab=1,cex.axis=1.2)""";

In [None]:
# Women born between 1958 and 1960 only
# In R, `&` binds tighter than `|`, so the unparenthesised form `A | B | C & D`
# selected everyone born in 1958 or 1959 plus the women born in 1960. The three
# birth-year tests are now grouped before being combined with the sex test.
R"""seqplot(seqdata=$U_seq1[($(SwUn[:,:q2_Birth].=="1958") | $(SwUn[:,:q2_Birth].=="1959") | $(SwUn[:,:q2_Birth].=="1960")) & $(SwUn[:,:q1_Sex].=="w"),],
    idxs=0,type="i",sortv="from.start",
    main="Women born between 1958 and 1960",xtlab=$(X_lab2),
    with.legend=FALSE,border=NA,space=0.1,
    xlab="Year",ylab="ID",cex.lab=1,cex.axis=1.2)""";

In [None]:
# Other types of graphs
# State distribution plot (type "d"). The y axis of this plot is a frequency, not a
# respondent index, so the `ylab="ID"` copied from the index plots is dropped and
# the plot falls back to the default y-axis label.
R"""seqplot(seqdata=$U_seq1,type="d",
    main="Distribution plot",xtlab=$(X_lab2),
    with.legend=FALSE,border=NA,space=0,
    xlab="Year",cex.lab=1,cex.axis=1.2)""";

In [None]:
# Plot of type "ms" for the whole sample, without title, legend or y-axis label
R"""seqplot(seqdata=$U_seq1,type="ms",
    main=NA,xtlab=$(X_lab2),
    with.legend=FALSE,border=NA,ylab=NA,
    xlab="Year",cex.lab=1,cex.axis=1.2)""";

## 7. Optimal matching

In [None]:
# Calculating and comparing scosts (roughly)

# Substitution-cost matrix produced by TraMineR with method "CONSTANT"; the result
# stays an R object and is interpolated back into the seqdist calls below.
U_cost1=R"""(seqsubm($U_seq1,method="CONSTANT"))"""

In [None]:
# Hand-written 9x9 substitution-cost matrix. Rows and columns follow the order of
# U_lab3 (".", "Mb", "Ac", "Ac+", "Po", "Po+", "Ad", "Ad+", "na"): substitutions
# involving "." or "na" cost 0, and the remaining costs range from 1 to 4.
# The matrix is symmetric, so R's column-wise fill gives the layout written here.
# `dimnames` now use U_lab3, the alphabet given to seqdef, instead of U_lab1 (the
# seqstatl output), whose order and length need not match the nine states above.
U_cost2=R"""(matrix(c(
    0,0,0,0,0,0,0,0,0,
    0,0,1,2,3,4,4,4,0,
    0,1,0,1,2,3,4,4,0,
    0,2,1,0,1,2,4,4,0,
    0,3,2,1,0,1,4,4,0,
    0,4,3,2,1,0,4,3,0,
    0,4,4,4,4,4,0,2,0,
    0,4,4,4,4,3,2,0,0,
    0,0,0,0,0,0,0,0,0),
    nrow=9,ncol=9,dimnames=list($(U_lab3),$(U_lab3))))"""

In [None]:
# Calculating and comparing dissimilarities

# Optimal-matching ("OM") distances with the constant costs U_cost1 and indel=1.
# `rcopy` converts the R result to a Julia value so it can be indexed below.
U_dist1_1=rcopy(R"""seqdist($U_seq1,method="OM",indel=1,sm=$U_cost1)""");
# Rounded 6x6 window (rows and columns 30 to 35) as a quick look at the values
round.(U_dist1_1[30:35,30:35])

In [None]:
# Same costs, indel raised to 1.5
U_dist1_2=rcopy(R"""seqdist($U_seq1,method="OM",indel=1.5,sm=$U_cost1)""");
round.(U_dist1_2[30:35,30:35])

In [None]:
# Relative difference, in percent, between the two distance matrices on the
# 30:35 window. The 1e-15 added to the second magnitude keeps the denominator
# non-zero where both distances are 0.
let d_a = U_dist1_1[30:35, 30:35], d_b = U_dist1_2[30:35, 30:35]
    gap = abs.(d_a .- d_b)
    scale = max.(abs.(d_a), abs.(d_b) .+ 1e-15)
    (gap ./ scale) * 100
end

In [None]:
# Optimal-matching distances with the hand-written costs U_cost2 and indel=1.5,
# copied into Julia, followed by the rounded 30:35 window.
U_dist2=rcopy(R"""seqdist($U_seq1,method="OM",indel=1.5,sm=$U_cost2)""");
round.(U_dist2[30:35,30:35])

In [None]:
# Relative difference, in percent, between U_dist1_1 and U_dist2 on the 30:35
# window. The 1e-15 added to the second magnitude keeps the denominator non-zero
# where both distances are 0.
let d_a = U_dist1_1[30:35, 30:35], d_b = U_dist2[30:35, 30:35]
    gap = abs.(d_a .- d_b)
    scale = max.(abs.(d_a), abs.(d_b) .+ 1e-15)
    (gap ./ scale) * 100
end

## 8. Ordering sequences with multidimensional scaling

In [None]:
# We use dissimilarities to sort sequences with argument 'sortv'
# and function 'cmdscale':
# Here the sort key is computed from U_dist1_1 (constant costs, indel=1).

R"""seqplot(seqdata=$U_seq1,type="i",idxs=0,sortv=cmdscale($U_dist1_1,k=1),
    main="Sorted sequences",xtlab=$(X_lab2),
    with.legend=FALSE,border=NA,space=0,
    xlab="Year",ylab="ID",cex.lab=1,cex.axis=1.2)""";

# 'sortv' sets up a key to order the sequences
# from bottom to top of the graph.

# Interpret the graph: why this order?

# 'cmdscale' is a function for multidimensional scaling (MDS)
# that extracts the main "hidden factors" underlying the DM.
# 'k=1' indicates that we select the main factor, the one
# that would synthesise best the information contained
# in the overall sample of sequences, so as to bring a minimum
# of order into the graph.

In [None]:
# We try the same with different costs and check the change of order

# Sort key from U_dist1_2 (constant costs, indel=1.5)
R"""seqplot(seqdata=$U_seq1,type="i",idxs=0,sortv=cmdscale($U_dist1_2,k=1),
    main="Sorted sequences",xtlab=$(X_lab2),
    with.legend=FALSE,border=NA,space=0,
    xlab="Year",ylab="ID",cex.lab=1,cex.axis=1.2)""";

In [None]:
# Sort key from U_dist2 (hand-written costs U_cost2, indel=1.5)
R"""seqplot(seqdata=$U_seq1,type="i",idxs=0,sortv=cmdscale($U_dist2,k=1),
    main="Sorted sequences",xtlab=$(X_lab2),
    with.legend=FALSE,border=NA,space=0,
    xlab="Year",ylab="ID",cex.lab=1,cex.axis=1.2)""";

## 9. Clustering

In [None]:
# Clustering the dissimilarity matrix (best scosts)
# U_dist2 was copied into Julia with `rcopy`, so it reaches R as a plain matrix and
# not as a "dist" object. Without `diss=TRUE`, agnes treats a plain matrix as an
# observations-by-variables table and would cluster on distances computed between
# its rows; `diss=TRUE` makes it use the optimal-matching dissimilarities directly.
U_ward=R"""agnes($U_dist2,diss=TRUE,method="ward")"""

In [None]:
# Exploring the clustering tree (try horizontal tree with readable labels)
# `which.plots=2` selects a single plot of the agnes object and `ask=F` disables
# the interactive prompt.
R"""plot($U_ward,ask=F,which.plots=2)""";

In [None]:
# Selecting cuts in the tree
# Cut the tree into 7 groups on the R side ...
U_clf7=R"cutree($U_ward,k=7)"
# ... turn the memberships into a factor with levels 1 to 7 and copy it to Julia,
# overwriting the intermediate R object held in U_clf7.
U_clf7=rcopy(R"factor($(U_clf7),levels=c(1,2,3,4,5,6,7))")
# Number of sequences in each of the 7 clusters
freqtable(U_clf7)

In [None]:
# Same steps for a cut into 4 groups
U_clf4=R"cutree($U_ward,k=4)"
U_clf4=rcopy(R"factor($(U_clf4),levels=c(1,2,3,4))")
freqtable(U_clf4)

# The number of clusters corresponds to a certain vertical cut
# in the horizontal tree. The best cut is made according to
# several factors: the length of the gap between two successive nodes
# in the tree; statistics that estimate the trade-off between
# maximum contrast between clusters, and minimum contrast within clusters;
# and interpretation of the clusters, which need to make sense
# regarding the research question and your knowledge of the topic.

In [None]:
# seqmsplot of the sequences, with the 4-cluster membership U_clf4 passed as the
# second argument and the legend switched on.
R"""seqmsplot($U_seq1,$(U_clf4),main="Cluster",xtlab=$(X_lab2),
    with.legend=T,border=NA,xlab="Year",
    cex.lab=1,cex.axis=1.2,cex.legend=1.3)""";

# Note that some of the arguments used for iplots
# are not relevant for dplots and msplots

In [None]:
# Describing clusters statistically by means of a nominal variable

# R cross-tabulates q1_Sex against the 4-cluster membership and passes the table to
# `cprop` with `total=T`; `rcopy` brings the result back into Julia.
U_Sex=rcopy(R"""cprop(table($(SwUn[:,:q1_Sex]),$(U_clf4)),total=T)""")

# Interpret the result
# The proportion of women is higher in clusters 3, then 1
# The proportion of party members is lower in cluster 3
# This should be interpreted with regard to the clusters'
# sequential profiles, as visible in the graphs.