Library Imports

Goal: Create a data pipeline for studying the subtitles of TV shows.

In [11]:
suppressPackageStartupMessages(library(srt))
suppressPackageStartupMessages(library(tm))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(textstem))
suppressPackageStartupMessages(library(tidytext))
suppressPackageStartupMessages(library(zoo))
suppressPackageStartupMessages(library(dplyr))

Get the names of our files for reading.

In [12]:
filenames <- list.files(path="./GilmoreGirls",pattern="*.srt")
# head(filenames)

Create Meta-table to aggregate data from all the episodes 

In [13]:
meta <- data.frame(
    episodeName=character(),
    subtitleCount=numeric(),
    wordCount=numeric(),
    runTime=numeric(),
    speakingTime=numeric(),
    averageWPM=numeric(),
    stdWPM=numeric(),
    averageSubtitleLength=numeric(),
    stdSubtitleLength=numeric(),
    fullSubtitles=character(),
    averageSubtitleWPM = numeric(),
    stdSubtitleWPM = numeric(),
    stringsAsFactors = FALSE
)

Now the main function to create a file for each episode, as well as adding rows to our meta table

In [14]:
readFileByIndex <- function(i){

    # some variables parsed from filenames
    episodeCode <- str_extract(filenames[i],"[0-9]+.*?[0-9]+")
    title <- str_extract(filenames[i],"(?<=[0-9]\\s-\\s).*(?=\\.en)")
    season <- str_extract(filenames[i],"(?<=(\\s-\\s)).*?(?=x)")
    episode <- str_extract(filenames[i],"(?<=([0-9]x)).*?(?=\\s)")
    prefix <- paste(getwd(),"/GilmoreGirls/",sep="")
     

    df <- read_srt(paste(prefix,filenames[i],sep=""))
    df <- df[-nrow(df),] #drop last row because it's credit information

    # some functions to clean up
    dropPunct <- (function (x) gsub('[[:punct:]]+','',x))
    dropNL <- (function(x) gsub("\n", " ", x))

    # create new csv for
    df <- df %>% 
        mutate(subtitle = sapply(subtitle,tolower)) %>%

        # cleanWords will clean up punctuation 
        mutate(cleanWords = sapply(subtitle,dropNL)) %>%
        mutate(cleanWords = sapply(cleanWords,removePunctuation)) %>%

        mutate(wordCount = sapply(strsplit(cleanWords,"\\s+"),length)) %>%
        mutate(cumWordCount = cumsum(wordCount)) %>%

        # deepCleanWords, for advanced processing
        # mutate(deepCleanWords = sapply(cleanWords,removeWords,stopwords('english'))) %>%
        # mutate(deepCleanWords = sapply(deepCleanWords, textstem::lemmatize_strings))
        
        mutate(minute = (1+floor(end / 60))) %>%
        mutate(subtitleLength = end-start) %>%
        mutate(subtitleWPM = wordCount/subtitleLength) %>% # All words / total time
        mutate(cumTimeSpeaking = cumsum(subtitleLength)) %>%
        mutate(rollingWPM = rollmean(cumWordCount/end,k=10,fill=0)) 
        # mutate(rollingWPMSpeaking = rollmean(cumWordCount/cumTimeSpeaking,k=10,fill=0)) # All words / time talking

    write.csv(df,paste('episode_analysis/',episodeCode,' - ',title,collapse=""))


    # assemble row for Meta table
    temp <- data.frame(
        episodeName     = paste(episodeCode,' - ',title,collapse = ""),
        subtitleCount   = length(df),
        fullSubtitles   = paste(df$subtitle,collapse = ' '),
        wordCount       = sum(sapply(strsplit(paste(df$subtitle),"\\s+"), length)),
        runTime         = df$end[length(df$end)],
        speakingTime    = df$cumTimeSpeaking[length(df$cumTimeSpeaking)],
        averageWPM      = mean(df$rollingWPM, na.rm = TRUE),
        stdWPM          = sd(df$rollingWPM, na.rm = TRUE),
        averageSubtitleLength = mean(sapply(strsplit(paste(df$subtitle),"\\s+"), length)),
        stdSubtitleLength = sd(sapply(strsplit(paste(df$subtitle),"\\s+"), length)),
        averageSubtitleWPM = mean(df$subtitleWPM),
        stdSubtitleWPM = sd(df$subtitleWPM)
    )

    meta <<- rbind(meta,temp)


    return(df)
}

Check data, and save a copy to double check it in a csv reader.

In [15]:
for (i in 1:length(filenames)){
    readFileByIndex(i)
}

metaSkipSubs <- meta[,!(names(meta) %in% "fullSubtitles")]
head(metaSkipSubs)

write.csv(meta,paste('meta.csv',collapse=""))



Unnamed: 0_level_0,episodeName,subtitleCount,wordCount,runTime,speakingTime,averageWPM,stdWPM,averageSubtitleLength,stdSubtitleLength,averageSubtitleWPM,stdSubtitleWPM
Unnamed: 0_level_1,<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1x01 - Pilot,12,5443,2597.533,1722.05,1.835775,0.322224,7.969253,3.601554,2.980968,0.9143331
2,1x02 - The Lorelais' First Day at Chilton,12,5930,2535.55,1789.262,2.015317,0.355546,7.864721,3.674723,3.112435,0.9657955
3,1x03 - Kill Me Now,12,6453,2579.825,1906.31,2.526197,0.3240043,8.524439,3.525568,3.219969,0.9218293
4,1x04 - The Deer Hunters,12,6446,2623.68,1932.16,2.264309,0.3357959,8.211465,3.512532,3.182449,0.9301802
5,1x05 - Cinnamon's Wake,12,5803,2531.652,1764.392,2.165844,0.3500775,8.161744,3.628603,3.101427,0.9422581
6,1x06 - Rory's Birthday Parties,12,6335,2605.976,1882.171,2.449102,0.2969974,8.435419,3.53433,3.208177,0.9100049


In [16]:
# paste(testdf$cleanWords,collapse = " ")
plot(testdf$start/60,testdf$subtitleWPM, xlab="Time (Minutes)",ylab="WPM", main='Words Per Minute in pilot of Gilmore Girls',
 col='gray')
lines(testdf$start/60,testdf$rollingWPM,col="blue")
# lines(testdf$start/60,testdf$rollingWPMSpeaking,col="red")
legend("topright", legend = c("Subtitle WPM (Instantaneous)", "Rolling WPM Total"),
       col = c("gray", "blue"), lwd = 2)

ERROR: Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'plot': object 'testdf' not found
