Library Imports

Goal: Create a data pipeline for studying the subtitles of TV shows.

In [18]:
suppressPackageStartupMessages(library(srt))
suppressPackageStartupMessages(library(tm))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(textstem))
suppressPackageStartupMessages(library(tidytext))
suppressPackageStartupMessages(library(zoo))
suppressPackageStartupMessages(library(dplyr))

Get the names of our files for reading.

In [19]:
show = "GilmoreGirls"

loc <- paste("./TelevisionSubs/",show,sep = "")
filenames <- list.files(path=loc,pattern="*.srt")

print(loc)
head(filenames)

[1] "./TelevisionSubs/GilmoreGirls"


Create Meta-table to aggregate data from all the episodes 

In [20]:
meta <- data.frame(
    episodeName=character(),
    fullSubtitles=character(),
    subtitleCount=numeric(),
    wordCount=numeric(),
    runTime=numeric(),
    speakingTime=numeric(),

    averageWPM=numeric(),
    stdWPM=numeric(),

    averageSubtitleLength=numeric(),
    stdSubtitleLength=numeric(),

    averageSubtitleWPM = numeric(),
    stdSubtitleWPM = numeric(),

    averageduration = numeric(),
    stdDuration = numeric(),

    stringsAsFactors = FALSE
)

Now the main function to create a file for each episode, as well as adding rows to our meta table

In [21]:
readFileByIndex <- function(i){

    # some variables parsed from filenames
    episodeCode <- str_extract(filenames[i],"[0-9]+.*?[0-9]+")
    title <- str_extract(filenames[i],"(?<=[0-9]\\s-\\s).*(?=\\.en)")
    season <- str_extract(filenames[i],"(?<=(\\s-\\s)).*?(?=x)")
    episode <- str_extract(filenames[i],"(?<=([0-9]x)).*?(?=\\s)")
    prefix <- paste(getwd(),"/TelevisionSubs/",show,"/",sep="")
     
    print(prefix)

    df <- read_srt(paste(prefix,filenames[i],sep=""))
    df <- df[-nrow(df),] #drop last row because it's credit information

    # some functions to clean up
    dropPunct <- (function (x) gsub('[[:punct:]]+','',x))
    dropNL <- (function(x) gsub("\n", " ", x))

    # create new csv for
    df <- df %>% 
        mutate(subtitle = sapply(subtitle,tolower)) %>%

        # cleanWords will clean up punctuation 
        mutate(cleanWords = sapply(subtitle,dropNL)) %>%
        mutate(cleanWords = sapply(cleanWords,removePunctuation)) %>%

        mutate(wordCount = sapply(strsplit(cleanWords,"\\s+"),length)) %>%
        mutate(cumWordCount = cumsum(wordCount)) %>%

        # deepCleanWords, for advanced processing
        # mutate(deepCleanWords = sapply(cleanWords,removeWords,stopwords('english'))) %>%
        # mutate(deepCleanWords = sapply(deepCleanWords, textstem::lemmatize_strings))
        
        mutate(minute = (1+floor(end / 60))) %>%
        mutate(duration = (end-start)/ 60) %>%
        mutate(subtitleWPM = wordCount/duration) %>% # All words / total time
        mutate(cumTimeSpeaking = cumsum(duration)) %>%
        mutate(rollingWPM = rollmean(cumWordCount/(end/60),k=5,fill=0)) 
        # mutate(rollingWPMSpeaking = rollmean(cumWordCount/cumTimeSpeaking,k=10,fill=0)) # All words / time talking

    # assemble row for Meta table
    temp <- data.frame(
        episodeName     = paste(episodeCode,' - ',title,collapse = ""),
        subtitleCount   = nrow(df),
        fullSubtitles   = paste(df$subtitle,collapse = ' '),
        totalWordCount   = sum(sapply(strsplit(paste(df$subtitle),"\\s+"), length)),
        runTime         = df$end[length(df$end)],
        speakingTime    = df$cumTimeSpeaking[length(df$cumTimeSpeaking)],

        averageWPM      = mean(df$rollingWPM, na.rm = TRUE),
        stdWPM          = sd(df$rollingWPM, na.rm = TRUE),

        averageSubtitleLength = mean(df$wordCount, na.rm = TRUE),
        stdSubtitleLength = sd(df$wordCount, na.rm = TRUE),

        averageSubtitleWPM = mean(df$subtitleWPM, na.rm = TRUE),
        stdSubtitleWPM  = sd(df$subtitleWPM, na.rm = TRUE),

        averageDuration = mean(df$duration, na.rm = TRUE),
        stdDuration     = mean(df$duration, na.rm = TRUE)
    )

    write.csv(df,paste('episode_analysis/',show,'/',episodeCode,' - ',title,sep=""))

    meta <<- rbind(meta,temp)


    return(df)
}

Check data, and save a copy to double check it in a csv reader.

In [24]:
for (i in 1:length(filenames)){
    testdf <- readFileByIndex(i)
    # if (i %% 10){
    #     print(tail(testdf,1))
    # }
}

metaSkipSubs <- meta[,!(names(meta) %in% "fullSubtitles")]
head(metaSkipSubs)

loc <- paste('./episode_analysis/',show,'/_meta.csv',sep="")
print(loc)
write.csv(meta,loc)

[1] "d:/Projects/Chattiness/TelevisionSubs/GilmoreGirls/"


"cannot open file 'episode_analysis/ GilmoreGirls / 1x01  -  Pilot': No such file or directory"


ERROR: Error in file(file, ifelse(append, "a", "w")): cannot open the connection


In [None]:
# paste(testdf$cleanWords,collapse = " ")

plot(testdf$start/60,testdf$subtitleWPM, xlab="Time (Minutes)",ylab="WPM", main='Words Per Minute in pilot',
 col='gray',ylim=c(50,500))

       lines(testdf$start/60,testdf$rollingWPM,col="blue")
       # lines(testdf$start/60,testdf$rollingWPMSpeaking,col="red")
       legend("topright", legend = c("Subtitle WPM (Instantaneous)", "Rolling WPM Total"),
              col = c("gray", "blue"), lwd = 2)

In [None]:
head(testdf[testdf$subtitleWPM >300,])