In [1]:
# install.packages("textstem")
# remotes::install_github("kiernann/srt")
library(srt)
library(tm)
library(tidyverse)
library(textstem)
library(tidytext)

"package 'tm' was built under R version 4.2.3"
Loading required package: NLP

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.0     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.1     [32m✔[39m [34mtibble   [39m 3.1.8
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mggplot2[39m::[32mannotate()[39m masks [34mNLP[39m::annotate()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m     masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m        masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

In [2]:
filenames <- list.files(path="./GilmoreGirls",
    pattern="*.srt")

head(filenames)

In [3]:
prefix = paste(getwd(),"/GilmoreGirls/",sep="")

In [19]:
#Create Meta table
meta = data.frame(matrix(ncol=16,nrow=0))
colnames(meta) = c('n','Episode_Code','Title','Season','Episode','Total_Words','Total_Time',
        'WPM_0th','WPM_25th','WPM_50th','WPM_75th','WPM_100th',
        'WPM_Mean','WPM_Variance','Filename','Corpus')
head(meta)

n,Episode_Code,Title,Season,Episode,Total_Words,Total_Time,WPM_0th,WPM_25th,WPM_50th,WPM_75th,WPM_100th,WPM_Mean,WPM_Variance,Filename,Corpus
<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>


In [20]:
for (i in 1:2){#length(filenames)){

    episodeCode = str_extract(filenames[i],"[0-9]+.*?[0-9]+")
    title = str_extract(filenames[i],"(?<=[0-9]\\s-\\s).*(?=\\.en)")
    season = str_extract(filenames[i],"(?<=(\\s-\\s)).*?(?=x)")
    episode = str_extract(filenames[i],"(?<=([0-9]x)).*?(?=\\s)")

    df = read_srt(paste(prefix,filenames[i],sep=""))
    df = df[-nrow(df),]

    dropPunct <- (function (x) gsub('[[:punct:] ]+',' ',x))
    dropNL <- (function(x) gsub("[\n']", " ",x))
    # countWords <- (function (x) x$cumWords)
    dropStopWords <- (function (x) removeWords(x,stopwords('english')))

    df = df %>% mutate(subtitle = lapply(subtitle,tolower)) %>%
        mutate(cleanWords = lapply(subtitle,dropNL)) %>%
        mutate(cleanWords = lapply(cleanWords,dropPunct)) %>%

        mutate(deepCleanWords = lapply(cleanWords,removePunctuation)) %>%
        mutate(deepCleanWords = lapply(deepCleanWords,removeWords,stopwords('english'))) %>%
        mutate(deepCleanWords = lapply(deepCleanWords, textstem::lemmatize_strings)) %>%

        # mutate(deepCleanWords = as.character(deepCleanWords)) %>%

        mutate(end = end/60) %>%
        mutate(cumWordCount = cumsum(sapply(cleanWords, function(x) length(unlist(strsplit(x, " ")))))) %>%
        mutate(WPM = cumWordCount/(end))

    # write.csv(df,episode)

    sum <- array(summary(df$WPM),c(1,6))

    metaRow = data.frame(
        i,episodeCode,title,season,episode,
        as.integer(tail(df,1)$cumWordCount), tail(df,1)$end,
        sum[1],sum[2],sum[3],sum[5],sum[6],sum[4],var(df$WPM),filenames[i],
        paste(df$deepCleanWords,collapse = " ")
    )

    colnames(metaRow) = c('n','Episode_Code','Title','Season','Episode','Total_Words','Total_Time',
        'WPM_0th','WPM_25th','WPM_50th','WPM_75th','WPM_100th',
        'WPM_Mean','WPM_Variance','Filename','Corpus')

    meta = rbind(meta,metaRow)
    
    #wascommented
    # print(corpus)
    # # noPunct <- tm_map(corpus,removePunctuation)
    # # noStopWords <- tm_map(noPunct,removeWords,stopwords('english'))
    # # lemmatize <- tm_map(noStopWords,lemmatize_strings)

}

In [21]:
print(meta)

  n Episode_Code                              Title Season Episode Total_Words
1 1         1x01                              Pilot      1      01        5651
2 2         1x02 The Lorelais' First Day at Chilton      1      02        6156
  Total_Time   WPM_0th WPM_25th WPM_50th WPM_75th WPM_100th WPM_Mean
1   43.29222  2.235594 120.2961 124.7192 127.6549  131.8994 119.1863
2   42.25917 52.349369 125.5234 137.7252 143.9139  147.2690 131.1509
  WPM_Variance                                                         Filename
1     315.8067                              Gilmore Girls - 1x01 - Pilot.en.srt
2     320.7577 Gilmore Girls - 1x02 - The Lorelais' First Day at Chilton.en.srt
                                                                                                                                                                                                                                                                                                                            

In [None]:
meta['WPM_Total'] = (meta$Total_Words/meta$Total_Time)
head(meta)
# tail(meta)
unique(meta$Season)

In [None]:
ggplot(data = meta, aes(x = n, y = WPM_Total, color = Season)) + 
  geom_point() +
  xlab("Episode Number") +
  ylab("Words Per Minute") +
  ggtitle("Gilmore Girls Episodes WPM vs Episode Count")

ggplot(data = meta, aes(x = n, y = WPM_Mean, color = Season)) + 
  geom_point() +
  xlab("Episode Number") +
  ylab("Words Per Minute") +
  ggtitle("Gilmore Girls Episodes AVERAGE WPM vs Episode Count")

ggplot(data = meta, aes(x = n, y = WPM_50th, color = Season)) + 
  geom_point() +
  xlab("Episode Number") +
  ylab("Words Per Minute") +
  ggtitle("Gilmore Girls Episodes AVERAGE WPM vs Episode Count")

In [None]:
#Corpus Building 
corpus <- Corpus(VectorSource(c("")))
dtm <- DocumentTermMatrix(corpus)
dtm

In [None]:
seasons = unique(meta$Season)
metaBySeason = vector("list",length(seasons))

for (i in 1:length(seasons)){
    metaBySeason[[i]] = select(subset(meta, Season == seasons[i]), n,Episode_Code,Title,Season,Episode,Filename)
}

# head(metaBySeason,3)

# for Each Season
for (season in 1:length(seasons)){
    for (episode in 1:length(metaBySeason[[season]])){
        x = filter(meta,Season==season,Episode==1)
        print(x$subtitle)
    }
}