In [139]:
import dask
from dask import dataframe as df

# Recommender page
# https://cseweb.ucsd.edu/~jmcauley/datasets.html#interview

# Required CSVs must be downloaded here. A Kraggle account is required
# https://www.kaggle.com/datasets/shuyangli94/interview-npr-media-dialog-transcripts

# Set CSVs paths
episodes_csv_path = '/content/drive/MyDrive/Colab Notebooks/DATA301 Project Files/episodes.csv'
utterances_csv_path = '/content/drive/MyDrive/Colab Notebooks/DATA301 Project Files/utterances.csv'

episodes_df = df.read_csv(episodes_csv_path)
utterances_df = df.read_csv(utterances_csv_path)

In [140]:
# What the episode data looks like
episodes_df.head(n=5)

Unnamed: 0,id,program,title,episode_date
0,98814,Morning Edition,Senate Ushers In New Year With 'Fiscal Cliff' ...,2013-01-01
1,98824,Morning Edition,Cheap Bubbly Or Expensive Sparkling Wine? Look...,2012-12-31
2,98821,Morning Edition,U.S. Gas Prices Reach Record Level In 2012,2013-01-01
3,98806,Morning Edition,House Approves 'Fiscal Cliff' Measure,2013-01-02
4,98823,Morning Edition,The Fiscal Cliff: A Love Story,2012-12-31


In [141]:
# What the utterance data looks like
utterances_df.head(n=5)

Unnamed: 0,episode,episode_order,speaker,utterance
0,57264,9,"Ms. LOREN MOONEY (Editor-in-Chief, Bicycling M...","It's a 2,200-mile race. To give some sense of ..."
1,57264,10,"Ms. LOREN MOONEY (Editor-in-Chief, Bicycling M...",So for a top competitor like Lance to try to m...
2,57264,11,"NEAL CONAN, host","So in every team, presumably there's one star,..."
3,57264,12,"Ms. LOREN MOONEY (Editor-in-Chief, Bicycling M...",That's right. Each team has nine riders. And w...
4,57264,13,"NEAL CONAN, host","So slipstream, this is like drafting in car ra..."


In [142]:
# Keep only necessary columns
episodes_df_cols_to_keep = ['id', 'episode_date']
utterances_df_cols_to_keep = ['episode', 'episode_order', 'utterance']
merged_df_cols_to_keep = ['episode', 'episode_order', 'episode_date', 'utterance']

# Merge data frames
merged_df = df.merge(episodes_df[episodes_df_cols_to_keep], utterances_df[utterances_df_cols_to_keep],
                     left_on='id', right_on='episode')[merged_df_cols_to_keep]

In [143]:
# Result of above
merged_df.head(n=5)

Unnamed: 0,episode,episode_order,episode_date,utterance
0,98873,1,2012-12-28,On China's Internet over the past couple of mo...
1,98873,2,2012-12-28,The most sensational Web take-down came last m...
2,98873,3,2012-12-28,(SOUNDBITE OF MUSIC)
3,98873,4,2012-12-28,"Zhu Ruifeng, who runs a muckraking website, sp..."
4,98873,5,2012-12-28,(Foreign language spoken)


In [144]:
# Filter out rows that do not contain 'gaming'
gaming_df = merged_df[merged_df['utterance'].str.contains('gaming')]

In [145]:
# Result of above
gaming_df.head(n=5)

Unnamed: 0,episode,episode_order,episode_date,utterance
3595,100592,26,2014-03-31,"Even Rooney concedes, however, that the days o..."
4426,101019,11,2012-06-29,It was not until Stapleton and I were standing...
6477,102179,25,2012-03-20,Donald Snyder is a former gaming industry exec...
8010,102899,2,2012-01-24,It's a weekday but plenty of people are sittin...
8016,102899,8,2012-01-24,Malloy is negotiating a deal that might put th...


In [146]:
# Import library to be used for filtering out irrelevant words
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

# Custom stop words
custom_stopwords = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
    'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
    'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
    'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
    'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
    'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now',
    'gaming', 'game', 'games', 'know', 'like', 'thats', 'people', 'think', 'dont', 'also',
    'theyre', 'tribes', 'get', 'say', 'says', 'theres', 'us', 'get', 'lot', 'years', 'video'}

stop_words = set(stopwords.words('english')).union(custom_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [147]:
# Converts strings to lowercase
def make_lower_case(utterance):
  return utterance.lower()

# Remove punctuation marks from strings
def remove_punctuation(utterance):
  return utterance.translate(str.maketrans('', '', string.punctuation))

# Remove stopwords from strings
def remove_stopwords(utterance):
  utterance_tokens = word_tokenize(utterance)
  filtered_utterance = [word for word in utterance_tokens if word not in stop_words]
  return ' '.join(filtered_utterance)

In [148]:
# Apply above functions on the utterance column to clean utterances
gaming_df['utterance'] = gaming_df['utterance'].apply(make_lower_case, meta=('string'))
gaming_df['utterance'] = gaming_df['utterance'].apply(remove_punctuation, meta=('string'))
gaming_df['utterance'] = gaming_df['utterance'].apply(remove_stopwords, meta=('string'))



In [149]:
# Convert date strings to datetime data type for simpler filtering
gaming_df['episode_date'] = df.to_datetime(gaming_df['episode_date'], format='%Y-%m-%d')

In [150]:
# Need library to convert single date to datetime data type
import pandas as pd

# Dataframe of episodes between 1999 and 2009 (inclusive)
gaming_1999_to_2009_df = gaming_df.loc[(gaming_df['episode_date'] >= pd.to_datetime('1999-01-01')) & (gaming_df['episode_date'] <= pd.to_datetime('2009-12-31'))].compute()

# Dataframe of episodes between 2010 and 2019 (inclusive)
gaming_2010_to_2019_df = gaming_df.loc[(gaming_df['episode_date'] >= pd.to_datetime('2010-01-01')) & (gaming_df['episode_date'] <= pd.to_datetime('2019-12-31'))].compute()

In [151]:
# Result of above
gaming_1999_to_2009_df.head(n=5)

Unnamed: 0,episode,episode_order,episode_date,utterance
45679,121036,4,2008-02-08,writers guild created award try raise profile ...
45693,121036,18,2008-02-08,crash titans witcher dead head fred round nomi...
49320,122811,4,2009-11-23,yeah got thinking nintendo last couple theyve ...
57424,126776,1,2006-11-13,mondays business report focuses technology tod...
58790,127475,3,2009-01-05,time weekly technology segment tech considered...


In [152]:
# Result of above
gaming_2010_to_2019_df.head(n=5)

Unnamed: 0,episode,episode_order,episode_date,utterance
3595,100592,26,2014-03-31,even rooney concedes however days greyhound ra...
4426,101019,11,2012-06-29,stapleton standing conference room tv benefit ...
6477,102179,25,2012-03-20,donald snyder former industry executive hes de...
8010,102899,2,2012-01-24,weekday plenty sitting slot machines playing t...
8016,102899,8,2012-01-24,malloy negotiating deal might put charge conne...


In [153]:
# Convert above Dask data frames to Dask bags
from dask import bag as db

gaming_utterance_1999_to_2009_bag = db.from_sequence(gaming_1999_to_2009_df['utterance'].to_list())
gaming_utterance_2010_to_2019_bag = db.from_sequence(gaming_2010_to_2019_df['utterance'].to_list())

In [154]:
# The A-Priori algorithm
def a_priori(utterance_bag, support=40):
  # Create frequency dictionary of unique words
  step_1 = dict(utterance_bag.map(lambda x: x.split(" ")[:-1]).flatten().frequencies().filter(lambda x: x[1] > support).compute())

  # Pairing words
  def find_pairs(text_line):
    pairs = []
    basket = text_line.split(" ")[:-1]
    for i in range(len(basket)):
      for j in range(i + 1, len(basket)):
        if basket[i] in step_1 and basket[j] in step_1 and basket[i] != basket[j]:
          pairs.append(tuple(sorted((basket[i], basket[j]))))
    return pairs

  # Find frequency of pairs
  step_2_frequencies = dict(utterance_bag.map(find_pairs).flatten().frequencies())

  step_2_prep = [(word_pair[0], word_pair[1], frequency) for word_pair, frequency in step_2_frequencies.items()]

  step_2 = df.from_pandas(pd.DataFrame(step_2_prep, columns=['First Word', 'Second Word', 'Frequency']), npartitions=1)
  return step_2

In [155]:
# Run the algorithm for 1999-2009
print("Frequent Words Pairs Used Alongside 'Gaming', 1999-2009")
print(a_priori(gaming_utterance_1999_to_2009_bag).compute())

Frequent Words Pairs Used Alongside 'Gaming', 1999-2009
  First Word Second Word  Frequency
0        one      really         20
1   industry         one         10
2   industry       would         12
3   industry      really         18
4        one       would         19
5     really       would         22


In [156]:
# Run the algorithm for 2010-2019
print("Frequent Words Pairs Used Alongside 'Gaming', 2010-2019")
print(a_priori(gaming_utterance_2010_to_2019_bag).compute())

Frequent Words Pairs Used Alongside 'Gaming', 2010-2019
   First Word Second Word  Frequency
0       going         one          8
1         new      online         12
2       going         new          6
3       going      online          7
4         one      really         16
5         one      online          6
6      online      really          7
7         new      really          5
8      really        well          8
9         one        well         10
10     online        well          3
11        new         one          5
12      going        well         11
13      going      really          5
14        new        well          5
