In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from glob import glob

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from collections import Counter

import re
import pickle

# Load NLTK's English stop-word list for data cleaning, downloading the
# stopwords corpus first when it is not installed yet.
try:
    stop = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop = stopwords.words('english')

# Add 'wa' as ONE extra stop word (presumably the stemmed form of "was" --
# TODO confirm). list.extend('wa') would iterate over the string and add the
# single characters 'w' and 'a' instead, so append() is used here.
stop.append('wa')

In [None]:
# Root directory that contains one sub-folder per channel
path = r"PATH/TO/CHANNEL/FOLDERS/"

# Fetch the NLTK resources used by this notebook
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)

# Porter stemmer instance shared by the stemming cell further below
stemmer = PorterStemmer()

# Every channel folder directly under `path`; `j` is the folder count that the
# corpus-building loop uses for its progress report
dirs = glob(path+"*/", recursive=True)
j = len(dirs)
print('Folders found:', j)

## Cleaning

In [None]:
# DEBUG CELL FOR MANUAL SELECTION
# Builds a quick corpus from only the channel folders named in `users`.
users = ['Enter users of interest here to manually select them for a quick corpus']
files = []
c = []
corpus = []

for user in users:
  files.extend(glob(path + user + "/*.csv", recursive=True))

for f in files:
    print(f)
    if f.endswith('.pkl'):
      # NOTE(review): unpickling can execute arbitrary code -- only load trusted files
      c = pd.read_pickle(f)
      col = 'body'
    elif f.endswith('.xlsx'):
      c = pd.read_excel(f)
      # The message column of .xlsx exports is not known here; assume it uses
      # one of the column names of the other formats -- TODO confirm.
      col = next((name for name in ('comment', 'body') if name in c.columns), '')
    elif f.endswith('.csv'):
      if f.endswith('.txt.csv'):
        try:
          c = pd.read_csv(f, usecols=range(3), lineterminator='\n', quoting=3)
        except UnicodeDecodeError:
          # Not valid UTF-8: retry with a single-byte encoding
          c = pd.read_csv(f, usecols=range(3), lineterminator='\n', quoting=3, encoding="ISO-8859-1")
        # Lines are split on '\n' only, so a CRLF file leaves '\r' in the header
        c.columns = c.columns.str.replace('\r','')
      else:
        c = pd.read_csv(f, index_col='Unnamed: 0')
      col = 'comment'
    else:
      continue

    # Skip (visibly) instead of raising KeyError when the expected message
    # column is absent, so one odd file does not abort the whole run.
    if col not in c.columns:
      print('Skipping file without a known message column:', f)
      continue

    c[col] = c[col].str.replace('\r','')
    corpus.extend(c[col])

In [None]:
# Clean files and build corpus
vocab = dict() # For data visualization (word counts are computed after stemming, further below)
corpus = [] # For model training
skip = True # Used with debug line

for i, folder in enumerate(dirs, start=1):
  # Last path component of the folder, whichever separator glob returned
  # (splitting on '\\' alone raises IndexError on POSIX-style paths).
  channel = re.split(r'[\\/]', folder.rstrip('\\/'))[-1]
  print('================== Reading files for channel:', channel)
  # The percentage reports the share of folders already completed
  print(f' *** This is folder number {i} of {j} ({ int((i - 1) / j * 100) }%)')
  files = glob(folder+"/*", recursive=True)

  # Debug line to catch problematic files
  # if [Condition]:
  #   skip = False
  # if skip is True:
  #   continue

  # Handle various formats for Twitch chat datasets
  for cur_file in files:
    if cur_file.endswith('.pkl'):
      # NOTE(review): unpickling can execute arbitrary code -- only load trusted files
      c = pd.read_pickle(cur_file)
      col = 'body'
    elif cur_file.endswith('.xlsx'):
      c = pd.read_excel(cur_file)
      # The message column of .xlsx exports is not known here; assume it uses
      # one of the column names of the other formats -- TODO confirm.
      col = next((name for name in ('comment', 'body') if name in c.columns), '')
    elif cur_file.endswith('.csv'):
      if cur_file.endswith('.txt.csv'):
        try:
          c = pd.read_csv(cur_file, usecols=range(3), lineterminator='\n', quoting=3)
        except UnicodeDecodeError:
          # Not valid UTF-8: retry with a single-byte encoding
          c = pd.read_csv(cur_file, usecols=range(3), lineterminator='\n', quoting=3, encoding="ISO-8859-1")
        # Lines are split on '\n' only, so a CRLF file leaves '\r' in the header
        c.columns = c.columns.str.replace('\r','')
      else:
        c = pd.read_csv(cur_file, index_col='Unnamed: 0')
      col = 'comment'
    else:
      continue

    # Skip (visibly) instead of raising KeyError when the expected message
    # column is absent, so one odd file does not abort the whole build.
    if col not in c.columns:
      print('  Skipping file without a known message column:', cur_file)
      continue

    for msg in c[col]:
      # ADD TO CORPUS
      # Strip carriage returns from text messages, as the manual debug cell
      # does; missing / non-text values are passed through unchanged.
      corpus.append(msg.replace('\r', '') if isinstance(msg, str) else msg)

In [None]:
# Save cleaned data
# Each object is pickled into its own file (binary pickle despite the .txt name).
for target, payload in (('vocab.txt', vocab), ('corpus.txt', corpus)):
    with open(target, 'wb') as out_file:
        pickle.dump(payload, out_file)

In [None]:
# Load cleaned data (if necessary)
# NOTE(review): pickle.load can execute arbitrary code -- only load files you created yourself.
with open('vocab.txt', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

with open('corpus.txt', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

In [None]:
# Drop missing messages and make every remaining message a string.
# `x is not np.nan` is an identity test: it only recognises the np.nan object
# itself, so NaN values that went through pickle (new float objects) or None
# slipped through. pd.notna() tests the value instead.
corpus = [str(x) for x in corpus if pd.notna(x)]

# Remove stop words (set lookup instead of scanning the stop list per word)
stop_words = set(stop)
corpus_clean = [
    ' '.join(word for word in msg.split(' ') if word not in stop_words)
    for msg in corpus
]

print(corpus_clean[:10]) # Print a sample message to ensure corpus is correctly formatted and accessible

In [None]:
def scrub_words(text):
    """Return `text` with markup and punctuation stripped.

    Steps, in order:
      1. delete every ``<...>`` span (non-greedy, within a single line),
      2. replace each non-word character (regex ``\\W``) with one space,
      3. trim leading and trailing whitespace.

    Inner runs of spaces produced by step 2 are kept as they are.
    """
    without_markup = re.sub("(<.*?>)", "", text)
    word_chars_only = re.sub("(\\W)", " ", without_markup)
    return word_chars_only.strip()

# Minor cleaning before stemming for visualization.
# Scrub the stop-word-filtered messages already held in corpus_clean; scrubbing
# the raw `corpus` here would overwrite corpus_clean and silently discard the
# stop-word removal performed earlier.
corpus_clean=[scrub_words(w) for w in corpus_clean]

In [None]:
# Stem cleaned corpus: every space-separated token of a message is stemmed and
# the stems are joined back together with single spaces
cleaned_stemmed_words = [
    ' '.join(map(stemmer.stem, message.split(' ')))
    for message in corpus_clean
]

In [None]:
# Compare raw vs cleaned vs stemmed: one row per message, one column per stage
stem_columns = {}
stem_columns['raw_word'] = corpus
stem_columns['cleaned_word'] = corpus_clean
stem_columns['stemmed_word'] = cleaned_stemmed_words

stemdf = pd.DataFrame(stem_columns)
stemdf

In [None]:
# Count the number of occurrences of each word in the corpus
stop_words = set(stop)  # set lookup instead of scanning the stop list per word

# str.split() without an argument splits on any whitespace run and never
# yields '' tokens; split(' ') produced an empty-string "word" for every
# doubled space or empty message, which then got counted in the vocabulary.
vocab = dict(Counter(
    word
    for msg in cleaned_stemmed_words
    for word in str(msg).split()
    if word not in stop_words
))

In [None]:
# Save stemmed
# Each object is pickled into its own file (binary pickle despite the .txt name).
for target, payload in (('vocab_stem.txt', vocab), ('corp_clean_stem.txt', cleaned_stemmed_words)):
    with open(target, 'wb') as out_file:
        pickle.dump(payload, out_file)

## Charts

In [4]:
# Load stemmed (if necessary)
# NOTE(review): pickle.load can execute arbitrary code -- only load files you created yourself.
with open('vocab_stem.txt', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

with open('corp_clean_stem.txt', 'rb') as corpus_file:
    corpus_clean = pickle.load(corpus_file)

In [None]:
# Save some memory: keep the stemmed messages under the name corpus_clean only.
# Guarded so the cell is a no-op (instead of a NameError) when the stemmed
# corpus was loaded from disk and cleaned_stemmed_words was never created.
if 'cleaned_stemmed_words' in globals():
    corpus_clean = cleaned_stemmed_words
    del cleaned_stemmed_words

In [None]:
# Rank the vocabulary from most to least frequent as (word, count) pairs;
# words with equal counts keep their order from `vocab`
sorted_dict = Counter(vocab).most_common()
sorted_dict[:25]

In [None]:
# Split the 150 most frequent (word, count) pairs into two parallel lists
top_entries = sorted_dict[:150]
keys = [entry[0] for entry in top_entries]
values = [entry[1] for entry in top_entries]

# Bar chart of word frequency: one purple bar per word, labels rotated so
# that all of them fit along the x axis
plt.figure(figsize=(25,10))
plt.title('Most Frequently Used Words')
plt.margins(x=0)
plt.xticks(rotation=90)
plt.xlabel('Vocabulary')
plt.ylabel('Frequency')
plt.bar(keys, values, color='Purple')
plt.show()

In [None]:
# Create a word cloud
from wordcloud import WordCloud

def show_wordcloud(data):
    """Build the cloud from `data` using the module-level `wordcloud` object.

    `data` is passed straight to ``generate_from_frequencies``; the value
    that call returns is handed back to the caller.
    """
    return wordcloud.generate_from_frequencies(data)

# Generator configuration: white canvas, no collocations, scale factor 6
wordcloud = WordCloud(
    scale=6,
    collocations=False,
    stopwords=stop,
    background_color='white')

# Build the cloud from the vocabulary counts, then draw it without axes
wordcloud = show_wordcloud(vocab)

plt.figure(figsize=(24, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_ngram(corpus, n=None, count=10):
    """Return the most frequent n-grams of `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Messages to analyse, one string per message.
    n : int, optional
        Number of words per n-gram. ``None`` (the default) means 1, i.e.
        single words; previously the default produced the invalid range
        ``(None, None)`` and failed.
    count : int, default 10
        Maximum number of n-grams to return.

    Returns
    -------
    list of (str, int)
        ``(ngram, frequency)`` pairs sorted by decreasing frequency, at most
        `count` of them.
    """
    if n is None:
        n = 1

    vec = CountVectorizer(ngram_range=(n, n))
    # fit_transform learns the vocabulary and counts in a single pass instead
    # of tokenising the whole corpus twice (fit, then transform).
    bag_of_words = vec.fit_transform(corpus)
    # Column sums = total occurrences of each n-gram over all messages
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:count]

In [None]:
# Create plot of bigrams: the 30 most frequent two-word sequences,
# frequency on the horizontal axis and the bigram text on the vertical axis
top_n_bigrams = get_top_ngram(corpus_clean, n=2, count=30)
x, y = (list(column) for column in zip(*top_n_bigrams))
sns.barplot(x=y, y=x)

In [None]:
# Create plot of trigrams: the 30 most frequent three-word sequences,
# frequency on the horizontal axis and the trigram text on the vertical axis
top_tri_grams = get_top_ngram(corpus_clean, n=3, count=30)
x, y = (list(column) for column in zip(*top_tri_grams))
sns.barplot(x=y, y=x)

In [None]:
# Message-length distributions over the raw corpus.
# Missing entries are skipped and everything else is treated as text, so a
# stray NaN / non-string value cannot break len() or split().
messages = [str(msg) for msg in corpus if pd.notna(msg)]

# Length of messages, in characters -> number of messages with that length
corpus_lens = Counter(len(msg) for msg in messages)

# Length of messages, in words -> number of messages with that word count.
# split() without an argument ignores repeated/leading/trailing whitespace,
# whereas split(' ') counted every empty token (and '' as one word).
corpus_words = Counter(len(msg.split()) for msg in messages)

In [None]:
# Cut off length for graph = 300 characters: keep only the (length -> count)
# entries whose length does not exceed it
lens = dict(
    (length, count)
    for length, count in corpus_lens.items()
    if not length > 300
)

In [None]:
# Plot length of chat messages, in characters
# Bar chart built from `lens`: its keys (message lengths) are the x positions
# and its values (how many messages have that length) are the bar heights.
plt.figure(figsize=(25,10))
plt.title('Chat Message Lengths')
plt.margins(x=0)  # no horizontal padding around the bars
plt.xticks(rotation=90)
plt.xlabel('Message Length')
plt.ylabel('Frequency')
plt.bar(lens.keys(), lens.values())
plt.show()

In [None]:
# Cut off length for graph = 25 words: keep only the (word count -> occurrences)
# entries whose word count does not exceed it
words = dict(
    (word_count, occurrences)
    for word_count, occurrences in corpus_words.items()
    if not word_count > 25
)

In [None]:
# Plot length of chat messages, in words
# Bar chart built from `words`: its keys (words per message) are the x
# positions and its values (how many messages have that count) are the bar
# heights.
plt.figure(figsize=(25,10))
plt.title('Chat Word Counts')
plt.margins(x=0)  # no horizontal padding around the bars
plt.xticks(rotation=90)
plt.xlabel('Word Count')
plt.ylabel('Frequency')
bar = plt.bar(words.keys(), words.values())
plt.bar_label(bar)  # label each bar via the container returned by plt.bar
plt.show()