In [89]:
import os
import re
from collections import Counter

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')

# Root directory holding one sub-folder of transcript .txt files per streamer.
# NOTE(review): this is an absolute, machine-specific path; adjust it (or make it
# configurable) before running the notebook on another machine.
directory = 'C:/Users/trick/Desktop/school and fun work/MML-Final-Project/transcripts'

# Define the list of streamers to include in the analysis
streamers = ['Xaryu', 'cyr', 'shroud', 'NICKMERCS', 'Kastaclysm', 'Symfuhny', 'Foolish_Gamers', 'Philza', 'itsRyanHiga', 'buckefps', 'Ray']

transcripts = []
for streamer in streamers:
    # Iterate over each transcript file in the current streamer's folder.
    # os.listdir returns entries in arbitrary order, so sort them to keep the
    # row order of the resulting DataFrame reproducible between runs.
    label_dir = os.path.join(directory, streamer)
    for file in sorted(os.listdir(label_dir)):
        if file.endswith('.txt'):
            file_path = os.path.join(label_dir, file)
            # Decode explicitly instead of relying on the platform default
            # encoding. Undecodable bytes become U+FFFD rather than raising;
            # the cleaning step strips every non-ASCII-letter character anyway.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                transcript_text = f.read()
            # Store the transcript text together with its file name.
            # NOTE(review): only the bare file name is kept, not the streamer
            # folder it came from -- confirm the label is recoverable downstream.
            transcripts.append({'text': transcript_text, 'path': file})

# Naming the columns guarantees 'text' and 'path' exist even if no file matched.
df = pd.DataFrame(transcripts, columns=['text', 'path'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\trick\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [90]:
# Normalize case: lowercase every transcript with the vectorized string accessor
df['text'] = df['text'].str.lower()

# remove contractions
# Rules are applied top to bottom. The irregular forms ("won't", "can't") must
# come before the generic "n't" rule, and "n't" must come before the bare "'t"
# rule, otherwise the generic rules would consume them first.
_CONTRACTION_RULES = (
    ("won't", "will not"),
    ("can't", "can not"),
    ("n't", " not"),
    ("'re", " are"),
    ("'s", " is"),
    ("'d", " would"),
    ("'ll", " will"),
    ("'t", " not"),
    ("'ve", " have"),
    ("'m", " am"),
)

# Typographic apostrophes are mapped to the ASCII apostrophe the rules use.
_APOSTROPHE_VARIANTS = str.maketrans({"\u2019": "'", "\u2018": "'", "\u02bc": "'"})

def decontracted(text):
    """Expand common English contractions in ``text``.

    Typographic apostrophes are first normalized to ``'`` so that
    contractions written with them are expanded too, instead of being left
    untouched. Matching is case-sensitive and the rules are lowercase, so the
    input is expected to be lowercased already.

    Args:
        text: The string to process.

    Returns:
        A new string with every occurrence of each contraction replaced by
        its expansion (e.g. "won't" -> "will not", "'re" -> " are").
    """
    text = text.translate(_APOSTROPHE_VARIANTS)
    # The rules are plain literals, so str.replace gives the same result as a
    # regex substitution without any pattern-escaping concerns.
    for contraction, expansion in _CONTRACTION_RULES:
        text = text.replace(contraction, expansion)
    return text
df['text'] = df['text'].apply(decontracted)

# remove numbers and special characters: everything except ASCII letters and
# whitespace is deleted. The pattern is a raw string because '\s' inside a
# plain literal is an invalid escape sequence that newer Python versions warn about.
df['text'] = df['text'].replace(to_replace=r'[^a-zA-Z\s]', value='', regex=True)

# collapse runs of spaces into a single space
df['text'] = df['text'].replace(to_replace=r" +", value=' ', regex=True)

# remove leading and trailing whitespace
df['text'] = df['text'].str.strip()

# remove stop words (NLTK's English list, held in a set for fast membership tests)
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return ``text`` with every whitespace-separated word found in ``stop_words`` dropped.

    The surviving words are re-joined with single spaces.
    """
    return ' '.join(token for token in text.split() if token not in stop_words)
df['text'] = df['text'].apply(remove_stopwords)

# stemming. Stems already computed are memoized in `common_stems`, because a
# dictionary lookup is cheaper than calling the stemmer again on a repeated word.
stemmer = SnowballStemmer('english')
common_stems = {}
def stemming(text):
    """Return ``text`` with each whitespace-separated word replaced by its stem.

    Stems come from the module-level ``stemmer``; results are cached in the
    module-level ``common_stems`` dict, keyed by the original word.
    """
    stems = []
    for token in text.split():
        stem = common_stems.get(token)
        if stem is None:
            # First time this word is seen: stem it and remember the result.
            stem = stemmer.stem(token)
            common_stems[token] = stem
        stems.append(stem)
    return ' '.join(stems)
df['text'] = df['text'].apply(stemming)


# Release the memoized stems. The dict is emptied rather than deleted:
# stemming() looks up the global name `common_stems`, so removing the name
# would make any later call to stemming() fail with NameError.
common_stems.clear()
# Array of the cleaned transcript strings, one entry per DataFrame row.
dataset = df['text'].values

In [99]:
# Number of most frequent words kept as the vocabulary.
TOP_N_WORDS = 300

# Count every whitespace-separated word across all transcripts.
word_counts = Counter(word for data in dataset for word in data.split())

# The TOP_N_WORDS most frequent words, most frequent first. Ties keep
# first-occurrence order, the same as a stable descending sort by count.
most_common_words = [word for word, _ in word_counts.most_common(TOP_N_WORDS)]

# Remove words that aren't in the top TOP_N_WORDS most common words.
# A set makes each membership test constant-time instead of scanning the list.
top_words = set(most_common_words)
filtered_texts = [
    ' '.join(word for word in data.split() if word in top_words)
    for data in dataset
]

# Write the filtered text back explicitly. Assigning element-by-element into
# the array returned by `.values` only updated `df` as a side effect of sharing
# memory, and fails outright when pandas returns a read-only array
# (Copy-on-Write). Rebinding `dataset` keeps it in sync with df['text'].
df['text'] = filtered_texts
dataset = df['text'].values


In [101]:
# Create a TfidfVectorizer object; no arguments are passed, so it uses
# scikit-learn's default settings.
vectorizer = TfidfVectorizer()

In [106]:
# Fit the vectorizer on `dataset` (the cleaned transcript strings taken from
# df['text']) and compute the tf-idf matrix for those same documents.
tfidf = vectorizer.fit_transform(dataset)

In [109]:
# Store each transcript's tf-idf row in a new 'feature' column: toarray()
# converts the matrix to a dense array and list() splits it into one row
# array per DataFrame row.
df['feature'] = list(tfidf.toarray())

In [114]:
# Persist the DataFrame, including the 'feature' column, as a pickle file.
# The path is relative, so the file lands in the current working directory.
# Only load this pickle back from a location you trust.
df.to_pickle('transcript_features.pkl')