In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
df = pd.read_csv('/content/reddit-comments-2015-08.csv')

In [3]:
df.head()

Unnamed: 0,body
0,I joined a new league this year and they have ...
1,"In your scenario, a person could just not run ..."
2,They don't get paid for how much time you spen...
3,"I dunno, back before the August update in an A..."
4,"No, but Toriyama sometimes would draw himself ..."


In [7]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [11]:
filename = "/content/reddit-comments-2015-08.csv"
def load_file(filename):
    with open(filename, 'r') as file:
        data = file.read().replace('\n', ' ')
    sentences = nltk.tokenize.sent_tokenize(data)
    return sentences

words=load_file(filename)
load_file(filename)

['body "I joined a new league this year and they have different scoring rules than I\'m used to.',
 "It's a slight PPR league- .2 PPR.",
 'Standard besides 1 points for 15 yards receiving, .2 points per completion, 6 points per TD thrown, and some bonuses for rec/rush/pass yardage.',
 'My question is, is it wildly clear that QB has the highest potential for points?',
 'I put in the rules at a ranking site and noticed that top QBs had 300 points more than the top RB/WR.',
 'Would it be dumb not to grab a QB in the first round?"',
 '"In your scenario, a person could just not run the mandatory background check on the buyer and still sell the gun to the felon.',
 "There's no way to enforce it.",
 "An honest seller is going to not sell the gun to them when they see they're a felon on the background check.",
 "A dishonest seller isn't going to run the check in the first place.",
 'No one is going to be honest enough to run the check, see they\'re a felon, and then all of a sudden immediately

In [12]:
def tokenize_sentence(lines):
    tokenized_sentences = [sentence.split() for sentence in lines]
    return tokenized_sentences


tokens=tokenize_sentence(words)
tokenize_sentence(words)

[['body',
  '"I',
  'joined',
  'a',
  'new',
  'league',
  'this',
  'year',
  'and',
  'they',
  'have',
  'different',
  'scoring',
  'rules',
  'than',
  "I'm",
  'used',
  'to.'],
 ["It's", 'a', 'slight', 'PPR', 'league-', '.2', 'PPR.'],
 ['Standard',
  'besides',
  '1',
  'points',
  'for',
  '15',
  'yards',
  'receiving,',
  '.2',
  'points',
  'per',
  'completion,',
  '6',
  'points',
  'per',
  'TD',
  'thrown,',
  'and',
  'some',
  'bonuses',
  'for',
  'rec/rush/pass',
  'yardage.'],
 ['My',
  'question',
  'is,',
  'is',
  'it',
  'wildly',
  'clear',
  'that',
  'QB',
  'has',
  'the',
  'highest',
  'potential',
  'for',
  'points?'],
 ['I',
  'put',
  'in',
  'the',
  'rules',
  'at',
  'a',
  'ranking',
  'site',
  'and',
  'noticed',
  'that',
  'top',
  'QBs',
  'had',
  '300',
  'points',
  'more',
  'than',
  'the',
  'top',
  'RB/WR.'],
 ['Would',
  'it',
  'be',
  'dumb',
  'not',
  'to',
  'grab',
  'a',
  'QB',
  'in',
  'the',
  'first',
  'round?"'],
 ['"In

In [24]:
nltk.download('stopwords')
from nltk.corpus import stopwords

def remove_stopwords(tokenized_sentences):
    stop_words = set(stopwords.words('english'))
    filtered_sentences = []
    for sentence in tokenized_sentences:
        filtered_sentence = [word for word in sentence if word.lower() not in stop_words]
        filtered_sentences.append(filtered_sentence)
    return filtered_sentences

words = load_file(filename)
tokens = tokenize_sentence(words)
filtered_tokens = remove_stopwords(tokens)

filtered_tokens

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[['body',
  '"I',
  'joined',
  'new',
  'league',
  'year',
  'different',
  'scoring',
  'rules',
  "I'm",
  'used',
  'to.'],
 ['slight', 'PPR', 'league-', '.2', 'PPR.'],
 ['Standard',
  'besides',
  '1',
  'points',
  '15',
  'yards',
  'receiving,',
  '.2',
  'points',
  'per',
  'completion,',
  '6',
  'points',
  'per',
  'TD',
  'thrown,',
  'bonuses',
  'rec/rush/pass',
  'yardage.'],
 ['question',
  'is,',
  'wildly',
  'clear',
  'QB',
  'highest',
  'potential',
  'points?'],
 ['put',
  'rules',
  'ranking',
  'site',
  'noticed',
  'top',
  'QBs',
  '300',
  'points',
  'top',
  'RB/WR.'],
 ['Would', 'dumb', 'grab', 'QB', 'first', 'round?"'],
 ['"In',
  'scenario,',
  'person',
  'could',
  'run',
  'mandatory',
  'background',
  'check',
  'buyer',
  'still',
  'sell',
  'gun',
  'felon.'],
 ["There's", 'way', 'enforce', 'it.'],
 ['honest',
  'seller',
  'going',
  'sell',
  'gun',
  'see',
  "they're",
  'felon',
  'background',
  'check.'],
 ['dishonest', 'seller', 'goi

In [25]:
from sklearn.model_selection import train_test_split
labels = np.random.randint(2, size=len(filtered_tokens))

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(filtered_tokens, labels, test_size=0.3, random_state=42)


In [32]:
# Convert sentences to sequences of integers
import tensorflow as tf
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(filtered_tokens)
sequences = tokenizer.texts_to_sequences(filtered_tokens)

# Pad sequences to have the same length
maxlen = 100
padded_sequences = pad_sequences(sequences, maxlen=maxlen)

In [33]:
# Initialize Assistance Parameters
word_dim = 100
hidden_dim = 128
output_dim = 10000
bptt_truncate = 5

# Initialize Network Parameters
U = tf.Variable(tf.random.normal([word_dim, hidden_dim], stddev=0.01))
V = tf.Variable(tf.random.normal([hidden_dim, output_dim], stddev=0.01))
W = tf.Variable(tf.random.normal([hidden_dim, hidden_dim], stddev=0.01))

# Activation Function
def sigmoid(x):
    return 1 / (1 + tf.exp(-x))

# Forward Propagation
def forward_propagation(x):
    T = len(x)
    s = tf.Variable(tf.zeros([1, hidden_dim]))
    o = tf.Variable(tf.zeros([1, output_dim]))
    for t in range(T):
        u_t = tf.reshape(x[t], [1, word_dim])
        s = sigmoid(tf.matmul(u_t, U) + tf.matmul(s, W))
        o = tf.matmul(s, V)
    return o

# Calculate Loss
def calculate_loss(y, y_hat):
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=y_hat))
    return loss

# Train the model
learning_rate = 0.01
n_epochs = 10
batch_size = 32

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

for epoch in range(n_epochs):
    epoch_loss = 0.0
    for i in range(0, len(X_train), batch_size):
        x_batch = X_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]
        with tf.GradientTape() as tape:
            y_hat = forward_propagation(x_batch)
            loss = calculate_loss(y_batch, y_hat)
        gradients = tape.gradient(loss, [U, V, W])
        optimizer.apply_gradients(zip(gradients, [U, V, W]))
        epoch_loss += loss.numpy()
    print(f"Epoch {epoch+1} loss: {epoch_loss/len(X_train)}")

# Evaluate the model
y_pred = forward_propagation(X_test)
accuracy = np.mean(np.argmax(y_pred.numpy(), axis=1) == np.argmax(y_test, axis=1))
print(f"Accuracy: {accuracy}")

InvalidArgumentError: ignored