# APS360 - Classifying Subreddits

Bassam Bibi<br>
Matthew Kwan

## Imports

In [3]:
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

## Load Data

In [6]:
# Path to the Reddit posts CSV, relative to the notebook's working directory.
DATA_PATH = './data/removed_b_datav3.csv'

# Read the whole file into a DataFrame and preview the first rows as the cell output.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,subreddit,titles,post,score,url
0,0,0,1,history,"'New ""Discovery Mode"" turns video game ""Assass...",,53786,https://www.theverge.com/2018/2/20/17033024/as...
1,1,1,2,history,'We are not here to help you with your End of ...,,38419,https://www.reddit.com/r/history/comments/8pw3...
2,2,2,3,history,"""A 1776 excerpt from John Adam's diary where h...",,35984,https://founders.archives.gov/documents/Adams/...
3,3,3,4,history,'Famous Viking warrior burial revealed to be t...,,34919,http://www.news.com.au/technology/science/arch...
4,4,4,5,history,"""3,000-year-old underwater castle discovered i...",,34196,https://inhabitat.com/3000-year-old-underwater...


In [7]:
# Pretrained GloVe word vectors: the "6B" set at 50 dimensions, capped at the
# first 10,000 vectors to keep the embedding table small.
GLOVE_DIM = 50
GLOVE_MAX_VECTORS = 10000
glove = torchtext.vocab.GloVe(name="6B", dim=GLOVE_DIM, max_vectors=GLOVE_MAX_VECTORS)

# Because the model uses an embedding layer, each word is first stored as its vocabulary index in a PyTorch tensor.

.vector_cache\glove.6B.zip: 862MB [06:36, 2.17MB/s]                                                                    
  2%|█▎                                                                       | 7495/400000 [00:00<00:10, 36160.05it/s]


## Preliminary Model

In [None]:
class RedditLSTM(nn.Module):
    """LSTM classifier on top of a pretrained word-embedding table.

    Token indices are looked up in an embedding built with
    ``nn.Embedding.from_pretrained`` (frozen by default), passed through a
    single-layer ``nn.LSTM`` with ``batch_first=True``, and the LSTM output at
    the last time step is projected to ``num_classes`` scores.

    Args:
        input_size: Per-time-step feature size expected by the LSTM. It must
            equal the embedding dimension of the pretrained vectors, otherwise
            the LSTM rejects the embedded input in ``forward``.
        hidden_size: Size of the LSTM hidden state.
        num_classes: Number of scores produced per example.
        pretrained_vectors: Optional 2-D float tensor of shape
            ``(vocab_size, embedding_dim)`` used as the embedding weights.
            When omitted, the notebook-level ``glove.vectors`` is used, which
            keeps the original three-argument construction working.
    """

    def __init__(self, input_size, hidden_size, num_classes, pretrained_vectors=None):
        super().__init__()
        if pretrained_vectors is None:
            # Fall back to the GloVe vectors loaded earlier in the notebook.
            pretrained_vectors = glove.vectors
        self.emb = nn.Embedding.from_pretrained(pretrained_vectors)
        self.hidden_size = hidden_size
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Compute class scores for a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices, shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` holding unnormalised
            scores (no softmax is applied here).
        """
        # Look up the embedding: (batch, seq_len) -> (batch, seq_len, embedding_dim)
        embedded = self.emb(x)
        # No explicit (h0, c0): nn.LSTM then starts from an all-zero state that
        # it allocates on the input's device and dtype. Building the zeros by
        # hand with torch.zeros(...) pinned them to the default device/dtype
        # and broke the model on GPU or after .double()/.half().
        out, _ = self.rnn(embedded)
        # Classify from the output of the last time step.
        # NOTE(review): if batches are right-padded, this position may be
        # padding rather than the final real token - confirm against batching.
        return self.fc(out[:, -1, :])