### Reddit Topic Modelling Project

This project is another personal project of mine.

In [None]:
'''
These commands have been run in the JupyterLab console:

pip install praw 

pip install stop_words

pip install nltk

pip install sklearn

NOTE: We install these modules with pip because the Conda versions are outdated.
A separate Conda environment was created to avoid package management issues.
'''

In [None]:
#import the packages needed by the praw refresh-token program in the next cell
import praw as praw
import random  
import socket
import sys
#DONE#

#import the following packages for creating a corpus

import string #import string module for string manipulation
import stop_words #import base stop_words 
import nltk #import nltk for removing extra stop words and tokenising strings for text preprocessor function

from stop_words import get_stop_words #(About 900 stop words)
from nltk.corpus import stopwords #(An extra 150 stop words)
from nltk.tokenize import word_tokenize #(tokeniser function)

'''
nltk.download('stopwords')
nltk.download('punkt')

Download the stopwords and punkt resource for nltk if necessary
'''
#DONE

In [None]:
'''
Code taken from the PRAW documentation. It runs a small local program that obtains a
refresh token for Reddit API authorisation. This avoids having to use a personal
username and password for authentication.
'''
def receive_connection():
    """Wait for and then return a connected socket.

    Opens a TCP listening socket on localhost port 8080, blocks until a single
    client connects, and returns the socket for that client. The caller is
    responsible for closing the returned socket.

    """
    # The context manager closes the listening socket on every exit path, so
    # an interrupted accept() (e.g. a kernel interrupt) does not leave port
    # 8080 bound when the cell is run again.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server:
        server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        server.bind(("localhost", 8080))
        server.listen(1)
        client, _address = server.accept()
    return client


def send_message(client, message):
    """Send message to client and close the connection.

    The message is echoed to stdout, then written to ``client`` as the body of
    a minimal ``HTTP/1.1 200 OK`` response encoded as UTF-8.

    Args:
        client: A connected socket; it is always closed before returning.
        message: Text to print and to send as the response body.
    """
    print(message)
    try:
        # sendall() keeps writing until the whole response has been handed to
        # the OS; send() may transmit only part of the buffer.
        client.sendall(f"HTTP/1.1 200 OK\r\n\r\n{message}".encode("utf-8"))
    finally:
        # Release the connection even if sending fails.
        client.close()


def main():
    """Provide the program's entry point when directly executed.

    Interactively asks for the Reddit app's client ID, client secret and the
    desired scopes, prints an authorisation URL, then waits on localhost:8080
    for the browser redirect and exchanges the returned code for a refresh
    token.

    Returns:
        0 when a refresh token was obtained, 1 when the redirect carried a
        mismatching state or an error.
    """
    # Local import so this cell does not depend on an extra top-level import.
    import secrets

    print(
        "Go here while logged into the account you want to create a token for: "
        "https://www.reddit.com/prefs/apps/"
    )
    print(
        "Click the create an app button. Put something in the name field and select the"
        " script radio button."
    )
    print("Put http://localhost:8080 in the redirect uri field and click create app")
    client_id = input(
        "Enter the client ID, it's the line just under Personal use script at the top: "
    )
    client_secret = input("Enter the client secret, it's the line next to secret: ")
    comma_scopes = input(
        "Now enter a comma separated list of scopes, or all for all tokens: "
    )

    if comma_scopes.strip().lower() == "all":
        scopes = ["*"]
    else:
        # Strip each scope so input such as "read, identity" does not produce
        # a scope with a leading space.
        scopes = [
            scope.strip() for scope in comma_scopes.split(",") if scope.strip()
        ]

    reddit = praw.Reddit(
        client_id=client_id.strip(),
        client_secret=client_secret.strip(),
        redirect_uri="http://localhost:8080",
        user_agent="praw_refresh_token_example",
    )
    # The state value guards the redirect against forgery, so it is drawn from
    # a cryptographically secure source. Hex digits need no URL encoding.
    state = secrets.token_hex(16)
    # Keyword arguments: positional arguments to auth.url are deprecated in
    # newer PRAW releases.
    url = reddit.auth.url(scopes=scopes, state=state, duration="permanent")
    print(f"Now open this url in your browser: {url}")
    sys.stdout.flush()

    client = receive_connection()
    data = client.recv(1024).decode("utf-8")
    # Request line looks like "GET /?state=...&code=... HTTP/1.1"; take the
    # query string of the path and split it into key=value tokens.
    param_tokens = data.split(" ", 2)[1].split("?", 1)[1].split("&")
    params = {}
    for token in param_tokens:
        # partition() tolerates values containing "=" and tokens without one.
        key, _, value = token.partition("=")
        params[key] = value

    received_state = params.get("state")
    if state != received_state:
        send_message(
            client,
            f"State mismatch. Expected: {state} Received: {received_state}",
        )
        return 1
    elif "error" in params:
        send_message(client, params["error"])
        return 1

    refresh_token = reddit.auth.authorize(params["code"])
    send_message(client, f"Refresh token: {refresh_token}")
    return 0


if __name__ == "__main__":
    # Run the interactive token flow and exit with its status code.
    exit_status = main()
    sys.exit(exit_status)
    

In [None]:
# Authentication credentials used to get access to the Reddit API.
# Fill in the app's credentials and insert the refresh token generated by the
# previous cell into the empty refresh_token parameter.
reddit = praw.Reddit(
    client_id="",
    client_secret="",
    user_agent="",
    refresh_token="",
)

In [None]:
# Sanity check: print the account the client authenticated as, to confirm the login worked.
print(reddit.user.me())

In [None]:
# Set the subreddit of choice (fill in its name) whose post submissions will be collected.
subreddit = reddit.subreddit("")

In [None]:
# Request new submissions from the subreddit; the integer `limit` caps how many posts are gathered.
# NOTE(review): presumably a lazy, single-use listing, so re-run this cell before calling streamer() again — confirm.
x = subreddit.new(limit = 1000)

In [None]:
# Empty list that will hold the submission titles appended by streamer(). Change the variable name if required.
sub_list = []

In [None]:
#Define a function that extracts submission titles from Reddit API and appends into a list.

def streamer():
    """Collect submission titles from the notebook-level listing ``x``.

    Each title is appended to the notebook-level list ``sub_list`` and then
    printed. Nothing is returned.
    """
    for post in x:
        title = post.title
        sub_list.append(title)
        print(title)

In [None]:
# Call streamer() to fill sub_list with the submission titles (each title is also printed).
streamer()

In [None]:
# Combined English stop-word set: NLTK's stopwords list merged with the list
# from the stop_words package.
# NOTE: this rebinds the name `stop_words`, shadowing the imported module of
# the same name.
stop_words = set(stopwords.words('english')).union(get_stop_words('en'))

# Punctuation characters that string_preprocessor removes from the text.
punc = ".?=+%()-_|/[]!:;@<>&^*!'',#"

In [None]:
#create function to prepare text for language processing

#list_2_str is the list that is to be converted into a string

def string_preprocessor(list_2_str, punctuation=None):
    """Flatten text into one lowercase string with punctuation removed.

    Args:
        list_2_str: The list (or tuple/set) of text items to convert into a
            single string; a plain string is also accepted.
        punctuation: Characters to delete from the text. Defaults to the
            notebook-level ``punc`` string.

    Returns:
        The items joined by single spaces, lowercased, with every character
        in ``punctuation`` removed and surrounding whitespace stripped.
    """
    if punctuation is None:
        punctuation = punc  # notebook-level default set

    if isinstance(list_2_str, str):
        text = list_2_str
    elif isinstance(list_2_str, (list, tuple, set, frozenset)):
        # Join the items instead of calling str() on the container: the
        # container's repr would inject brackets, quote characters and
        # backslash escapes into the text.
        text = " ".join(str(item) for item in list_2_str)
    else:
        text = str(list_2_str)

    text = text.lower()

    # Delete all unwanted characters in a single pass.
    text = text.translate(str.maketrans("", "", punctuation))

    # Trim leading/trailing whitespace only; inner spaces separate the words.
    return text.strip()

In [None]:
# Input your list to be converted here; sub_list is rebound from the list of titles to the cleaned string.
sub_list = string_preprocessor(sub_list)

In [None]:
# Split the cleaned text into word tokens with NLTK's tokeniser.
sub_tokens = word_tokenize(sub_list)

In [None]:
# Keep only the tokens that consist entirely of alphabetic characters.
sub_tokens = list(filter(str.isalpha, sub_tokens))

In [None]:
# Drop every token that appears in the combined stop-word set.
sub_tokens = [token for token in sub_tokens if token not in stop_words]

In [None]:
# The filtered tokens are the final corpus; remove the intermediate names.
corpus = sub_tokens

del sub_tokens, sub_list

In [None]:
#This cell will write a text document containing all the terms taken from the subreddit.
#The corpus is stored as the string form of the token list.
str_corpus = str(corpus)

#Change sample.txt to the file name for the text file.
#The with-block closes the file even if the write fails. UTF-8 is set explicitly so
#non-ASCII terms do not depend on the platform's default encoding; read the file back
#with the same encoding.
with open("sample.txt", "w", encoding="utf-8") as corpus_text:
    corpus_text.write(str_corpus)

The code above extracts submission titles from a subreddit on Reddit. It then preprocesses the text by converting it to lower case and removing punctuation, non-alphabetic tokens and stop words. The resulting terms are written to a text document, which will be used to create a document-term matrix for probabilistic topic modelling with sklearn and gensim.
