# Reddit Classification: 
### Using NLP to predict, which subreddit a particular post or comment came from.
By: Nick Lomeli

----
## Part 1: Data Collection, Cleaning, and Transforming

In [1]:
import numpy as np
import pandas as pd

import requests
import json

from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

pd.set_option('display.max_rows', 50)

-------

### Obtain auth / access token from the reddit API

In [2]:
client_id = 'MS6yOenwdShJwO324Nf_cA'
secret_id = 'TEwJl2TLEl3sa5B76Zmm59FUggAIRA'

auth = requests.auth.HTTPBasicAuth(client_id, secret_id)

with open ('../../Notes/Breakfast_Hours/pw.txt', 'r') as f:
    pw = f.read()

data = {
    'grant_type': 'password',
    'username': 'Lower_Lemon9227',
    'password': pw
}

headers = {'User-Agent': 'MyAPI/0.02'}

res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth = auth,
                    data = data,
                    headers = headers
                   )

access_token = res.json()['access_token']

headers['authorization'] = f'bearer {access_token}'

### Create a function that:
- Uses the Reddit API to web-scrape data from a specified subreddit.
- Filters posts from the subreddit to require that it has a time stamp, title, text, comments, and a score before including any in the web-scrape.
- Converts the data into a DataFrame and cleans it for readability.
- Prints the unique id of the last post from each web-scraping iteration. 
     - The last comment's unique id will later be passed to a subsequent function that will use it as a starting point reference for the next web-scraping iteration 

In [3]:
def get_data_initial(subreddit, get_type):
    
    # URL
    url = f'https://oauth.reddit.com/r/{subreddit}/{get_type}'
    
    # Add params
    params = {'filter': ['created_utc', subreddit, 'title', 'selftext', 'num_comments', 'score']
             }
    
    # Get the data
    res = requests.get(url, 
                      headers = headers
                 )
    
    # Convert the request into a list of dict objects
    data = res.json()
    
    # Go deeper to clean up data and create desired columns based on available keys
    n = len(data['data']['children'])
    created_utc_col = [data['data']['children'][i]['data']['created_utc'] for i in range(1, n)]
    subreddit_col = [data['data']['children'][i]['data']['subreddit'] for i in range(1, n)]
    title_col = [data['data']['children'][i]['data']['title'] for i in range(1, n)]
    selftext_col = [data['data']['children'][i]['data']['selftext'] for i in range(1, n)]
    num_comments_col = [data['data']['children'][i]['data']['num_comments'] for i in range(1, n)]
    score_col = [data['data']['children'][i]['data']['score'] for i in range(1, n)]

    #Turn into a DataFrame
    df = pd.DataFrame({
        'created_utc': created_utc_col,
        'subreddit': subreddit_col,
        'title': title_col,
        'text': selftext_col,
        'num_comments': num_comments_col,
        'score': score_col
    })
    
    comment_id = f"{res.json()['data']['children'][-1]['kind']}_{res.json()['data']['children'][-1]['data']['id']}"

    print(comment_id)
    
    return df

### Create a second function that:
- Accepts a subreddit, subreddit post type, and unique comment id as an argument
    - Uses the unique comment id passed as a place to start web-scraping posts/comments after.
        - the unique comment id passed should be the last id printed from the previous iteration.
    - Completes the same steps as in the function above

In [4]:
def get_data_after(subreddit, get_type, last_comment_id):
    
    # URL
    url = f'https://oauth.reddit.com/r/{subreddit}/{get_type}'
    
    # Add params
    params = {'filter': ['created_utc', subreddit, 'title', 'selftext', 'num_comments', 'score'],
              'after': last_comment_id
             }
    
    # Get the data
    res = requests.get(url, 
                      headers = headers,
                      params = params,
                 )
    
    # Convert the request into a list of dict objects
    data = res.json()
    
    # Go deeper to clean up data and create desired columns based on available keys
    n = len(data['data']['children'])
    created_utc_col = [data['data']['children'][i]['data']['created_utc'] for i in range(1, n)]
    subreddit_col = [data['data']['children'][i]['data']['subreddit'] for i in range(1, n)]
    title_col = [data['data']['children'][i]['data']['title'] for i in range(1, n)]
    selftext_col = [data['data']['children'][i]['data']['selftext'] for i in range(1, n)]
    num_comments_col = [data['data']['children'][i]['data']['num_comments'] for i in range(1, n)]
    score_col = [data['data']['children'][i]['data']['score'] for i in range(1, n)]


    
    #Turn into a DataFrame
    df = pd.DataFrame({
        'created_utc': created_utc_col,
        'subreddit': subreddit_col,
        'title': title_col,
        'text': selftext_col,
        'num_comments': num_comments_col,
        'score': score_col
    })
    
    comment_id = f"{res.json()['data']['children'][-1]['kind']}_{res.json()['data']['children'][-1]['data']['id']}"
    
    print(comment_id) 
    
    return df

## Subreddit 1: **Python**

### Web-scrape data from the **python** subreddit and combine all requests into one DataFrame.
##### Note that each subreddit has "hot", "new", and other sections within it. I am most interested in accessing the posts from the "hot" and "new" sections.

In [5]:
python_hot_1 = get_data_initial('python', 'hot')

t3_160imya


In [6]:
python_hot_2 = get_data_after('python', 'hot', 't3_160apiq')

t3_1600evn


In [7]:
python_hot_3 = get_data_after('python', 'hot', 't3_1600evn')

t3_15z4wmy


In [8]:
python_hot_4 = get_data_after('python', 'hot', 't3_15z4wmy')

t3_15x7so4


In [9]:
python_hot_5 = get_data_after('python', 'hot', 't3_15x7so4')

t3_15wywbj


In [10]:
python_hot_6 = get_data_after('python', 'hot', 't3_15wywbj')

t3_15uscqf


In [11]:
python_hot_7 = get_data_after('python', 'hot', 't3_15tz2y7')

t3_15t503x


In [12]:
python_hot_8 = get_data_after('python', 'hot', 't3_15t503x')

t3_15s386w


In [13]:
python_hot_9 = get_data_after('python', 'hot', 't3_15s386w')

t3_15pzlt1


In [14]:
python_hot_10 = get_data_after('python', 'hot', 't3_15pzlt1')

t3_15p34aj


In [15]:
python_hot_11 = get_data_after('python', 'hot', 't3_15p34aj')

t3_15nhegh


In [16]:
python_hot_12 = get_data_after('python', 'hot', 't3_15nhegh')

t3_15mjrnr


In [17]:
python_hot_13 = get_data_after('python', 'hot', 't3_15mjrnr')

t3_15le96q


In [18]:
python_hot_14 = get_data_after('python', 'hot', 't3_15le96q')

t3_15k4qwj


In [19]:
python_hot_15 = get_data_after('python', 'hot', 't3_15k4qwj')

t3_15iy8f0


In [20]:
python_new_1 = get_data_initial('python', 'new')

t3_161678v


In [21]:
python_new_2 = get_data_after('python', 'new', 't3_161481e')

t3_1600evn


In [22]:
python_new_3 = get_data_after('python', 'new', 't3_1600evn')

t3_15yu4if


In [23]:
python_new_4 = get_data_after('python', 'new', 't3_15yu4if')

t3_15xjs95


In [24]:
python_new_5 = get_data_after('python', 'new', 't3_15xjs95')

t3_15wcrev


In [25]:
python_new_6 = get_data_after('python', 'new', 't3_15wcrev')

t3_15ulfrw


In [26]:
python_new_7 = get_data_after('python', 'new', 't3_15ulfrw')

t3_15tnu99


In [27]:
python_new_8 = get_data_after('python', 'new', 't3_15tnu99')

t3_15s2lxn


In [28]:
python_new_9 = get_data_after('python', 'new', 't3_15s2lxn')

t3_15qq0ws


In [29]:
python_new_10 = get_data_after('python', 'new', 't3_15qq0ws')

t3_15pfuyl


In [30]:
python_new_11 = get_data_after('python', 'new', 't3_15pfuyl')

t3_15o5h7b


In [31]:
python_new_12 = get_data_after('python', 'new', 't3_15o5h7b')

t3_15mnhgo


In [32]:
python_new_13 = get_data_after('python', 'new', 't3_15mnhgo')

t3_15lkt96


In [33]:
python_new_14 = get_data_after('python', 'new', 't3_15lkt96')

t3_15k0l15


In [34]:
python_new_15 = get_data_after('python', 'new', 't3_15k0l15')

t3_15ixoqd


### Concatenate the data from all python subreddit web-scrapes into one DataFrame

In [35]:
df_python = pd.concat([
    python_hot_1, 
    python_hot_2, 
    python_hot_3, 
    python_hot_4, 
    python_hot_5, 
    python_hot_6, 
    python_hot_7, 
    python_hot_8, 
    python_hot_9, 
    python_hot_10,
    python_hot_11,
    python_hot_12,
    python_hot_13,
    python_hot_14,
    python_hot_15,
    
    python_new_1,
    python_new_2,
    python_new_3,
    python_new_4,
    python_new_5,
    python_new_6,
    python_new_7,
    python_new_8,
    python_new_9,
    python_new_10,
    python_new_11,
    python_new_12,
    python_new_13,
    python_new_14,
    python_new_15
])

In [36]:
df_python.shape

(722, 6)

In [37]:
df_python.head()

Unnamed: 0,created_utc,subreddit,title,text,num_comments,score
0,1693008000.0,Python,Saturday Daily Thread: Resource Request and Sh...,Found a neat resource related to Python over t...,1,1
1,1693044000.0,Python,Understanding Immortal Objects in Python 3.12:...,,10,127
2,1693095000.0,Python,Inference Llama 2 in one file of pure Python w...,"Hi everyone,\n\nHave you ever wondering how to...",0,7
3,1693062000.0,Python,FastAPI + HTMX hello world demo app,Hello world!\n\nAs an effort in open source co...,6,19
4,1693058000.0,Python,Robyn crosses 1M installs on PyPi,"For the unaware, Robyn is a fast, async Python...",2,20


In [38]:
len(df_python['title'].unique())

365

In [39]:
len(df_python['text'].unique())

252

## Subreddit 2: Javascript

### Webscrape data from the **javascript** and **java** subreddits and combine all requests into one DataFrame.
##### **Javascript** subreddit did not have enough posts to equate to amount obtained from the python subreddit. Therefore, I included the **java** subreddit as well.*

In [40]:
java_hot_1 = get_data_initial('javascript', 'hot')

t3_1468vnz


In [41]:
java_hot_2 = get_data_after('javascript', 'hot', 't3_1468vnz')

t3_144d2bb


In [42]:
java_hot_3 = get_data_after('javascript', 'hot', 't3_144d2bb')

t3_14291sm


In [43]:
java_hot_4 = get_data_after('javascript', 'hot', 't3_14291sm')

t3_13yhf7v


In [44]:
java_hot_5 = get_data_after('javascript', 'hot', 't3_13yhf7v')

t3_13uwv1a


In [45]:
java_hot_6 = get_data_after('javascript', 'hot', 't3_13uwv1a')

t3_13sizi3


In [46]:
java_hot_7 = get_data_after('javascript', 'hot', 't3_13sizi3')

t3_13r5zh7


In [47]:
java_hot_8 = get_data_after('javascript', 'hot', 't3_13r5zh7')

t3_13mqzkg


In [48]:
java_hot_9 = get_data_after('javascript', 'hot', 't3_13mqzkg')

t3_13kslkc


In [49]:
java_hot_10 = get_data_after('javascript', 'hot', 't3_13kslkc')

t3_13i9k74


In [50]:
java_hot_11 = get_data_after('javascript', 'hot', 't3_13i9k74')

t3_13eodzb


In [51]:
java_hot_12 = get_data_after('javascript', 'hot', 't3_13eodzb')

t3_13djtvp


In [52]:
java_hot_13 = get_data_after('javascript', 'hot', 't3_13djtvp')

t3_137v6f2


In [53]:
java_hot_14 = get_data_after('javascript', 'hot', 't3_137v6f2')

t3_136h57w


In [54]:
java_hot_15 = get_data_after('javascript', 'hot', 't3_136h57w')

t3_1347fui


In [55]:
java_hot_16 = get_data_after('javascript', 'hot', 't3_1347fui')

t3_130k7et


In [56]:
java_hot_17 = get_data_after('javascript', 'hot', 't3_130k7et')

t3_12u4h8h


In [57]:
java_hot_18 = get_data_after('javascript', 'hot', 't3_12u4h8h')

t3_11tsto7


In [58]:
java_hot_19 = get_data_initial('java', 'hot')

t3_15wf0a0


In [59]:
java_hot_20 = get_data_after('java', 'hot', 't3_15wf0a0')

t3_15oed3x


In [60]:
java_hot_21 = get_data_after('java', 'hot', 't3_15oed3x')

t3_15hxp2f


In [61]:
java_hot_22 = get_data_after('java', 'hot', 't3_15hxp2f')

t3_15ca0hn


In [62]:
java_hot_23 = get_data_after('java', 'hot', 't3_15ca0hn')

t3_154x68l


In [63]:
java_hot_24 = get_data_after('java', 'hot', 't3_154x68l')

t3_14yhzm5


In [64]:
java_hot_25 = get_data_after('java', 'hot', 't3_14yhzm5')

t3_14qissk


In [65]:
java_hot_26 = get_data_after('java', 'hot', 't3_14qissk')

t3_14kixyk


In [66]:
java_hot_27 = get_data_after('java', 'hot', 't3_14kixyk')

t3_14f4xst


In [67]:
java_hot_28 = get_data_after('java', 'hot', 't3_14f4xst')

t3_143mqtz


In [68]:
java_hot_29 = get_data_after('java', 'hot', 't3_143mqtz')

t3_13wxcti


In [69]:
java_hot_30 = get_data_after('java', 'hot', 't3_13wxcti')

t3_13mp9g0


### Concatenate the data from all javascript subreddit web-scrapes into one DataFrame

In [70]:
df_java = pd.concat([
    java_hot_1,
    java_hot_2,
    java_hot_3,
    java_hot_4,
    java_hot_5,
    java_hot_6,
    java_hot_7,
    java_hot_8,
    java_hot_9,
    java_hot_10,
    java_hot_11,
    java_hot_12,
    java_hot_13,
    java_hot_14,
    java_hot_15,
    java_hot_16,
    java_hot_17,
    java_hot_18,
    java_hot_19,
    java_hot_20,
    java_hot_21,
    java_hot_22,
    java_hot_23,
    java_hot_24,
    java_hot_25,
    java_hot_26,
    java_hot_27,
    java_hot_28,
    java_hot_29,
    java_hot_30
])

In [71]:
df_java.shape

(710, 6)

In [72]:
df_java.head()

Unnamed: 0,created_utc,subreddit,title,text,num_comments,score
0,1692609000.0,javascript,Your /r/javascript recap for the week of Augus...,"**Monday, August 14 - Sunday, August 20**\n\n#...",4,0
1,1692774000.0,javascript,"WTF Wednesday (August 23, 2023)",Post a link to a GitHub repo or another code c...,6,23
2,1692428000.0,javascript,"Showoff Saturday (August 19, 2023)",Did you find or create something cool this wee...,22,22
3,1692169000.0,javascript,"WTF Wednesday (August 16, 2023)",Post a link to a GitHub repo or another code c...,18,39
4,1692004000.0,javascript,Your /r/javascript recap for the week of Augus...,"**Monday, August 07 - Sunday, August 13**\n\n#...",1,21


## Concatenate the Python and Java DataFrames

In [73]:
df_programming = pd.concat([df_python, df_java])

----

## Create functions to lemmatize and stem text and title of each post in both 

### Function to lemmatize

In [1]:
def text_lemmatizer(review):
    
    # Set token and instantiate Lemmatize
    lemmatizer = WordNetLemmatizer()
    my_tokenizer = RegexpTokenizer("[\w']+|\$[\d\.]+")   

    # Tokenize words
    words = my_tokenizer.tokenize(review.lower())
    
    # Remove stopwords and add any words that should be included
    stop_word_list = stopwords.words('english')
    
    non_stop_words = [word for word in words if word not in stop_word_list]
    
    # Lemmatize
    review_lem = [lemmatizer.lemmatize(word) for word in non_stop_words]
    
    
    # Put words back together into a single string. 
    return ' '.join(review_lem)

----
### Add Lemmatized text to a new column in both DataFrames

In [75]:
df_programming['lemm_text'] = df_programming['text'].map(text_lemmatizer)
df_programming['lemm_title'] = df_programming['title'].map(text_lemmatizer)

In [76]:
df_programming.head(2)

Unnamed: 0,created_utc,subreddit,title,text,num_comments,score,lemm_text,lemm_title
0,1693008000.0,Python,Saturday Daily Thread: Resource Request and Sh...,Found a neat resource related to Python over t...,1,1,found neat resource related past week looking ...,saturday daily thread resource request sharing...
1,1693044000.0,Python,Understanding Immortal Objects in Python 3.12:...,,10,127,,understanding immortal object 3 12 deep dive i...


----

### Function to Stem the values in the text and title columns

In [2]:
def text_stem(review):
    
    # Set token and instantiate Stem
    p_stem = PorterStemmer()
    my_tokenizer = RegexpTokenizer("[\w']+|\$[\d\.]+")   

    # Tokenize words
    words = my_tokenizer.tokenize(review.lower())
    
    # Remove stop words and add words to stop word list that should be included
    stop_word_list = stopwords.words('english')
    
    non_stop_words = [word for word in words if word not in stop_word_list]
    
    # Stem
    review_stem = [p_stem.stem(word) for word in non_stop_words]    
    
    # Put words back together into a single string. It is easier to work with a string than a list for NLP.
    return ' '.join(review_stem)

----
### Add Stemmed text to a new column in each DataFrame

In [78]:
df_programming['stem_text'] = df_programming['text'].map(text_stem)
df_programming['stem_title'] = df_programming['title'].map(text_stem)

---
### Remove columns that won't be used in the model.

In [79]:
df_programming.drop(columns = ['created_utc', 'title', 'text', 'num_comments', 'score'], inplace = True)

In [80]:
df_programming.head(2)

Unnamed: 0,subreddit,lemm_text,lemm_title,stem_text,stem_title
0,Python,found neat resource related past week looking ...,saturday daily thread resource request sharing...,found neat resourc relat past week look resour...,saturday daili thread resourc request share da...
1,Python,,understanding immortal object 3 12 deep dive i...,,understand immort object 3 12 deep dive intern


---
### Encode the target variable

In [81]:
df_programming.insert(loc = 1,
                      column = 'target',
                      value = np.where(df_programming['subreddit'] == 'Python', 1, 0)
                     )

In [82]:
df_programming['target'].value_counts(normalize = True)

1    0.50419
0    0.49581
Name: target, dtype: float64

-----

#### Note that there are some observations with missing values. To alleviate, this I decided to copy the lemmatized text to the missing values in the lemmatize title feature of the same observation and vice versa. The same will be done for stemmed text and stemmed title.

#### Creating a new DataFrame that only contains lemm_text, lemm_title, stem_text and stem_title, which will be the datase used for modeling.

In [83]:
df_programming.head()

Unnamed: 0,subreddit,target,lemm_text,lemm_title,stem_text,stem_title
0,Python,1,found neat resource related past week looking ...,saturday daily thread resource request sharing...,found neat resourc relat past week look resour...,saturday daili thread resourc request share da...
1,Python,1,,understanding immortal object 3 12 deep dive i...,,understand immort object 3 12 deep dive intern
2,Python,1,hi everyone ever wondering implement complex s...,inference llama 2 one file pure zero dependency,hi everyon ever wonder implement complex scien...,infer llama 2 one file pure zero depend
3,Python,1,hello world effort open source contribution ma...,fastapi htmx hello world demo app,hello world effort open sourc contribut made f...,fastapi htmx hello world demo app
4,Python,1,unaware robyn fast async web framework rust ru...,robyn cross 1m installs pypi,unawar robyn fast async web framework rust run...,robyn cross 1m instal pypi


In [84]:
df_programming_filled = df_programming.copy()

df_programming_filled['lemm_text'] = np.where(df_programming_filled['lemm_text'] == '', df_programming_filled['lemm_title'], df_programming_filled['lemm_text'])

df_programming_filled['lemm_title'] = np.where(df_programming_filled['lemm_title'] == '', df_programming_filled['lemm_text'], df_programming_filled['lemm_title'])

df_programming_filled['stem_text'] = np.where(df_programming_filled['stem_text'] == '', df_programming_filled['stem_title'], df_programming_filled['stem_text'])

df_programming_filled['stem_title'] = np.where(df_programming_filled['stem_title'] == '', df_programming_filled['stem_text'], df_programming_filled['stem_title'])

In [85]:
df_programming_filled.head()

Unnamed: 0,subreddit,target,lemm_text,lemm_title,stem_text,stem_title
0,Python,1,found neat resource related past week looking ...,saturday daily thread resource request sharing...,found neat resourc relat past week look resour...,saturday daili thread resourc request share da...
1,Python,1,understanding immortal object 3 12 deep dive i...,understanding immortal object 3 12 deep dive i...,understand immort object 3 12 deep dive intern,understand immort object 3 12 deep dive intern
2,Python,1,hi everyone ever wondering implement complex s...,inference llama 2 one file pure zero dependency,hi everyon ever wonder implement complex scien...,infer llama 2 one file pure zero depend
3,Python,1,hello world effort open source contribution ma...,fastapi htmx hello world demo app,hello world effort open sourc contribut made f...,fastapi htmx hello world demo app
4,Python,1,unaware robyn fast async web framework rust ru...,robyn cross 1m installs pypi,unawar robyn fast async web framework rust run...,robyn cross 1m instal pypi


----
----

### Export the final DataFrame as a csv and use it to begin modeling

In [86]:
df_programming_filled.to_csv('./P4_Datasets/programming_languages')