In [1]:
""""Cleaning data for usage with our NLP model."""

import pandas as pd
import re

# Load the fetched posts from disk into the working frame.
df = pd.read_csv(filepath_or_buffer='datasets/fetched_data.csv')

# Preview the first ten rows to sanity-check the load.
df.head(n=10)

Unnamed: 0,Subreddit,Text
0,24hoursupport,* * *\n\n* * *\n\nAs of **[April 2014](http://...
1,24hoursupport,I was trying to download a streaming movie/tv ...
2,Android,"I know the title is rather sensational, howeve..."
3,Android,A notable amount of people already bought new ...
4,Applehelp,EDIT: Going to bed now (hopefully I can sleep)...
5,asktechnology,Hello! I am currently trying to create a massi...
6,asktechnology,He is 25. Is there anything new cool gadgets t...
7,asktechnology,"I have some shitty ""anti virus"" called seguraz..."
8,asktechnology,I really like this guy who works for IT at my ...
9,asktechnology,Hello Reddit\n\nSorry if this isn't the best p...


In [2]:
# Strip markdown noise from the post text.
# `regex=False` is passed explicitly: '*' is a regex metacharacter and the
# default of `regex` differs between pandas versions, so relying on the
# default makes this cell version-dependent.
df['Text'] = (
    df['Text']
    .str.replace('\n', ' ', regex=False)  # Replaces line breaks with spaces
    .str.replace('*', '', regex=False)    # Removes asterisks
)

# Remove blank resolved posts (their text is only the resolved-flair markup),
# then renumber the rows so the index stays contiguous.
df = df[df['Text'] != ' [](/resolved)'].reset_index(drop=True)
df.head(25)

Unnamed: 0,Subreddit,Text
0,24hoursupport,As of [April 2014](http://codepen.io/T...
1,24hoursupport,I was trying to download a streaming movie/tv ...
2,Android,"I know the title is rather sensational, howeve..."
3,Android,A notable amount of people already bought new ...
4,Applehelp,EDIT: Going to bed now (hopefully I can sleep)...
5,asktechnology,Hello! I am currently trying to create a massi...
6,asktechnology,He is 25. Is there anything new cool gadgets t...
7,asktechnology,"I have some shitty ""anti virus"" called seguraz..."
8,asktechnology,I really like this guy who works for IT at my ...
9,asktechnology,Hello Reddit Sorry if this isn't the best pla...


In [3]:
# Removes URLs
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
# Use the vectorised `.str.replace` instead of `apply(re.sub)`: it leaves
# missing (NaN) text untouched, whereas `re.sub` raises TypeError on a
# non-string value. Compiled patterns keep Python `re` semantics.
# NOTE(review): the URL pattern also matches ')', so for markdown links of the
# form [text](url) a lone '(' appears to be left behind - confirm if that matters.
df['Text'] = df['Text'].str.replace(re.compile(url_regex), '', regex=True)

# Removes anything in brackets (generally irrelevant; our use case is in finding the proper technical support subreddit)
df['Text'] = df['Text'].str.replace(re.compile(r'\[.*?\]'), '', regex=True)

In [4]:
# Inspect the last rows as well, to check the cleaning at the far end of the frame.
df.tail(n=25)

Unnamed: 0,Subreddit,Text
107,talesfromtechsupport,"So a few years back, I was working in a manufa..."
108,talesfromtechsupport,This actually happened a couple of weeks ago. ...
109,techsupport,Figured I should write this stuff down before ...
110,techsupport,Ｕｓｉｎｇ　ａ　Ｊａｐａｎｅｓｅ　ｌａｐｔｏｐ　ｋｅｙｂｏａｒｄ　ｗｉｔｈ　ａ　ｂｕｎｃｈ　...
111,techsupport,I have been at it 6 hours trying to do it myse...
112,techsupport,## Official Malware Removal Guide ^(by:) /u/c...
113,techsupport,"Alright, /r/techsupport. Time for some real t..."
114,techsupport,"A lot of times, I had a technical problem, but..."
115,techsupport,"So around last October, a friend of mine had p..."
116,techsupport,"About two weeks ago, the thumbnails for my fav..."


In [None]:
# Tokenizing
import spacy
# `Tokenizer` lives in the `spacy.tokenizer` module; it is not exposed as
# `spacy.Tokenizer`, so it has to be imported from there.
from spacy.tokenizer import Tokenizer

nlp = spacy.load('en_core_web_lg')

tokenizer = Tokenizer(nlp.vocab)

# Extending stop words relative to our use case (technical support subreddits)
STOP_WORDS = nlp.Defaults.stop_words.union(["doesnt", "wont", "cant"])

# One list of tokens per post; must exist before the loop appends to it,
# otherwise a fresh-kernel run fails with NameError.
tokens = []

for doc in tokenizer.pipe(df['Text'].astype('unicode')):
    # Keep lower-cased tokens that are neither stop words nor punctuation.
    doc_tokens = [
        token.text.lower()
        for token in doc
        if token.text.lower() not in STOP_WORDS and not token.is_punct
    ]
    tokens.append(doc_tokens)

df['Tokens'] = tokens

df.head()

In [None]:
# Write the cleaned frame to disk, leaving the row index out of the file.
df.to_csv(path_or_buf='datasets/cleaned_data.csv', index=False)