In [2]:
import pandas as pd
from datetime import datetime
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import time
import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import emoji_def
import Slang
import re

# Directory holding the input CSVs; file names are appended to it with plain
# string concatenation (PATH + "name.csv"). A raw string literal cannot end in
# a single backslash, so this one ends with two, leaving a doubled separator.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
PATH = r'C:\Users\Daniel\Documents\NLP Class\Project\\'

In [3]:
# Build the slang lookup from Slang.chat_words_str, which is parsed as one
# "ABBR=expansion" entry per line.
chat_words_map_dict = {}
for line in Slang.chat_words_str.split("\n"):
    parts = line.split("=")
    # Skip blank or malformed lines (no "=" separator), such as the
    # tab-separated 'QPSA?\tQue Pasa?' entry, instead of special-casing one
    # known bad line and raising IndexError on any other.
    if len(parts) < 2:
        continue
    chat_words_map_dict[parts[0]] = parts[1]
# Set of known abbreviations, used for membership tests during conversion.
chat_words_list = set(chat_words_map_dict)

# NLTK resources needed for stop words, lemmatisation and POS tagging.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
train_data = pd.read_csv(PATH + "train_small_same_date.csv")
valid_data = pd.read_csv(PATH + "test_small_same_date.csv")

# Missing bodies are read as NaN; a bare astype(str) would turn them into the
# literal token "nan", which would then flow through every later step as if it
# were real comment text. Blank them first so empty comments stay empty.
train_data['body'] = train_data['body'].fillna("").astype(str)
valid_data['body'] = valid_data['body'].fillna("").astype(str)

# Preprocessing guidance from https://www.kaggle.com/sudalairajkumar/getting-started-with-text-preprocessing

# lowercase
train_data["body_lower"] = train_data["body"].str.lower()
valid_data["body_lower"] = valid_data["body"].str.lower()

# remove punctuation
PUNCT_TO_REMOVE = string.punctuation

def remove_punctuation(text):
    """Return ``text`` with every character listed in PUNCT_TO_REMOVE deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = {ord(symbol): None for symbol in PUNCT_TO_REMOVE}
    return text.translate(deletion_table)

# URLs have to be stripped *before* punctuation: once "://" and "." are gone,
# the later remove_urls pass (which looks for "http(s)://" or "www.") can no
# longer recognise a link, so its fragments would leak into the cleaned text.
_URL_BEFORE_PUNCT = re.compile(r'https?://\S+|www\.\S+')


def _strip_urls_and_punctuation(text):
    """Drop URLs from ``text`` and then remove punctuation."""
    return remove_punctuation(_URL_BEFORE_PUNCT.sub('', text))


train_data["body_lower_no_punc"] = train_data["body_lower"].apply(_strip_urls_and_punctuation)
valid_data["body_lower_no_punc"] = valid_data["body_lower"].apply(_strip_urls_and_punctuation)

# Remove stop words

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` that is in STOPWORDS.

    ``text`` is coerced with ``str()`` first; surviving tokens are re-joined
    with single spaces.
    """
    kept_tokens = filter(lambda token: token not in STOPWORDS, str(text).split())
    return " ".join(kept_tokens)

# Filter stop words out of both splits.
for frame in (train_data, valid_data):
    frame["body_lower_no_punc_stop"] = frame["body_lower_no_punc"].apply(remove_stopwords)

# Convert Emojis
# emoji definition source: https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py



# def convert_emojis(text):
#     for emot in emoji_def.UNICODE_EMO:
#         text = re.sub(re.escape(r'('+emot+')'), "_".join(emoji_def.UNICODE_EMO[emot].replace(",","").replace(":","").split()), text)
#     return text

# Compiled once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"  # dingbats
                            u"\U000024C2"             # circled M
                            u"\U00002600-\U000026FF"  # miscellaneous symbols
                            u"\U00002B05-\U00002B55"  # arrows, squares, star
                            u"\U0000FE0F"             # emoji variation selector
                            u"\U0001F004\U0001F0CF"   # mahjong tile, joker card
                            u"\U0001F170-\U0001F251"  # enclosed alphanumerics / ideographs
                            "]+", flags=re.UNICODE)


def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    The earlier pattern used the single range U+24C2..U+1F251, which also
    covers CJK ideographs, Hangul and many other scripts, so ordinary
    non-Latin text was deleted together with the emoji. That range is replaced
    here by the emoji-related sub-ranges it contains.

    Note: the parameter name shadows the ``string`` module inside this
    function; it is kept unchanged for backward compatibility.
    """
    return _EMOJI_PATTERN.sub(r'', string)

for frame in (train_data, valid_data):
    frame["body_lower_no_punc_stop_emoj"] = frame["body_lower_no_punc_stop"].apply(remove_emoji)

# train_data["body_lower_no_punc_stop_emoj"] = train_data["body_lower_no_punc_stop"]
# valid_data["body_lower_no_punc_stop_emoj"] = valid_data["body_lower_no_punc_stop"]

# lemmatization

# Maps the first letter of an nltk.pos_tag tag to the matching WordNet POS constant.
wordnet_map = dict(N=wordnet.NOUN, V=wordnet.VERB, J=wordnet.ADJ, R=wordnet.ADV)
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Tokens are POS-tagged with ``nltk.pos_tag``; the first letter of each tag
    is looked up in ``wordnet_map`` (falling back to ``wordnet.NOUN``) and
    passed to the lemmatizer. Lemmas are re-joined with single spaces.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

# Lemmatize the emoji-free text of both splits.
for frame in (train_data, valid_data):
    frame["body_lower_no_punc_stop_emoj_lem"] = frame["body_lower_no_punc_stop_emoj"].apply(lemmatize_words)

# remove URL

def remove_urls(text):
    """Delete ``http(s)://`` links and ``www.``-prefixed addresses from ``text``."""
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# Run URL removal over the lemmatized text of both splits.
for frame in (train_data, valid_data):
    frame["body_lower_no_punc_stop_emoj_lem_url"] = frame["body_lower_no_punc_stop_emoj_lem"].apply(remove_urls)

# Convert slang to text

# Build the slang lookup from Slang.chat_words_str, which is parsed as one
# "ABBR=expansion" entry per line.
chat_words_map_dict = {}
for line in Slang.chat_words_str.split("\n"):
    parts = line.split("=")
    # Skip blank or malformed lines (no "=" separator), such as the
    # tab-separated 'QPSA?\tQue Pasa?' entry, instead of special-casing one
    # known bad line and raising IndexError on any other.
    if len(parts) < 2:
        continue
    chat_words_map_dict[parts[0]] = parts[1]
# Set of known abbreviations, used for membership tests during conversion.
chat_words_list = set(chat_words_map_dict)

def chat_words_conversion(text):
    """Expand known chat abbreviations in ``text``.

    Each whitespace-separated token is upper-cased and, if present in
    ``chat_words_list``, replaced by its entry in ``chat_words_map_dict``;
    other tokens pass through unchanged. Tokens are re-joined with spaces.
    """
    def expand(token):
        key = token.upper()
        return chat_words_map_dict[key] if key in chat_words_list else token

    return " ".join(expand(token) for token in text.split())

# Expand slang in both splits.
for frame in (train_data, valid_data):
    frame["body_lower_no_punc_stop_emoj_lem_url_slang"] = frame["body_lower_no_punc_stop_emoj_lem_url"].apply(chat_words_conversion)

# Persist the cleaned splits (relative paths, so they land in the working directory).
train_data.to_csv("train_small_same_date_clean.csv")
valid_data.to_csv("test_small_same_date_clean.csv")

KeyboardInterrupt: 

In [25]:
train_data = pd.read_csv(PATH + "train_big_same_date.csv")
valid_data = pd.read_csv(PATH + "test_big_same_date.csv")

# Missing bodies are read as NaN; a bare astype(str) would turn them into the
# literal token "nan", which would then flow through every later step as if it
# were real comment text. Blank them first so empty comments stay empty.
train_data['body'] = train_data['body'].fillna("").astype(str)
valid_data['body'] = valid_data['body'].fillna("").astype(str)

# Preprocessing guidance from https://www.kaggle.com/sudalairajkumar/getting-started-with-text-preprocessing

# lowercase
train_data["body_lower"] = train_data["body"].str.lower()
valid_data["body_lower"] = valid_data["body"].str.lower()

# remove punctuation
PUNCT_TO_REMOVE = string.punctuation

def remove_punctuation(text):
    """Return ``text`` with every character listed in PUNCT_TO_REMOVE deleted."""
    # A translation table whose values are None deletes the mapped characters.
    strip_table = str.maketrans(dict.fromkeys(PUNCT_TO_REMOVE))
    return text.translate(strip_table)

# URLs have to be stripped *before* punctuation: once "://" and "." are gone,
# the later remove_urls pass (which looks for "http(s)://" or "www.") can no
# longer recognise a link, so its fragments would leak into the cleaned text.
_URL_BEFORE_PUNCT = re.compile(r'https?://\S+|www\.\S+')


def _strip_urls_and_punctuation(text):
    """Drop URLs from ``text`` and then remove punctuation."""
    return remove_punctuation(_URL_BEFORE_PUNCT.sub('', text))


train_data["body_lower_no_punc"] = train_data["body_lower"].apply(_strip_urls_and_punctuation)
valid_data["body_lower_no_punc"] = valid_data["body_lower"].apply(_strip_urls_and_punctuation)

# Remove stop words
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` that is in STOPWORDS.

    ``text`` is coerced with ``str()`` first; surviving tokens are re-joined
    with single spaces.
    """
    survivors = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        survivors.append(token)
    return " ".join(survivors)

# Filter stop words out of both splits.
for split in (train_data, valid_data):
    split["body_lower_no_punc_stop"] = split["body_lower_no_punc"].apply(remove_stopwords)

# Convert Emojis
# emoji definition source: https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py



# def convert_emojis(text):
#     for emot in emoji_def.UNICODE_EMO:
#         text = re.sub(re.escape(r'('+emot+')'), "_".join(emoji_def.UNICODE_EMO[emot].replace(",","").replace(":","").split()), text)
#     return text

# Compiled once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"  # dingbats
                            u"\U000024C2"             # circled M
                            u"\U00002600-\U000026FF"  # miscellaneous symbols
                            u"\U00002B05-\U00002B55"  # arrows, squares, star
                            u"\U0000FE0F"             # emoji variation selector
                            u"\U0001F004\U0001F0CF"   # mahjong tile, joker card
                            u"\U0001F170-\U0001F251"  # enclosed alphanumerics / ideographs
                            "]+", flags=re.UNICODE)


def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    The earlier pattern used the single range U+24C2..U+1F251, which also
    covers CJK ideographs, Hangul and many other scripts, so ordinary
    non-Latin text was deleted together with the emoji. That range is replaced
    here by the emoji-related sub-ranges it contains.

    Note: the parameter name shadows the ``string`` module inside this
    function; it is kept unchanged for backward compatibility.
    """
    return _EMOJI_PATTERN.sub(r'', string)

for split in (train_data, valid_data):
    split["body_lower_no_punc_stop_emoj"] = split["body_lower_no_punc_stop"].apply(remove_emoji)

# train_data["body_lower_no_punc_stop_emoj"] = train_data["body_lower_no_punc_stop"]
# valid_data["body_lower_no_punc_stop_emoj"] = valid_data["body_lower_no_punc_stop"]

# lemmatization
# Maps the first letter of an nltk.pos_tag tag to the matching WordNet POS constant.
wordnet_map = dict(N=wordnet.NOUN, V=wordnet.VERB, J=wordnet.ADJ, R=wordnet.ADV)
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Tokens are POS-tagged with ``nltk.pos_tag``; the first letter of each tag
    is looked up in ``wordnet_map`` (falling back to ``wordnet.NOUN``) and
    passed to the lemmatizer. Lemmas are re-joined with single spaces.
    """
    tagged_tokens = nltk.pos_tag(text.split())

    def to_lemma(pair):
        token, tag = pair
        return lemmatizer.lemmatize(token, wordnet_map.get(tag[0], wordnet.NOUN))

    return " ".join(map(to_lemma, tagged_tokens))

# Lemmatize the emoji-free text of both splits.
for split in (train_data, valid_data):
    split["body_lower_no_punc_stop_emoj_lem"] = split["body_lower_no_punc_stop_emoj"].apply(lemmatize_words)

# remove URL

def remove_urls(text):
    """Delete ``http(s)://`` links and ``www.``-prefixed addresses from ``text``."""
    link_regex = re.compile(r'https?://\S+|www\.\S+')
    without_links = link_regex.sub('', text)
    return without_links

# Run URL removal over the lemmatized text of both splits.
for split in (train_data, valid_data):
    split["body_lower_no_punc_stop_emoj_lem_url"] = split["body_lower_no_punc_stop_emoj_lem"].apply(remove_urls)

# Convert slang to text

# Build the slang lookup from Slang.chat_words_str, which is parsed as one
# "ABBR=expansion" entry per line.
chat_words_map_dict = {}
for line in Slang.chat_words_str.split("\n"):
    parts = line.split("=")
    # Skip blank or malformed lines (no "=" separator), such as the
    # tab-separated 'QPSA?\tQue Pasa?' entry, instead of special-casing one
    # known bad line and raising IndexError on any other.
    if len(parts) < 2:
        continue
    chat_words_map_dict[parts[0]] = parts[1]
# Set of known abbreviations, used for membership tests during conversion.
chat_words_list = set(chat_words_map_dict)

def chat_words_conversion(text):
    """Expand known chat abbreviations in ``text``.

    Each whitespace-separated token is upper-cased and, if present in
    ``chat_words_list``, replaced by its entry in ``chat_words_map_dict``;
    other tokens pass through unchanged. Tokens are re-joined with spaces.
    """
    return " ".join(
        chat_words_map_dict[token.upper()] if token.upper() in chat_words_list else token
        for token in text.split()
    )

# Expand slang in both splits.
for split in (train_data, valid_data):
    split["body_lower_no_punc_stop_emoj_lem_url_slang"] = split["body_lower_no_punc_stop_emoj_lem_url"].apply(chat_words_conversion)

# Persist the cleaned splits (relative paths, so they land in the working directory).
train_data.to_csv("train_big_same_date_clean.csv")
valid_data.to_csv("test_big_same_date_clean.csv")

In [26]:
train_data = pd.read_csv(PATH + "train_dev_same_date.csv")
valid_data = pd.read_csv(PATH + "test_dev_same_date.csv")

# Missing bodies are read as NaN; a bare astype(str) would turn them into the
# literal token "nan", which would then flow through every later step as if it
# were real comment text. Blank them first so empty comments stay empty.
train_data['body'] = train_data['body'].fillna("").astype(str)
valid_data['body'] = valid_data['body'].fillna("").astype(str)

# Preprocessing guidance from https://www.kaggle.com/sudalairajkumar/getting-started-with-text-preprocessing

# lowercase
train_data["body_lower"] = train_data["body"].str.lower()
valid_data["body_lower"] = valid_data["body"].str.lower()

# remove punctuation
PUNCT_TO_REMOVE = string.punctuation

def remove_punctuation(text):
    """Return ``text`` with every character listed in PUNCT_TO_REMOVE deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_map = {}
    for symbol in PUNCT_TO_REMOVE:
        drop_map[ord(symbol)] = None
    return text.translate(drop_map)

# URLs have to be stripped *before* punctuation: once "://" and "." are gone,
# the later remove_urls pass (which looks for "http(s)://" or "www.") can no
# longer recognise a link, so its fragments would leak into the cleaned text.
_URL_BEFORE_PUNCT = re.compile(r'https?://\S+|www\.\S+')


def _strip_urls_and_punctuation(text):
    """Drop URLs from ``text`` and then remove punctuation."""
    return remove_punctuation(_URL_BEFORE_PUNCT.sub('', text))


train_data["body_lower_no_punc"] = train_data["body_lower"].apply(_strip_urls_and_punctuation)
valid_data["body_lower_no_punc"] = valid_data["body_lower"].apply(_strip_urls_and_punctuation)

# Remove stop words
def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` that is in STOPWORDS.

    ``text`` is coerced with ``str()`` first; surviving tokens are re-joined
    with single spaces.
    """
    tokens = str(text).split()
    return " ".join(token for token in tokens if not (token in STOPWORDS))

# Filter stop words out of both splits.
for dataset in (train_data, valid_data):
    dataset["body_lower_no_punc_stop"] = dataset["body_lower_no_punc"].apply(remove_stopwords)

# Convert Emojis
# emoji definition source: https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py



# def convert_emojis(text):
#     for emot in emoji_def.UNICODE_EMO:
#         text = re.sub(re.escape(r'('+emot+')'), "_".join(emoji_def.UNICODE_EMO[emot].replace(",","").replace(":","").split()), text)
#     return text

# Compiled once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"  # dingbats
                            u"\U000024C2"             # circled M
                            u"\U00002600-\U000026FF"  # miscellaneous symbols
                            u"\U00002B05-\U00002B55"  # arrows, squares, star
                            u"\U0000FE0F"             # emoji variation selector
                            u"\U0001F004\U0001F0CF"   # mahjong tile, joker card
                            u"\U0001F170-\U0001F251"  # enclosed alphanumerics / ideographs
                            "]+", flags=re.UNICODE)


def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    The earlier pattern used the single range U+24C2..U+1F251, which also
    covers CJK ideographs, Hangul and many other scripts, so ordinary
    non-Latin text was deleted together with the emoji. That range is replaced
    here by the emoji-related sub-ranges it contains.

    Note: the parameter name shadows the ``string`` module inside this
    function; it is kept unchanged for backward compatibility.
    """
    return _EMOJI_PATTERN.sub(r'', string)

for dataset in (train_data, valid_data):
    dataset["body_lower_no_punc_stop_emoj"] = dataset["body_lower_no_punc_stop"].apply(remove_emoji)

# train_data["body_lower_no_punc_stop_emoj"] = train_data["body_lower_no_punc_stop"]
# valid_data["body_lower_no_punc_stop_emoj"] = valid_data["body_lower_no_punc_stop"]


# lemmatization
# Maps the first letter of an nltk.pos_tag tag to the matching WordNet POS constant.
wordnet_map = dict(N=wordnet.NOUN, V=wordnet.VERB, J=wordnet.ADJ, R=wordnet.ADV)
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Tokens are POS-tagged with ``nltk.pos_tag``; the first letter of each tag
    is looked up in ``wordnet_map`` (falling back to ``wordnet.NOUN``) and
    passed to the lemmatizer. Lemmas are re-joined with single spaces.
    """
    pieces = []
    for token, tag in nltk.pos_tag(text.split()):
        pieces.append(lemmatizer.lemmatize(token, wordnet_map.get(tag[0], wordnet.NOUN)))
    return " ".join(pieces)

# Lemmatize the emoji-free text of both splits.
for dataset in (train_data, valid_data):
    dataset["body_lower_no_punc_stop_emoj_lem"] = dataset["body_lower_no_punc_stop_emoj"].apply(lemmatize_words)

# remove URL

def remove_urls(text):
    """Delete ``http(s)://`` links and ``www.``-prefixed addresses from ``text``."""
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# Run URL removal over the lemmatized text of both splits.
for dataset in (train_data, valid_data):
    dataset["body_lower_no_punc_stop_emoj_lem_url"] = dataset["body_lower_no_punc_stop_emoj_lem"].apply(remove_urls)

# Convert slang to text

def chat_words_conversion(text):
    """Expand known chat abbreviations in ``text``.

    Each whitespace-separated token is upper-cased and, if present in
    ``chat_words_list``, replaced by its entry in ``chat_words_map_dict``;
    other tokens pass through unchanged. Tokens are re-joined with spaces.
    """
    def expand(token):
        key = token.upper()
        if key in chat_words_list:
            return chat_words_map_dict[key]
        return token

    return " ".join(map(expand, text.split()))

# Expand slang in both splits.
for dataset in (train_data, valid_data):
    dataset["body_lower_no_punc_stop_emoj_lem_url_slang"] = dataset["body_lower_no_punc_stop_emoj_lem_url"].apply(chat_words_conversion)

# Persist the cleaned splits (relative paths, so they land in the working directory).
train_data.to_csv("train_dev_same_date_clean.csv")
valid_data.to_csv("test_dev_same_date_clean.csv")

In [27]:
# Preview the first rows of train_data to sanity-check the derived text columns.
train_data.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 0_x,body,created_utc,id,kind,parent,subreddit,utc_raw,utc_dt,...,volume,move,month,body_lower,body_lower_no_punc,body_lower_no_punc_stop,body_lower_no_punc_stop_emoj,body_lower_no_punc_stop_emoj_lem,body_lower_no_punc_stop_emoj_lem_url,body_lower_no_punc_stop_emoj_lem_url_slang
0,145755,166303,He sold for a 10 mil profit already. I like DV...,2021-03-29T19:58:14Z,gsr28j8,t2,t1_gsqpxip,GME,1617066000.0,2021-03-29 19:58:14+00:00,...,10075068.0,1,3,he sold for a 10 mil profit already. i like dv...,he sold for a 10 mil profit already i like dvf...,sold 10 mil profit already like dvf dont under...,sold 10 mil profit already like dvf dont under...,sell 10 mil profit already like dvf dont under...,sell 10 mil profit already like dvf dont under...,sell 10 mil profit already like dvf dont under...
1,406448,443440,"Honestly, this shit got me up earlier, and goi...",2021-04-01T19:25:47Z,gt3bxxc,t2,t3_mi3eli,GME,1617323000.0,2021-04-01 19:25:47+00:00,...,9351417.0,-1,4,"honestly, this shit got me up earlier, and goi...",honestly this shit got me up earlier and going...,honestly shit got earlier going bed earlier pu...,honestly shit got earlier going bed earlier pu...,honestly shit get earlier go bed earlier put u...,honestly shit get earlier go bed earlier put u...,honestly shit get earlier go bed earlier put u...
2,226054,673216,I appreciate the confidence. I will try this w...,2021-03-11T19:02:09Z,gqn11oa,t2,t1_gqmz84n,GME,1615511000.0,2021-03-11 19:02:09+00:00,...,28402777.0,1,3,i appreciate the confidence. i will try this w...,i appreciate the confidence i will try this wh...,appreciate confidence try next see,appreciate confidence try next see,appreciate confidence try next see,appreciate confidence try next see,appreciate confidence try next see
3,319915,1091065,Insider knowledge. Shill idiot doesn't realize...,2021-03-03T17:12:10Z,gplau59,t2,t3_lwuqgq,GME,1614813000.0,2021-03-03 17:12:10+00:00,...,19325198.0,1,3,insider knowledge. shill idiot doesn't realize...,insider knowledge shill idiot doesnt realize t...,insider knowledge shill idiot doesnt realize c...,insider knowledge shill idiot doesnt realize c...,insider knowledge shill idiot doesnt realize c...,insider knowledge shill idiot doesnt realize c...,insider knowledge shill idiot doesnt realize c...
4,400977,259725,Mods r fuk 🌈🐻,2021-04-05T17:06:11Z,gti44np,t2,t3_mkizw4,GME,1617660000.0,2021-04-05 17:06:11+00:00,...,14223605.0,1,4,mods r fuk 🌈🐻,mods r fuk 🌈🐻,mods r fuk 🌈🐻,mods r fuk,mod r fuk,mod r fuk,mod r fuk
