In [57]:
import easydict 
from copy import deepcopy
import numpy as np
import pandas as pd
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset

from torchtext import data

from ignite.engine import Engine
from ignite.engine import Events
from ignite.metrics import RunningAverage
from ignite.contrib.handlers.tqdm_logger import ProgressBar

import nltk
from nltk.corpus import stopwords as stopwords_nltk
from nltk.tokenize import word_tokenize

In [2]:

# config = easydict.EasyDict({ 
#     "model_fn": './models/rnn.pth', 
#     "train_fn": './data/train.1.csv',
#     "gpu_id": 0,
#     "verbose": 2,
#     "min_vocab_freq": 5,
#     "max_vocab_size": 999999,
#     "batch_size": 128,
#     "n_epochs": 10,
#     "word_vec_size": 256,
#     "dropout": 0.3,
#     "max_length": 256,
#     "rnn": True,
#     "hidden_size": 512,
#     "n_layers": 4,
# })


## Preprocessing: lowercase the text, strip non-alphanumeric characters, remove stopwords

In [58]:
def alpha_num(text):
    """Return ``text`` keeping only ASCII letters, ASCII digits, and spaces.

    Every other character (punctuation, tabs, newlines, non-ASCII letters) is
    deleted rather than replaced, so the text on either side of a removed
    character is joined together (``"well-known"`` becomes ``"wellknown"``).
    """
    disallowed = re.compile(r'[^A-Za-z0-9 ]')
    cleaned = disallowed.sub('', text)
    return cleaned


def remove_stopwords(text):
    """Drop whitespace-separated tokens of ``text`` that appear in ``stopwords``.

    Each token is compared in lowercase against the module-level ``stopwords``
    collection; tokens that survive keep their original casing. The result is
    the surviving tokens joined by single spaces, so runs of whitespace in the
    input are collapsed.
    """
    kept = [
        token.strip()
        for token in text.split()
        if token.strip().lower() not in stopwords
    ]
    return " ".join(kept)

# English stopwords looked up by remove_stopwords (which lowercases each token
# before the comparison). Built by splitting on whitespace; the result is a
# plain list of strings in the order written below.
stopwords = (
    "a about above after again against all am an and any are as "
    "at be because been before being below between both but by could "
    "did do does doing down during each few for from further had has "
    "have having he he'd he'll he's her here here's hers herself him himself "
    "his how how's i i'd i'll i'm i've if in into is it it's its itself "
    "let's me more most my myself nor of on once only or other ought our ours "
    "ourselves out over own same she she'd she'll she's should so some such than that "
    "that's the their theirs them themselves then there there's these they they'd they'll "
    "they're they've this those through to too under until up very was we we'd we'll "
    "we're we've were what what's when when's where where's which while who who's whom "
    "why why's with would you you'd you'll you're you've your yours yourself yourselves"
).split()

def remove_stopwords_nltk(text):
    """Tokenize ``text`` with ``word_tokenize`` and drop stopword tokens.

    Each token is lowercased before it is looked up in the module-level
    ``stopwords_nltk`` collection, mirroring the case-insensitive comparison
    done by ``remove_stopwords``; tokens that survive keep their original
    casing. For input that is already lowercase the result is the same as a
    case-sensitive lookup.

    Returns the surviving tokens joined by single spaces.
    """
    # assumes the entries of stopwords_nltk are lowercase -- TODO confirm
    kept = [
        token
        for token in word_tokenize(text)
        if token.lower() not in stopwords_nltk
    ]
    return " ".join(kept)

# `stopwords_nltk` starts out as the corpus object imported from nltk.corpus and
# is rebound here to the set of its English words. The guard keeps this cell
# re-runnable: once the name holds a set there is no `.words` attribute to call,
# so an unconditional second run would raise AttributeError.
if not isinstance(stopwords_nltk, set):
    stopwords_nltk = set(stopwords_nltk.words('english'))


In [59]:
# Training set, NLTK-stopword variant: lowercase the text, strip characters
# outside [A-Za-z0-9 ], remove NLTK stopwords, drop the 'index' column, and
# write the result to CSV without the DataFrame index.
train = (
    pd.read_csv('./data/train.csv', encoding='utf-8')
    .assign(
        text=lambda frame: frame['text']
        .str.lower()
        .apply(alpha_num)
        .apply(remove_stopwords_nltk)
    )
    .drop(columns=['index'])
)
train.to_csv('./data/train.nltk.csv', index=False)

In [60]:
# Test set, NLTK-stopword variant: same cleaning steps as the training set
# (lowercase, strip characters outside [A-Za-z0-9 ], remove NLTK stopwords),
# then drop the 'index' column and write to CSV without the DataFrame index.
# NOTE(review): this output name carries a '.1' suffix that the matching
# training file ('./data/train.nltk.csv') does not -- confirm that is intended.
test = (
    pd.read_csv('./data/test_x.csv', encoding='utf-8')
    .assign(
        text=lambda frame: frame['text']
        .str.lower()
        .apply(alpha_num)
        .apply(remove_stopwords_nltk)
    )
    .drop(columns=['index'])
)
test.to_csv('./data/test_x.nltk.1.csv', index=False)

In [37]:
# Training set, custom-stopword variant: lowercase the text, strip characters
# outside [A-Za-z0-9 ], remove words found in the hand-written `stopwords`
# list, drop the 'index' column, and write to CSV without the DataFrame index.
train = (
    pd.read_csv('./data/train.csv', encoding='utf-8')
    .assign(
        text=lambda frame: frame['text']
        .str.lower()
        .apply(alpha_num)
        .apply(remove_stopwords)
    )
    .drop(columns=['index'])
)
train.to_csv('./data/train.1.csv', index=False)

                                                 text  author
0   almost choking much much wanted say strange ex...       3
1                                sister asked suppose       2
2   engaged one day walked perusing janes last let...       1
3   captain porch keeping carefully way treacherou...       4
4   mercy gentlemen odin flung hands dont write an...       3
5     well fought said sooth will not charge us twice       4
6   not pay impossible considering character will ...       3
7   proper figure man atarms said little knight ma...       2
8                          not last sunday night said       0
9   must not ask cried hell may noble flames known...       4
10  unexpected piece luck data coming quickly reas...       2
11     one rogue fewer dare say observed master house       4
12  scant luggage take london little little posses...       0
13  suited odin best think odin one preferred acco...       1
14                                    no friends said       4


In [7]:
# Test set, custom-stopword variant: lowercase the text, strip characters
# outside [A-Za-z0-9 ], remove words found in the hand-written `stopwords`
# list, drop the 'index' column, and write to CSV without the DataFrame index.
test = (
    pd.read_csv('./data/test_x.csv', encoding='utf-8')
    .assign(
        text=lambda frame: frame['text']
        .str.lower()
        .apply(alpha_num)
        .apply(remove_stopwords)
    )
    .drop(columns=['index'])
)
test.to_csv('./data/test_x.1.csv', index=False)

                                                    text
0      not think one charming young ladies ever met m...
1      no replied sudden consciousness not find canno...
2      lady stated intention screaming course screame...
3      suddenly silence heard sound sent heart mouth ...
4      conviction remained unchanged far knowand beli...
...                                                  ...
19612  end another day two odin growing visibly stron...
19613  afternoon sat together mostly silence watching...
19614  odin carried thanks odin proceeded happiness l...
19615  soon upon odins leaving room mama said odin al...
19616  worse doomed man denouncer wellknown citizen a...

[19617 rows x 1 columns]
