In [31]:
import easydict 
from copy import deepcopy
import numpy as np
import pandas as pd
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset

from torchtext import data

from ignite.engine import Engine
from ignite.engine import Events
from ignite.metrics import RunningAverage
from ignite.contrib.handlers.tqdm_logger import ProgressBar

import nltk
from nltk.corpus import stopwords as stopwords_nltk
from nltk.tokenize import word_tokenize

In [27]:


example = "Family is not an important thing. It's everything."
stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example)

result = []
for w in word_tokens:
    if w not in stop_words:
        result.append(w)

print(word_tokens)
print(result)

['Family', 'is', 'not', 'an', 'important', 'thing', '.', 'It', "'s", 'everything', '.']
['Family', 'important', 'thing', '.', 'It', "'s", 'everything', '.']


In [30]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [28]:
train = pd.read_csv('./data/train.csv', encoding='utf-8')
train.head()

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [2]:

# config = easydict.EasyDict({ 
#     "model_fn": './models/rnn.pth', 
#     "train_fn": './data/train.1.csv',
#     "gpu_id": 0,
#     "verbose": 2,
#     "min_vocab_freq": 5,
#     "max_vocab_size": 999999,
#     "batch_size": 128,
#     "n_epochs": 10,
#     "word_vec_size": 256,
#     "dropout": 0.3,
#     "max_length": 256,
#     "rnn": True,
#     "hidden_size": 512,
#     "n_layers": 4,
# })


## preprocessing

In [32]:
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)


def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)

stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

def remove_stopwords_nltk(text):
    word_tokens = word_tokenize(text)
    final_text = []
    for w in word_tokens:
        if w not in stopwords_nltk:
            final_text.append(w)
    return " ".join(final_text)

stopwords_nltk = set(stopwords.words('english'))


AttributeError: 'list' object has no attribute 'words'

In [4]:
train = pd.read_csv('./data/train.csv', encoding='utf-8')
train['text'] = train['text'].str.lower().apply(alpha_num).apply(remove_stopwords)
train.drop(['index'], axis=1, inplace=True)
train.to_csv('./data/train.1.csv', index=False)

In [7]:
test = pd.read_csv('./data/test_x.csv', encoding='utf-8')
test['text'] = test['text'].str.lower().apply(alpha_num).apply(remove_stopwords)
test.drop(['index'], axis=1, inplace=True)
test.to_csv('./data/test_x.1.csv', index=False)

                                                    text
0      not think one charming young ladies ever met m...
1      no replied sudden consciousness not find canno...
2      lady stated intention screaming course screame...
3      suddenly silence heard sound sent heart mouth ...
4      conviction remained unchanged far knowand beli...
...                                                  ...
19612  end another day two odin growing visibly stron...
19613  afternoon sat together mostly silence watching...
19614  odin carried thanks odin proceeded happiness l...
19615  soon upon odins leaving room mama said odin al...
19616  worse doomed man denouncer wellknown citizen a...

[19617 rows x 1 columns]
