In [20]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import re

In [21]:
import nltk
# Make sure the NLTK stop-word corpus is available locally; the trailing
# semicolon keeps the call's return value out of the cell output.
nltk.download('stopwords');

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1. Read in Raw Data

In [22]:
# Load the two raw CSV files shipped with the project.
# NOTE: the frame called `test` is read from valid.csv.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/valid.csv')

# Number of rows in each frame.
len(train), len(test)

(45000, 15000)

In [23]:
# Show the first two rows to check which columns were loaded.
train.head(2)

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552656,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>,2016-01-01 00:21:59,LQ_CLOSE
1,34553034,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>,2016-01-01 02:03:20,HQ


In [24]:
# train.loc[0].CreationDate

In [25]:
# all_data = pd.concat([train, test]).reset_index(drop = True)

In [26]:
# all_data.shape

In [27]:
# train.query('Y == "HQ"').shape[0], test.query('Y == "HQ"').shape[0]

In [28]:
# train.query('Y == "LQ_CLOSE"').shape[0], test.query('Y == "LQ_CLOSE"').shape[0]

In [29]:
# train.query('Y == "LQ_EDIT"').shape[0], test.query('Y == "LQ_EDIT"').shape[0]

# 2. Data Cleaning

## 2.1 Check NaN

In [30]:
# Count missing values in every column of the training frame.
train.isna().sum()

Id              0
Title           0
Body            0
Tags            0
CreationDate    0
Y               0
dtype: int64

In [31]:
# Count missing values in every column of the held-out frame.
test.isna().sum()

Id              0
Title           0
Body            0
Tags            0
CreationDate    0
Y               0
dtype: int64

## 2.2 Clean Label

In [32]:
# Integer class id for each quality label that appears in the `Y` column.
label_dict = dict(zip(('LQ_CLOSE', 'LQ_EDIT', 'HQ'), range(3)))

In [33]:
# Translate each string label into its integer class id; an unknown label
# raises KeyError rather than being silently turned into NaN.
train['cleaned_y'] = train['Y'].apply(lambda label: label_dict[label])
test['cleaned_y'] = test['Y'].apply(lambda label: label_dict[label])

## 2.3 Clean Title

In [34]:
# Lower-case every question title in both frames.
train['Title'] = train['Title'].apply(lambda title: title.lower())
test['Title'] = test['Title'].apply(lambda title: title.lower())

In [35]:
from nltk import word_tokenize
from nltk.corpus import stopwords
# English stop-word list, read once from the NLTK corpus.
stop_words = stopwords.words('english')

def remove_stopwords(string, stopword_set=None):
    """Lower-case ``string`` and drop every stop word from it.

    Parameters
    ----------
    string : str
        Text to filter. It is split on whitespace, so runs of spaces, tabs
        and newlines are collapsed.
    stopword_set : iterable of str, optional
        Words to remove. When omitted, the module-level ``stop_words`` list
        is used, so the NLTK corpus is not re-read on every call.

    Returns
    -------
    str
        The remaining lower-cased words joined by single spaces (an empty
        string when nothing is left).
    """
    if stopword_set is None:
        stopword_set = stop_words
    # A set gives O(1) membership tests instead of scanning a list per word.
    blocked = frozenset(stopword_set)
    # Build a new list instead of calling list.remove() while iterating:
    # removing during iteration skips the element after each removed word,
    # which left every second consecutive stop word in the text.
    kept = [
        word
        for word in (token.lower() for token in string.split())
        if word not in blocked
    ]
    return ' '.join(kept)

def clean_text(text):
    """Normalise a title or body string for bag-of-words style features.

    Steps, in order:

    1. lower-case the text and map the typographic apostrophe to ``'``;
    2. drop URLs;
    3. expand common English contractions ("don't" -> "do not");
    4. replace every ASCII digit with the token ``digit``;
    5. turn every remaining non-word character into a space;
    6. drop stand-alone single letters;
    7. remove stop words via ``remove_stopwords``.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned text as returned by ``remove_stopwords``.
    """
    text = text.lower()
    text = text.replace("\u2019", "'")

    # URLs have to go while "://" and "." still delimit them, i.e. before
    # punctuation is stripped. (\S+, not \s+: consume the non-space run.)
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)

    # Contractions must be expanded before the apostrophes are removed.
    # The suffix rules require a word character in front of the apostrophe
    # so quoted code such as 'test' or 'data' is left alone.
    contraction_rules = (
        (r"\bwon't\b", "will not"),
        (r"\bcan't\b", "can not"),
        (r"\bdon't\b", "do not"),
        (r"\bdont\b", "do not"),
        (r"(?<=\w)n't\b", " not"),
        (r"(?<=\w)'re\b", " are"),
        (r"(?<=\w)'s\b", " is"),
        # This rule used to be r"\d", which rewrote every digit as " would".
        (r"(?<=\w)'d\b", " would"),
        (r"(?<=\w)'ll\b", " will"),
        (r"(?<=\w)'t\b", " not"),
        (r"(?<=\w)'ve\b", " have"),
        (r"(?<=\w)'m\b", " am"),
    )
    for pattern, expansion in contraction_rules:
        text = re.sub(pattern, expansion, text)

    # One "digit" token per digit character.
    text = re.sub(r"[0-9]", "digit", text)

    # Everything that is not a letter, digit or underscore becomes a space;
    # this also covers newlines, quotes and the punctuation classes.
    text = re.sub(r"\W", " ", text)

    # Drop single letters surrounded by whitespace, then one at the very
    # start of the string (anchor ^, not a literal caret).
    text = re.sub(r"\s+[a-zA-Z]\s+", " ", text)
    text = re.sub(r"^[a-zA-Z]\s+", " ", text)

    return remove_stopwords(text)

In [36]:
# Run the shared text cleaner over every title.
train['Title'] = train['Title'].apply(clean_text)
test['Title'] = test['Title'].apply(clean_text)

## 2.4 Clean Body

In [37]:
import re

In [38]:
# Strip anything that looks like an HTML tag (<...>) from the question body.
# Tags are replaced by the empty string, so text on either side is joined.
train['body_notag'] = train['Body'].apply(lambda html: re.sub('<[^>]+>', '', html))
test['body_notag'] = test['Body'].apply(lambda html: re.sub('<[^>]+>', '', html))

In [39]:
# Run the shared text cleaner over every tag-free body.
train['body_notag'] = train['body_notag'].apply(clean_text)
test['body_notag'] = test['body_notag'].apply(clean_text)

## 2.5 Combine Body and Title

In [40]:
# Concatenate cleaned title and cleaned body, separated by one space.
train['all_text'] = train['Title'] + ' ' + train['body_notag']
test['all_text'] = test['Title'] + ' ' + test['body_notag']

# 3. Feature Engineering

## 3.1. Length of the Title & Question

In [41]:
# Number of whitespace-separated tokens in each cleaned title.
train['title_length'] = train['Title'].apply(lambda title: len(title.split()))
test['title_length'] = test['Title'].apply(lambda title: len(title.split()))

In [42]:
# Number of whitespace-separated tokens in each cleaned body.
train['body_length'] = train['body_notag'].apply(lambda body: len(body.split()))
test['body_length'] = test['body_notag'].apply(lambda body: len(body.split()))

# 4. Train-Val-Test Split

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
# 45000 * 0.22222222

In [45]:
# Hold out 0.22222222 of the 45,000 training rows (10,000 rows) for validation.
# Only the frame is split: it already carries the label columns, so splitting
# `train['Y']` as a second array and discarding both halves into `_` was
# redundant and shadowed IPython's `_` last-output variable. The row
# assignment depends only on the row count and `random_state`, so the split
# itself is unchanged.
train_df, val_df = train_test_split(train, test_size=0.22222222, random_state=42)

In [46]:
# Check the (rows, columns) shape of the training and validation splits.
train_df.shape, val_df.shape

((35000, 11), (10000, 11))

In [49]:
# Persist the three splits as pickles. Each file is opened in a `with` block
# so its handle is flushed and closed as soon as the dump finishes, instead
# of leaving anonymous open() handles for the garbage collector.
for frame, path in (
    (train_df, '../data/train_df.pkl'),
    (val_df, '../data/val_df.pkl'),
    (test, '../data/test_df.pkl'),
):
    with open(path, 'wb') as fh:
        pickle.dump(frame, fh)