# Parameters

In [1]:
# Semicolon-delimited input files and the folder that receives the CSV outputs
raw_data_path = "data/train.txt"
val_data_path = "data/val.txt"
destination_folder = "data"

# Passed as `train_size` to train_test_split: fraction of the raw data kept
# as the training partition
train_test_ratio = 0.1
# Not referenced by the visible cells; validation data is read from val_data_path
train_valid_ratio = 0.8

# Maximum number of words kept per text by trim_string
first_n_words = 200

# Libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# Label encoder shared by the later cells that turn the string `label`
# column into integer ids and save the learned classes
le = preprocessing.LabelEncoder()

# Preprocessing

In [3]:
def trim_string(x, n_words=None):
    """Return the first ``n_words`` whitespace-separated words of ``x``.

    Runs of whitespace (spaces, tabs, newlines) are collapsed: the kept
    words are re-joined with a single space and leading/trailing whitespace
    is discarded.

    Parameters
    ----------
    x : str
        Text to truncate.
    n_words : int, optional
        Maximum number of words to keep. When omitted, the notebook-level
        ``first_n_words`` setting is used, so one-argument callers such as
        ``Series.apply(trim_string)`` behave as before.

    Returns
    -------
    str
        The truncated text; ``''`` when ``x`` contains no words.
    """
    # Only look up the global when no explicit limit was supplied
    limit = first_n_words if n_words is None else n_words

    # maxsplit stops splitting after `limit` words; any unsplit remainder
    # ends up as one extra trailing element, which the slice discards
    words = x.split(maxsplit=limit)[:limit]

    return ' '.join(words)

In [4]:
# Load both semicolon-delimited files; they are read without a header row,
# so the two positional columns are renamed to "text" and "label"
df_raw = pd.read_csv(raw_data_path, sep=";", header=None).rename(
    columns={0: "text", 1: "label"}
)
df_val = pd.read_csv(val_data_path, sep=";", header=None).rename(
    columns={0: "text", 1: "label"}
)

In [5]:
# Preview the first rows of the validation frame
df_val.head()

Unnamed: 0,text,label
0,im feeling quite sad and sorry for myself but ...,sadness
1,i feel like i am still looking at a blank canv...,sadness
2,i feel like a faithful servant,love
3,i am just feeling cranky and blue,anger
4,i can have for a treat or if i am feeling festive,joy


In [6]:
# Prepare columns
# Fit the encoder on the training labels; this fixes the label -> integer
# mapping (and le.classes_) that every other split must reuse.
df_raw['label'] = le.fit_transform(df_raw['label']).astype('int')
df_raw = df_raw.reindex(columns=['label', 'text'])
# Drop rows whose text is shorter than 5 characters (this includes empty text)
df_raw = df_raw.drop(index=df_raw[df_raw['text'].str.len() < 5].index)
# Trim text to first_n_words
df_raw['text'] = df_raw['text'].apply(trim_string)

# Encode the validation labels with the already-fitted encoder. Using
# fit_transform here would refit `le` on the validation labels: the integer
# ids could then differ from the training mapping whenever the two label
# sets differ, and le.classes_ would describe the validation fit instead.
# transform() raises ValueError if validation contains a label that was not
# seen during fitting, which surfaces the mismatch instead of hiding it.
df_val['label'] = le.transform(df_val['label']).astype('int')
df_val = df_val.reindex(columns=['label', 'text'])
# Same length filter and trimming as for the training data
df_val = df_val.drop(index=df_val[df_val['text'].str.len() < 5].index)
df_val['text'] = df_val['text'].apply(trim_string)

In [12]:
import numpy

# Write the encoder's learned classes (le.classes_) to classes.npy in the
# current working directory
numpy.save(file='classes.npy', arr=le.classes_)

In [28]:
# Train-test split: `train_test_ratio` of the rows go to df_train, the rest
# to df_test (fixed random_state for a repeatable shuffle)
# NOTE(review): with train_test_ratio = 0.10 only 10% of the data is used
# for training -- confirm this is intended.
df_train, df_test = train_test_split(
    df_raw,
    train_size=train_test_ratio,
    random_state=1,
)

# No train-valid split is performed here: the validation frame (df_val) is
# the one loaded from val_data_path.
# Write preprocessed data
for split_name, split_frame in (
    ('train', df_train),
    ('valid', df_val),
    ('test', df_test),
):
    split_frame.to_csv(f'{destination_folder}/{split_name}.csv', index=False)