In [1]:
import pandas as pd
import os

from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

from nltk.tokenize.treebank import TreebankWordDetokenizer
detokenizer = TreebankWordDetokenizer()

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [2]:
train_path = 'data/train/'
test_path = 'data/test/'

In [3]:
def generate_df(path, label):
    file_names = os.listdir(path)

    texts = []
    for file_name in file_names:
        if file_name.endswith('.txt'):
            with open(path + file_name, 'r') as file:
                text = file.read()

            texts.append(text)

    df = pd.DataFrame(texts, columns = ['text'])
    df['label'] = label
            
    return df

In [4]:
train_pos = generate_df(train_path + 'pos/', 1)
train_neg = generate_df(train_path + 'neg/', 0)

test_pos = generate_df(test_path + 'pos/', 1)
test_neg = generate_df(test_path + 'neg/', 0)

train_df = pd.concat([train_pos, train_neg]).reset_index(drop=True)
test_df = pd.concat([test_pos, test_neg]).reset_index(drop=True)

In [5]:
train_df.to_csv('data/train.csv', index = False)
test_df.to_csv('data/test.csv', index = False)

In [6]:
train_df.head()

Unnamed: 0,text,label
0,"After ""Beau travail"", everybody was waiting fo...",1
1,This is the best series of its type I've seen ...,1
2,There is a scene in Dan in Real Life where the...,1
3,Most war films made in the US during WWII were...,1
4,Actually one particular person/character isn't...,1


In [7]:
print('Train:')
print(train_df['label'].value_counts())

print('Test:')
print(test_df['label'].value_counts())

Train:
1    12500
0    12500
Name: label, dtype: int64
Test:
1    12500
0    12500
Name: label, dtype: int64


In [8]:
def process_text(text):
    processed_text = text.replace('<br />', ' ')
    processed_text = processed_text.replace('"', '')
    processed_text = processed_text.replace('(', '')
    processed_text = processed_text.replace(')', '')
    processed_text = processed_text.replace('!', '')
    processed_text = processed_text.replace('...', '')
    processed_text = processed_text.lower()
    
    processed_text = tokenizer.tokenize(processed_text)
    
    processed_text = [lemmatizer.lemmatize(word, "v") for word in processed_text]
    processed_text = [lemmatizer.lemmatize(word, "a") for word in processed_text]
    
    processed_text = detokenizer.detokenize(processed_text)
    
    return processed_text

In [9]:
train_df["text"] = train_df["text"].apply(process_text)
test_df["text"] = test_df["text"].apply(process_text)

In [10]:
train_df.to_csv('data/processed_train.csv', index = False)
test_df.to_csv('data/processed_test.csv', index = False)