In [1]:
import pandas as pd
import numpy as np
import math

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Value passed as `test_size` to get_train_test_split; must lie strictly
# between 0 and 1.
TEST_SIZE = 0.2
# CSV file that read_csv_file loads as the input data set.
DATASET = 'datasets/datasetall.csv'

# The two "Saving Data" cells only write their CSV files when this is True.
SAVE_DATA = False
# Output paths for the train and test splits respectively.
SAVE_TRAIN_DATASET_FILENAME = 'datasets/dataset-train.csv'
SAVE_TEST_DATASET_FILENAME = 'datasets/dataset-test.csv'

In [3]:
def read_csv_file(filename: str) -> pd.DataFrame:
    """
    Load the first two columns of a CSV file into a DataFrame.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The parsed data, restricted to the file's first two columns.

    Raises
    ------
    FileNotFoundError
        If `filename` does not exist. The error message is printed first and
        the original exception is then re-raised.
    """
    try:
        # Records are split on '\n' only; usecols=range(2) keeps the columns
        # at positions 0 and 1 and ignores any further columns.
        data = pd.read_csv(filename, lineterminator='\n', usecols=range(2))
    except FileNotFoundError:
        print("ERROR: File not found")
        # Re-raise instead of calling exit(): exit() is an interactive-session
        # helper meant to end the interpreter/kernel, not a way to report an
        # error. Raising stops the cell with a clear traceback and never lets
        # the function fall through and hand back None.
        raise

    # Kept outside the try block so only the read itself is guarded.
    print("CSV file read successfully!")
    return data

# Load the CSV named by DATASET; the bare name on the last line makes the
# notebook render the resulting frame as the cell output.
dataset = read_csv_file(DATASET)
dataset

CSV file read successfully!


Unnamed: 0,text,label
0,Binay: Patuloy ang kahirapan dahil sa maling p...,0
1,SA GOBYERNONG TAPAT WELCOME SA BAGUO ANG LAHAT...,0
2,wait so ur telling me Let Leni Lead mo pero NY...,1
3,[USERNAME]wish this is just a nightmare that ...,0
4,doc willie ong and isko sabunutan po,0
...,...,...
28456,"Bisaya, Probinsyano/a, mostly Bisaya = katulong",1
28457,Amnesia. In my whole life wala pa ako nakasala...,1
28458,Kontrabida na ilang beses na tinalo at obvious...,1
28459,Yung antagonist laging kailangang sobrang sama...,1


In [4]:
# Number of rows per label in the full data set, smallest count first.
dataset['label'].value_counts(ascending=True)

label
0    14115
1    14346
Name: count, dtype: int64

In [5]:
# Seed for the notebook-wide shuffling generator. A fixed integer makes every
# "Restart Kernel -> Run All" produce the same shuffles (and therefore the
# same train/test split); set it to None to draw fresh OS entropy instead.
RANDOM_SEED = 0
random_number_generator = np.random.default_rng(seed=RANDOM_SEED)


def shuffle_data_frame(data_frame, rng=None):
    """
    Return a new frame holding the rows of `data_frame` in random order.

    Only the 'text' and 'label' columns are kept, and the result carries a
    fresh 0..n-1 index. The input frame is not modified.

    Parameters
    ----------
    data_frame : pd.DataFrame
        Frame with at least the columns 'text' and 'label'.
    rng : np.random.Generator, optional
        Generator used for the shuffle. Defaults to the module-level
        `random_number_generator`, whose state advances on every call.

    Returns
    -------
    pd.DataFrame
        The shuffled rows, with columns 'text' and 'label'.

    Raises
    ------
    KeyError
        If `data_frame` lacks a 'text' or 'label' column.
    """
    if rng is None:
        rng = random_number_generator

    # Shuffle a plain list of row positions, then reorder the rows in a single
    # positional lookup instead of copying the values one item at a time.
    indices = list(range(len(data_frame)))
    rng.shuffle(indices)

    return (
        data_frame[['text', 'label']]
        .iloc[indices]
        .reset_index(drop=True)
    )


def _split_class_rows(class_rows: pd.DataFrame, train_size: float):
    """Cut one class's rows into a leading train part and a trailing test part."""
    train_count = math.ceil(len(class_rows) * train_size)

    class_train = class_rows.iloc[:train_count]
    class_test = class_rows.iloc[train_count:]

    assert len(class_train) + len(class_test) == len(class_rows)
    return class_train, class_test


def get_train_test_split(data_frame: pd.DataFrame, test_size: float):
    """
    Make a stratified train/test split.

    The rows are shuffled, then every distinct value of the 'label' column is
    split separately so that each class contributes
    ceil(class_size * (1 - test_size)) rows to the train set and the rest to
    the test set. This aims to preserve the distribution between classes.
    Rows whose label is missing (NaN) are left out of both splits.

    Parameters
    ----------
    data_frame : pd.DataFrame
        Frame with the columns 'text' and 'label'.
    test_size : float
        Fraction of each class to hold out; must satisfy 0 < test_size < 1.

    Returns
    -------
    tuple of pd.Series
        (train text, test text, train labels, test labels), each indexed
        0..n-1 within its own split.

    Raises
    ------
    ValueError
        If `test_size` is not strictly between 0 and 1.
    """
    if not (0 < test_size < 1):
        # Raise instead of printing and returning None: callers unpack four
        # values, so a None return only surfaced later as an unrelated
        # "cannot unpack" TypeError.
        raise ValueError(
            f'test_size must be strictly between 0 and 1, got {test_size!r}'
        )

    train_size = 1 - test_size

    # Shuffle first so the leading rows taken from each class are a random
    # sample of that class.
    data_frame = shuffle_data_frame(data_frame)

    train_parts = []
    test_parts = []
    # groupby visits the labels in sorted order and keeps the shuffled row
    # order inside each class, so labels 0 and 1 are handled exactly as the
    # former hard-coded branches did, and any further label is stratified too.
    for _, class_rows in data_frame.groupby('label', sort=True):
        class_train, class_test = _split_class_rows(class_rows, train_size)
        train_parts.append(class_train)
        test_parts.append(class_test)

    if not train_parts:
        # No labelled rows at all: pd.concat rejects an empty list, so fall
        # back to empty frames that still carry the expected columns.
        no_rows = data_frame.iloc[0:0]
        train_parts, test_parts = [no_rows], [no_rows]

    # The per-class pieces are concatenated class by class; shuffle again so
    # the classes are interleaved in the final splits.
    shuffled_train = shuffle_data_frame(pd.concat(train_parts))
    shuffled_test = shuffle_data_frame(pd.concat(test_parts))

    return (
        shuffled_train['text'],
        shuffled_test['text'],
        shuffled_train['label'],
        shuffled_test['label'],
    )

In [6]:
# Split the loaded data into train/test text (X_*) and labels (y_*), using
# TEST_SIZE as the held-out fraction.
X_train, X_test, y_train, y_test = get_train_test_split(dataset, TEST_SIZE)

In [7]:
# Show the training texts next to their labels as a single table.
pd.DataFrame(data=dict(text=X_train, label=y_train))

Unnamed: 0,text,label
0,Matthew Chang [USERNAME] Remind ko lang di ba ...,1
1,Yay! The interview served its purpose wellJess...,0
2,I say DASURV,0
3,TayNew said Let Leni Lead,0
4,Gloc 9 is not endorsing Jejomar Binay as his p...,0
...,...,...
22764,Nov. 11: on [USERNAME] saw tv ads of Jojo Bina...,1
22765,Mar Roxas your call for unity describes one th...,1
22766,Buti nalang nagdecide nakong hindi manood ng T...,0
22767,sang boto para sa pagbabago. Let Leni Lead phi...,0


In [8]:
# Number of training rows per label, smallest count first.
y_train.value_counts(ascending=True)

label
0    11292
1    11477
Name: count, dtype: int64

In [9]:
# Show the test texts next to their labels as a single table.
pd.DataFrame(data=dict(text=X_test, label=y_test))

Unnamed: 0,text,label
0,Hindi susuportahan ng theatre and literary est...,0
1,BABAE LABAN SA FAKE AT FRAUDBFFSUMBONGDAYA DES...,1
2,Im proud to be a Filipino and a kakampink like...,0
3,Grabe noThe hypocrisy of the church to preach ...,1
4,BBMSARAUniteam Ph Arena BBMSARA,0
...,...,...
5687,[USERNAME] Rizalito David is a good man you ca...,0
5688,A very famous religious cult in the Philippine...,1
5689,Tama sir VP Leni Di dapat iboto SI BBM Kase No...,0
5690,RT [USERNAME]: Mar Roxas forever arrogantI can...,1


In [10]:
# Number of test rows per label, smallest count first.
y_test.value_counts(ascending=True)

label
0    2823
1    2869
Name: count, dtype: int64

## Saving Data

In [None]:
# Write the training split to disk only when saving is switched on.
if SAVE_DATA:
    pd.DataFrame(
        data={'text': X_train, 'label': y_train},
    ).to_csv(path_or_buf=SAVE_TRAIN_DATASET_FILENAME, index=False)

In [None]:
# Write the test split to disk only when saving is switched on.
if SAVE_DATA:
    pd.DataFrame(
        data={'text': X_test, 'label': y_test},
    ).to_csv(path_or_buf=SAVE_TEST_DATASET_FILENAME, index=False)