In [12]:
import sys
sys.path.append("../../")

In [14]:
import pandas as pd
from pathlib import Path

In [27]:
# Resolve the data directory relative to the current working directory:
# two levels up, then into "data".
notebook_path = Path.cwd()
base_path = notebook_path.parent.parent.joinpath("data")
data_file_path = base_path.joinpath("sms_spam.tsv")

In [16]:
# Read the tab-separated file; header=None means no row is consumed as a header,
# so the two column names are supplied explicitly.
# NOTE(review): default CSV quoting is in effect; if messages contain unbalanced
# double quotes, rows could be merged. Confirm the row count against the raw file.
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])

In [17]:
# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [18]:
# Count how many rows fall under each label.
df.loc[:, "Label"].value_counts()

Label
ham     4825
spam     747
Name: count, dtype: int64

## [How to handle class imbalance?](https://imbalanced-learn.org/stable/user_guide.html)

In [20]:
# Fix class imbalance by undersampling the dataset
def create_balanced_dataset(df, random_state=42):
    """Return a frame with equally many "ham" and "spam" rows.

    The larger class is randomly undersampled down to the size of the smaller
    one. The smaller class is kept in full and in its original row order.

    Args:
        df: DataFrame with a "Label" column holding the strings "ham" / "spam".
        random_state: Seed passed to ``DataFrame.sample``. The default (42)
            reproduces the rows selected by the previous hard-coded seed.

    Returns:
        A new DataFrame with the ham rows first, followed by the spam rows.
        Original index labels are preserved (the index is not reset).
    """
    ham_rows = df.loc[df["Label"] == "ham"]
    spam_rows = df.loc[df["Label"] == "spam"]
    target_size = min(len(ham_rows), len(spam_rows))

    # Ham is always passed through sample(), exactly as before, so the usual
    # ham-majority case returns the same rows in the same order.
    ham_subset = ham_rows.sample(n=target_size, random_state=random_state)

    # Previously spam was never sampled, so a spam-majority input made
    # ham.sample(n=num_spam) raise ValueError. Undersample spam in that case.
    if len(spam_rows) > target_size:
        spam_subset = spam_rows.sample(n=target_size, random_state=random_state)
    else:
        spam_subset = spam_rows

    return pd.concat([ham_subset, spam_subset], axis=0)

In [23]:
# Build the undersampled frame, then display the per-label counts as the
# cell output.
balanced_df = create_balanced_dataset(df)
balanced_df.loc[:, "Label"].value_counts()

Label
ham     747
spam    747
Name: count, dtype: int64

In [24]:
# Encode labels as integers: "spam" -> 1, everything else -> 0.
# An already-encoded 1 is also accepted, so re-running this cell leaves the
# column unchanged. With a plain `x == "spam"` test, a second run would turn
# every label into 0 because the column no longer holds strings.
balanced_df["Label"] = balanced_df["Label"].map(
    lambda label: 1 if label in ("spam", 1) else 0
)

In [25]:
def create_random_split(df, train_frac, val_frac, random_state=123):
    """Shuffle ``df`` and cut it into train / validation / test partitions.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows for the training split (0..1).
        val_frac: Fraction of rows for the validation split (0..1).
        random_state: Seed for the shuffle. The default (123) reproduces the
            ordering produced by the previous hard-coded seed.

    Returns:
        Tuple ``(train_df, val_df, test_df)``. Split sizes are truncated to
        whole rows, and the test split receives every row that remains after
        train and validation. All three share a fresh 0..n-1 index created
        after shuffling.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to
            more than 1. Such inputs previously produced empty or
            wrong-sized splits without any error.
    """
    # Small tolerance so sums such as 0.9 + 0.1 are not rejected by float noise.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1, "
            f"got train_frac={train_frac}, val_frac={val_frac}"
        )

    # frac=1 samples every row, i.e. a full shuffle; drop the old index so the
    # positional slices below line up with the new row labels.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    n_rows = len(shuffled)
    train_end = int(n_rows * train_frac)
    val_end = train_end + int(n_rows * val_frac)

    train_df = shuffled.iloc[:train_end]
    val_df = shuffled.iloc[train_end:val_end]
    test_df = shuffled.iloc[val_end:]

    return train_df, val_df, test_df

In [26]:
# 70% of rows go to train and 10% to validation.
train_frac, val_frac = 0.7, 0.1
train_df, val_df, test_df = create_random_split(
    df=balanced_df,
    train_frac=train_frac,
    val_frac=val_frac,
)

In [29]:
# Write each split next to the source data as a tab-separated file named
# sms_spam_<split>.tsv. index=False is the documented way to omit the row
# index column from the output.
for split_name, split_df in {"train": train_df, "val": val_df, "test": test_df}.items():
    split_df.to_csv(base_path / f"sms_spam_{split_name}.tsv", sep="\t", index=False)