# Using Assignment 1 Code to Train Model

# Importing Required Libraries

In [2]:
import pandas as pd
from typing import Tuple
from sklearn.model_selection import train_test_split
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK stopwords and punkt tokenizer data (if not already downloaded)

In [3]:
import nltk
# "stopwords" supplies the list read by stopwords.words("english") in
# preprocess_text; "punkt" is presumably the tokenizer data word_tokenize
# needs — TODO confirm.
# NOTE(review): newer NLTK releases may look for "punkt_tab" instead of
# "punkt" — confirm against the installed version.
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /home/gg8576/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/gg8576/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Defining Necessary Functions

In [4]:
# Function to load data from a given file path
def load_data(file_path: str) -> pd.DataFrame:
    """Read the CSV at *file_path* into a DataFrame using pandas' defaults.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The parsed contents as a DataFrame.
    """
    return pd.read_csv(file_path)

# Function to preprocess the text data
# Stopword sets keyed by language, filled on first use so the list is not
# rebuilt for every document passed through preprocess_text.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language: str = "english") -> frozenset:
    """Return the NLTK stopword set for *language*, loading it at most once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text: str) -> str:
    """Normalise a single document before modelling.

    The text is stripped and lowercased, every character that is not an ASCII
    letter, a digit or whitespace is removed, English stopwords are dropped,
    and the remaining tokens are joined with single spaces.

    Args:
        text: Raw document text.

    Returns:
        The cleaned text; an empty string if no token survives.
    """
    text = text.strip().lower()

    # Keep letters, digits and whitespace only. The character class used to
    # read "A_Z" (underscore instead of hyphen), which let "_" through.
    text = re.sub(r"[^a-zA-Z\d\s]", "", text)

    # Drop English stopwords from the tokenised text.
    stop_words = _get_stop_words()
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]

    # Re-join the surviving tokens with single spaces.
    return " ".join(kept_tokens)

# Function to preprocess the data
def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw frame and return a new, de-duplicated frame.

    Rows containing a missing value in any column are dropped, the "text"
    column is passed through preprocess_text, and rows whose cleaned text
    repeats an earlier row are removed (pandas keeps the first occurrence by
    default). The input frame itself is not modified.

    Args:
        data: Frame with a "text" column of strings.

    Returns:
        A new frame with cleaned, unique "text" values.

    Raises:
        KeyError: If *data* has no "text" column.
    """
    cleaned = data.dropna()

    # Build the result with assign() and a non-inplace drop_duplicates()
    # rather than writing into the frame returned by dropna(); assigning a
    # column on that derived frame can trigger pandas' SettingWithCopyWarning.
    cleaned = cleaned.assign(text=cleaned["text"].apply(preprocess_text))

    return cleaned.drop_duplicates("text")

# Function to split the data into train/validation/test sets
def split_data(
    data: pd.DataFrame,
    *,
    validation_size: float = 0.1,
    test_size: float = 0.1,
    random_state: int = 42,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split *data* into train, validation and test frames.

    With the defaults this is an 80% / 10% / 10% split seeded with 42.

    Args:
        data: Frame to partition.
        validation_size: Fraction of *data* to place in the validation set.
        test_size: Fraction of *data* to place in the test set.
        random_state: Seed passed to both train_test_split calls.

    Returns:
        A ``(train, validation, test)`` tuple of frames.

    Raises:
        ValueError: If either fraction is not positive, or if together they
            leave nothing for the training set.
    """
    holdout_size = validation_size + test_size
    if validation_size <= 0 or test_size <= 0 or holdout_size >= 1:
        raise ValueError(
            "validation_size and test_size must be positive fractions "
            "that sum to less than 1"
        )

    # First carve off the combined validation+test portion, then divide that
    # portion between the two according to their relative sizes.
    train, holdout = train_test_split(
        data, test_size=holdout_size, random_state=random_state
    )
    validation, test = train_test_split(
        holdout, test_size=test_size / holdout_size, random_state=random_state
    )

    return train, validation, test

# Function to store the splits at train.csv, validation.csv, and test.csv
def store_splits(train: pd.DataFrame, validation: pd.DataFrame, test: pd.DataFrame) -> None:
    """Write each split to its own CSV file in the current working directory.

    The files are named train.csv, validation.csv and test.csv, and the frame
    index is not written.

    Args:
        train: Training split.
        validation: Validation split.
        test: Test split.
    """
    outputs = (
        ("train.csv", train),
        ("validation.csv", validation),
        ("test.csv", test),
    )
    for filename, frame in outputs:
        frame.to_csv(filename, index = False)

# Loading, Preprocessing, Splitting and Storing the Data

In [5]:
# Read the CSV named by file_path and clean it in one step.
file_path = "emails.csv"
data = preprocess_data(load_data(file_path))

# Partition the cleaned data and write each partition to its CSV file.
train, validation, test = split_data(data)
store_splits(train, validation, test)