## **Data Ingestion:**



In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Root folder for the pipeline's data; the directory helpers and CSV writers
# in this file build their paths relative to it.
BASE_FOLDER = "data"

def make_dir():
    """Ensure the raw-data folder tree exists under BASE_FOLDER.

    Folders that are already present are left untouched.
    """
    for relative in ("raw", "raw/train", "raw/test"):
        target = os.path.join(BASE_FOLDER, relative)
        os.makedirs(target, exist_ok=True)

def load_data(url):
    """Read the CSV located at ``url`` and return it as a DataFrame.

    ``url`` is handed to ``pandas.read_csv`` unchanged.
    """
    frame = pd.read_csv(url)
    return frame

def processing(df):
    """Build the binary happiness/sadness dataset from the raw frame.

    Keeps only rows whose ``sentiment`` is "happiness" or "sadness", drops
    the ``tweet_id`` column when it is present, encodes the label as
    1 (happiness) / 0 (sadness) and returns the rows in a reproducibly
    shuffled order (``random_state=42``).
    """
    label_codes = {"happiness": 1, "sadness": 0}

    keep_mask = df["sentiment"].isin(["happiness", "sadness"])
    # errors="ignore" keeps this working when the frame has no tweet_id column.
    selected = df[keep_mask].drop(columns=["tweet_id"], errors="ignore")
    encoded = selected.assign(sentiment=selected["sentiment"].map(label_codes))

    # frac=1 draws every row exactly once, i.e. a full shuffle.
    return encoded.sample(frac=1, random_state=42)

def save_data(final_df):
    """Write an 80/20 train/test split of ``final_df`` to CSV.

    The split is seeded (``random_state=42``). The two parts are saved as
    ``raw/train/train.csv`` and ``raw/test/test.csv`` under BASE_FOLDER,
    without the index column.
    """
    train_path = os.path.join(BASE_FOLDER, "raw/train", "train.csv")
    test_path = os.path.join(BASE_FOLDER, "raw/test", "test.csv")

    parts = train_test_split(final_df, test_size=0.2, random_state=42)
    # train_test_split returns (train, test), matching the path order below.
    for part, destination in zip(parts, (train_path, test_path)):
        part.to_csv(destination, index=False)

def main():
    """Run ingestion: create folders, load the CSV, preprocess, split and save."""
    source_url = "https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv"

    make_dir()
    raw_df = load_data(source_url)
    save_data(processing(raw_df))


# Run the pipeline only when executed directly, not when imported.
if __name__ == "__main__":
    main()


## **Data Cleaning:**

In [2]:
import re
# NLTK for natural language processing
import nltk
from nltk.corpus import stopwords    # For stopwords
from nltk.tokenize import word_tokenize # For tokenization
from nltk.stem import PorterStemmer, WordNetLemmatizer # For stemming and lemmatization

# Fetch the NLTK resources: the stopword lists, the punkt tokenizer data
# and the WordNet data used for lemmatization.
for resource in ("stopwords", "punkt", "wordnet"):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [8]:


def data_cleaning(text_series):
    """Lowercase the text and strip URLs, e-mail addresses, digits and punctuation.

    The steps run in this order: lowercase, URLs, e-mail addresses, digit
    runs, punctuation (any character that is neither a word character nor
    whitespace), then surrounding whitespace is trimmed and internal runs
    of whitespace are collapsed to a single space. Every removed span is
    replaced by a space so neighbouring words are never glued together.

    Args:
        text_series: pandas Series holding the raw text. Missing values are
            treated as empty strings rather than being converted to the
            literal token "nan".

    Returns:
        A pandas Series of cleaned strings with the same index.
    """
    # A plain digit-run pattern removes every number. The earlier look-around
    # version required a non-digit neighbour, so an entry made up only of
    # digits (e.g. "2020") was left untouched.
    number_pattern = r"\d+"
    url_pattern = r"https?://\S+|www\.\S+"
    email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    punctuation_pattern = r"[^\w\s]"

    return (
        text_series.fillna("")  # keep NaN/None from becoming the word "nan"
        .astype(str)  # Ensure text is string
        .str.lower()
        # URLs and e-mails go first: they are recognised by their punctuation,
        # which the later steps would otherwise strip.
        .str.replace(url_pattern, " ", regex=True)
        .str.replace(email_pattern, " ", regex=True)
        .str.replace(number_pattern, " ", regex=True)
        .str.replace(punctuation_pattern, " ", regex=True)
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)  # Normalize spaces
    )

def remove_short_words(text_series, min_length=3):
    """Keep only the whitespace-separated words of at least `min_length` characters.

    The surviving words of each entry are re-joined with single spaces.
    """

    def _keep_long(text):
        kept = filter(lambda token: len(token) >= min_length, text.split())
        return " ".join(kept)

    return text_series.apply(_keep_long)

def lemmatization(text_series):
    """Replace each word with its WordNetLemmatizer lemma.

    Every word is lemmatized with ``pos="v"``, i.e. treated as a verb.
    """
    lemmatize = WordNetLemmatizer().lemmatize

    def _lemmatize_text(text):
        return " ".join(lemmatize(token, pos="v") for token in text.split())

    return text_series.apply(_lemmatize_text)

def remove_stopwords(text_series):
    """Drop the NLTK English stopwords from every entry of the series."""
    # A frozenset gives constant-time membership checks per word.
    english_stopwords = frozenset(stopwords.words("english"))

    def _strip_stopwords(text):
        kept = [token for token in text.split() if token not in english_stopwords]
        return " ".join(kept)

    return text_series.apply(_strip_stopwords)

def normalize(df):
    """Run the text-preprocessing steps on the ``content`` column.

    The column is overwritten on ``df`` itself after each step, and that
    same frame is returned.
    """
    pipeline = (data_cleaning, remove_short_words, lemmatization, remove_stopwords)
    for step in pipeline:
        df["content"] = step(df["content"])
    return df

def main():
    """Clean the raw train/test CSVs and save them under ``<BASE_FOLDER>/interim``.

    Reads ``raw/train/train.csv`` and ``raw/test/test.csv`` from BASE_FOLDER,
    runs ``normalize`` on each, and writes ``train_processed.csv`` and
    ``test_processed.csv`` (without the index column) into the ``interim``
    sub-folder, creating it when needed.
    """
    # Read from the same BASE_FOLDER-relative location the ingestion step
    # writes to, instead of a hard-coded absolute "/content/..." path that
    # only resolves when the working directory happens to be /content.
    train_data = pd.read_csv(os.path.join(BASE_FOLDER, "raw/train", "train.csv"))
    test_data = pd.read_csv(os.path.join(BASE_FOLDER, "raw/test", "test.csv"))

    # Transform the data
    train_processed_data = normalize(train_data)
    test_processed_data = normalize(test_data)

    # Store the data inside <BASE_FOLDER>/interim
    data_path = os.path.join(BASE_FOLDER, "interim")
    os.makedirs(data_path, exist_ok=True)

    train_processed_data.to_csv(os.path.join(data_path, "train_processed.csv"), index=False)
    test_processed_data.to_csv(os.path.join(data_path, "test_processed.csv"), index=False)


if __name__ == "__main__":
    main()
