# Text Cleaning & Preprocessing

This notebook cleans raw LinkedIn text data
to prepare it for sentiment analysis.


In [2]:
# Data manipulation library
import pandas as pd

# Regular expressions for text cleaning
import re



In [6]:
import os

# Path to raw data file
file_path = "../data/raw/youtube_raw.csv"

try:
    # Load raw CSV data
    df = pd.read_csv(file_path)
    print(f"Loaded {len(df)} rows from {file_path}")

    # Preview dataset
    df.head()

except FileNotFoundError:
    # Handle missing file error
    print(f"File not found: {file_path}. Please check the path.")

except Exception as e:
    # Handle any other unexpected errors
    print(f"Error loading file: {e}")



Loaded 74 rows from ../data/raw/youtube_raw.csv


In [8]:
# Load NLP tools
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Prepare stopwords and lemmatizer
stop_words_en = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """
    Clean and normalize social media text for NLP tasks
    """
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)

    # Remove mentions and hashtags
    text = re.sub(r"@\w+|#\w+", "", text)

    # Keep only letters (English + Persian)
    text = re.sub(r"[^a-zA-Zآ-ی\s]", "", text)

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # Remove stopwords and apply lemmatization
    words = [
        lemmatizer.lemmatize(w)
        for w in text.split()
        if w not in stop_words_en
    ]

    return " ".join(words)


In [11]:

if "content" in df.columns:
    df["cleaned_content"] = df["content"].apply(clean_text)
    print(f"Applied text cleaning on {len(df)} rows.")
    df.head()
else:
    print("Column 'content' not found in DataFrame.")


Applied text cleaning on 74 rows.


In [12]:
df.to_csv("../data/processed/youtube.csv", index=False)


In [10]:
df

Unnamed: 0,content,date,cleaned_content
0,Be careful with how many treats the cat gets.....,2025-12-24,careful many treat cat get right due treat goi...
1,I’m hoping they bring back the plus. Im using ...,2025-12-24,im hoping bring back plus im using plus upgrad...
2,"Got my I Phone 17 2 days ago, my battery exper...",2025-12-24,got phone day ago battery experience bad got t...
3,Watching on my 17.,2025-12-24,watching
4,"Adding 12 GB of RAM, studio-quality mics, and ...",2025-12-24,adding gb ram studioquality mics usb would mak...
...,...,...,...
69,I agree that the iPhone 13 lacks several featu...,2025-12-24,agree iphone lack several feature understandab...
70,"How the heck are 3 months ""long term""?",2025-12-24,heck month long term
71,People with ADHD would notice and love it,2025-12-24,people adhd would notice love
72,"interesting comparison, thanks. just a small t...",2025-12-24,interesting comparison thanks small thing pay ...
