In [19]:
import pandas as pd
import json
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import nltk
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HAFIZI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HAFIZI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HAFIZI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:

# Ensure NLTK resources are downloaded
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    print("NLTK stopwords not found. Downloading...")
    nltk.download('stopwords')
    print("NLTK stopwords downloaded.")
except Exception as e:
    print(f"An unexpected error occurred while checking/downloading NLTK stopwords: {e}")

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("NLTK 'punkt' tokenizer not found. Downloading...")
    nltk.download('punkt')
    print("NLTK 'punkt' tokenizer downloaded.")
except Exception as e:
    print(f"An unexpected error occurred while checking/downloading NLTK 'punkt' tokenizer: {e}")

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    print("NLTK 'wordnet' corpus not found. Downloading...")
    nltk.download('wordnet')
    print("NLTK 'wordnet' corpus downloaded.")
except Exception as e:
    print(f"An unexpected error occurred while checking/downloading NLTK 'wordnet' corpus: {e}")

# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# --- Text Cleaning Function (Updated with Tokenization and Lemmatization) ---
def clean_text(text):
    if not isinstance(text, str):
        return ""
    
    # 1. Convert to lowercase
    text = text.lower()
    
    # 2. Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    
    # 3. Remove mentions (@usernames)
    text = re.sub(r'@\w+', '', text)
    
    # 4. Remove hashtag symbols but keep the word (e.g., #awesome -> awesome)
    text = re.sub(r'#(\w+)', r'\1', text)
    
    # 5. Remove special characters and numbers, keeping only letters and spaces
    # This also effectively handles punctuation removal
    text = re.sub(r'[^a-z\s]', '', text)
    
    # 6. Consolidate multiple spaces to single spaces and trim leading/trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # 7. Tokenization
    words = word_tokenize(text)
    
    # 8. Remove English stop words and Lemmatize
    stop_words = set(stopwords.words('english'))
    cleaned_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return ' '.join(cleaned_words)

NLTK 'wordnet' corpus not found. Downloading...
NLTK 'wordnet' corpus downloaded.


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HAFIZI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:

# --- Sentiment Labeling Function for Reddit Data (keyword-based) ---
positive_words = ['good', 'great', 'excellent', 'happy', 'love', 'best', 'amazing', 'nice', 'wonderful', 'awesome']
negative_words = ['bad', 'terrible', 'worst', 'sad', 'hate', 'awful', 'horrible', 'poor', 'angry', 'disappointed']

def label_sentiment_keyword_based(text):
    if not isinstance(text, str):
        return 2 # Neutral or undefined for non-string input

    positive_count = sum(1 for word in positive_words if word in text)
    negative_count = sum(1 for word in negative_words if word in text)

    if positive_count > negative_count:
        return 0  # Positive
    elif negative_count > positive_count:
        return 1  # Negative
    else:
        return 2  # Neutral


In [23]:

# --- Main Processing Logic ---
def process_data_and_combine(sentiment140_file_path, reddit_jsonl_files):
    # --- 1. Process Sentiment140 Data ---
    print("--- Loading and Initial Processing of Sentiment140 Data ---")
    sentiment140_column_names = ["polarity_raw", "tweet_id", "date", "query", "user", "tweet_text"]
    try:
        df_s140 = pd.read_csv(sentiment140_file_path, header=None, names=sentiment140_column_names, encoding='ISO-8859-1')
        print(f"Sentiment140 data loaded successfully from {sentiment140_file_path}")
    except FileNotFoundError:
        print(f"Error: Sentiment140.csv not found at {sentiment140_file_path}. Please ensure the path is correct.")
        df_s140 = pd.DataFrame(columns=['content', 'sentiment', 'source'])
    except Exception as e:
        print(f"Error loading Sentiment140 data: {e}")
        df_s140 = pd.DataFrame(columns=['content', 'sentiment', 'source'])

    if not df_s140.empty:
        df_s140 = df_s140.rename(columns={"tweet_text": "content"})
        # Map polarity_raw: 0 (negative) -> 1, 2 (neutral) -> 2, 4 (positive) -> 0
        df_s140['sentiment'] = df_s140['polarity_raw'].map({0: 1, 2: 2, 4: 0})
        df_s140 = df_s140.dropna(subset=['sentiment']).copy()
        df_s140['sentiment'] = df_s140['sentiment'].astype(int)
        df_s140['source'] = 'sentiment140'
        df_s140 = df_s140[['content', 'sentiment', 'source']]
        print(f"Sentiment140 data initial rows: {len(df_s140)}")
    else:
        print("Sentiment140 DataFrame is empty or failed to load.")

    # --- 2. Process Reddit Data ---
    print("\n--- Loading and Initial Processing of Reddit Data ---")
    all_reddit_data = []
    for file_path in reddit_jsonl_files:
        if not os.path.exists(file_path):
            print(f"Warning: Reddit file not found at {file_path}. Skipping.")
            continue
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    all_reddit_data.append(json.loads(line))
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue
    
    df_reddit = pd.DataFrame(all_reddit_data)

    if 'content' not in df_reddit.columns:
        print("Error: Reddit JSONL files do not contain a 'content' field. Reddit data will be empty.")
        df_reddit = pd.DataFrame(columns=['content', 'sentiment', 'source'])
    else:
        df_reddit = df_reddit[df_reddit['content'].notna() & (df_reddit['content'].str.strip() != '')].copy()
        df_reddit['sentiment'] = pd.NA # Placeholder for Reddit sentiment, will be filled after cleaning
        df_reddit['source'] = 'reddit'
        df_reddit = df_reddit[['content', 'sentiment', 'source']]
        print(f"Reddit data initial rows: {len(df_reddit)}")


    # --- 3. Combine DataFrames (text and initial labels) ---
    print("\n--- Combining DataFrames ---")
    if not df_s140.empty and not df_reddit.empty:
        combined_df = pd.concat([df_s140, df_reddit], ignore_index=True)
    elif not df_s140.empty:
        combined_df = df_s140
    elif not df_reddit.empty:
        combined_df = df_reddit
    else:
        combined_df = pd.DataFrame(columns=['content', 'sentiment', 'source'])
        print("No data was loaded from either source to combine.")
    
    if combined_df.empty:
        print("Combined DataFrame is empty. Exiting processing.")
        return combined_df # Return empty if nothing to process

    print(f"Total rows in combined DataFrame before cleaning: {len(combined_df)}")

    # --- 4. Apply Cleaning to Combined Data ---
    print("\n--- Applying Text Cleaning to Combined Data ---")
    combined_df['content'] = combined_df['content'].astype(str)
    combined_df['clean_comment'] = combined_df['content'].apply(clean_text)
    
    # Filter out rows where 'clean_comment' became empty after cleaning
    initial_cleaned_rows = len(combined_df)
    combined_df = combined_df[combined_df['clean_comment'].str.strip() != ''].copy()
    print(f"Rows after text cleaning: {len(combined_df)} (dropped {initial_cleaned_rows - len(combined_df)} empty clean comments)")

    # --- 5. Apply Sentiment Labeling for Reddit Rows (on cleaned text) ---
    print("\n--- Applying Sentiment Labeling for Reddit Data based on cleaned text ---")
    reddit_mask = (combined_df['source'] == 'reddit')
    if reddit_mask.any():
        combined_df.loc[reddit_mask, 'sentiment'] = combined_df.loc[reddit_mask, 'clean_comment'].apply(label_sentiment_keyword_based)
        combined_df['sentiment'] = combined_df['sentiment'].astype(int)
        print(f"Sentiment applied to {reddit_mask.sum()} Reddit rows.")
    else:
        print("No Reddit data found to label sentiment.")
        
    return combined_df[['content', 'clean_comment', 'sentiment']]

# --- Define raw data file paths using relative paths ---
current_script_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in locals() else os.getcwd()
project_root_dir = os.path.dirname(current_script_dir) 

raw_data_dir = os.path.join(project_root_dir, "data", "raw_data")
output_data_dir = os.path.join(project_root_dir, "data")

sentiment140_file_path = os.path.join(raw_data_dir, "Sentiment140.csv")
reddit_jsonl_files = [
    os.path.join(raw_data_dir, "reddit_malaysiauni_20250626_020648.jsonl"),
    os.path.join(raw_data_dir, "reddit_malaysian_20250626_014418.jsonl"),
    os.path.join(raw_data_dir, "reddit_malaysianfood_20250621_010756.jsonl"),
    os.path.join(raw_data_dir, "reddit_malaysian_20250626_013330.jsonl")
]

output_cleaned_data_path = os.path.join(output_data_dir, "cleaned_data.csv")

print(f"Expected raw data directory: {raw_data_dir}")
print(f"Expected output directory: {output_data_dir}")

# Process the data
final_cleaned_df = process_data_and_combine(sentiment140_file_path, reddit_jsonl_files)

# Save the final combined DataFrame to a CSV file
if not final_cleaned_df.empty:
    try:
        final_cleaned_df.to_csv(output_cleaned_data_path, index=False, encoding='utf-8')
        print(f"\nFinal combined cleaned and labeled data saved to {output_cleaned_data_path}")
        print(f"Final number of rows in cleaned_data.csv: {len(final_cleaned_df)}")
    except Exception as e:
        print(f"Error saving final cleaned data: {e}")
else:
    print("\nNo data to save as the final DataFrame is empty.")

print("\nProcessing complete.")

Expected raw data directory: c:\Users\HAFIZI\Documents\HPDP\reddit_sentiment_workflow\data\raw_data
Expected output directory: c:\Users\HAFIZI\Documents\HPDP\reddit_sentiment_workflow\data
--- Loading and Initial Processing of Sentiment140 Data ---
Sentiment140 data loaded successfully from c:\Users\HAFIZI\Documents\HPDP\reddit_sentiment_workflow\data\raw_data\Sentiment140.csv
Sentiment140 data initial rows: 1600000

--- Loading and Initial Processing of Reddit Data ---
Reddit data initial rows: 13614

--- Combining DataFrames ---
Total rows in combined DataFrame before cleaning: 1613614

--- Applying Text Cleaning to Combined Data ---
Rows after text cleaning: 1605730 (dropped 7884 empty clean comments)

--- Applying Sentiment Labeling for Reddit Data based on cleaned text ---
Sentiment applied to 13444 Reddit rows.

Final combined cleaned and labeled data saved to c:\Users\HAFIZI\Documents\HPDP\reddit_sentiment_workflow\data\cleaned_data.csv
Final number of rows in cleaned_data.csv: 