# Course Reviews Text Preprocessing Pipeline

This notebook performs comprehensive text preprocessing on course reviews including:
- Language detection (English only)
- Lowercasing
- Punctuation removal
- Stopword removal
- Non-English word filtering


## 1. Setup and Imports

In [47]:
import pandas as pd
import string
import nltk
from collections import Counter
from langdetect import detect, LangDetectException
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, words

# Download required NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\M.S.I\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\M.S.I\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\M.S.I\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

## 2. Data Loading

In [48]:
def load_data(filepath):
    """Load and prepare the raw dataset"""
    df = pd.read_csv(filepath)
    
    # Validate required columns
    if 'reviews' not in df.columns:
        raise ValueError("Column 'reviews' not found in dataset")
    
    # Select only reviews column and drop NA
    return df[['reviews']].dropna()

## 3. Preprocessing Function

In [49]:
def preprocess_text(text, english_vocab, stop_words, spanish_words):
    """Clean and process individual text"""
    # Lowercase and remove punctuation/numbers
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    text = ''.join([i for i in text if not i.isdigit()])
    
    # Tokenize and filter
    tokens = word_tokenize(text)
    return ' '.join([
        word for word in tokens 
        if (word not in stop_words and 
            word not in spanish_words and
            word in english_vocab)
    ])

## 4. Main Processing Pipeline

In [54]:
# Load data
df = load_data('../datasets/course_reviews.csv')

# Before processing stats
print("=== BEFORE PROCESSING ===")
print(f"Total reviews: {len(df)}")
print(f"Total words: {df['reviews'].apply(lambda x: len(x.split())).sum()}")
print(f"Unique words: {len(Counter(' '.join(df['reviews']).split()))}")

# Initialize filters
stop_words = set(stopwords.words('english'))
spanish_words = {"de", "curso", "que", "la", "el", "en", "para", "muy"}
english_vocab = set(words.words())

# Process reviews
processed_reviews = []
for review in df['reviews']:
    try:
        if detect(review) == 'en':  # English only
            cleaned = preprocess_text(review, english_vocab, stop_words, spanish_words)
            if len(cleaned.split()) >= 3:  # Minimum 3 words
                processed_reviews.append(cleaned)
    except LangDetectException:
        continue

# Create output DataFrame
df_processed = pd.DataFrame({'processed_reviews': processed_reviews})

# After processing stats
print("\n=== AFTER PROCESSING ===")
print(f"Processed reviews: {len(df_processed)}")
print(f"Total words: {df_processed['processed_reviews'].apply(lambda x: len(x.split())).sum()}")
print(f"Unique words: {len(Counter(' '.join(df_processed['processed_reviews']).split()))}")

=== BEFORE PROCESSING ===
Total reviews: 1454558
Total words: 32575387
Unique words: 283922

=== AFTER PROCESSING ===
Processed reviews: 1017953
Total words: 11848738
Unique words: 17136


## 5. Saving Processed Reviews

In [55]:
output_path = '../datasets/preprocessed_coursera_review.csv'
df_processed.to_csv(output_path, index=False)
print(f"\nSaved ONLY processed reviews to: {output_path}")
print("Sample processed reviews:")
print(df_processed.head(3))


Saved ONLY processed reviews to: ../datasets/preprocessed_coursera_review.csv
Sample processed reviews:
                                   processed_reviews
0  pretty dry able pass two complete happy usual ...
1  would better experience video screen would sho...
2  information perfect program little annoying wa...
