In [None]:
!pip install langdetect

In [None]:
!pip install Sastrawi

## Importing Packages

Import the libraries used for text preprocessing in this notebook: pandas for data manipulation, numpy for numerical operations, matplotlib/seaborn for visualization, langdetect for language detection, Sastrawi for Indonesian stemming and stopword lists, and the standard `re` and `string` modules for text cleaning operations.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

import warnings
warnings.filterwarnings('ignore')

## Loading Data

In this section, the review dataset is loaded from a local CSV file into a pandas DataFrame and given a first visual inspection.


In [None]:
# Read the raw review data; the path is relative to the notebook's working directory.
dataframe = pd.read_csv('traveloka_assessment.csv')

In [None]:
dataframe.head(10)

In [None]:
dataframe['reviewCreatedVersion'].drop_duplicates()

## Data Preprocessing

Conduct initial data exploration and preparation steps including checking data types, identifying missing values, examining data distribution, and understanding the overall structure of the dataset. This foundational analysis guides the subsequent preprocessing strategy.

In [None]:
dataframe.info()

In [None]:
dataframe.dtypes

In [None]:
# Dataset dimensions as (rows, columns)
print(dataframe.shape)
print("\n Check Is Null")
# Count of missing values in each column
print(dataframe.isnull().sum())
print("\n Check Duplicated")
# Count of rows that are exact duplicates of an earlier row (all columns compared)
print(dataframe.duplicated().sum())

In [None]:
# Preview only the review text, its star score, and its thumbs-up count
datacheck = ['content', 'score', 'thumbsUpCount']
dataframe[datacheck].head(10)

## Clean Data (Text Normalization)

Perform the first phase of data cleaning focusing on basic text normalization. This includes handling missing values, removing duplicates, standardizing text encoding, and addressing any structural inconsistencies in the dataset that could affect downstream processing.

In [None]:
# Detect the language of each review
def detect_language(text):
    """Return the language code that langdetect assigns to ``text``.

    Parameters
    ----------
    text : object
        The review text. Anything that is not a non-blank string (for
        example ``None`` or a float NaN coming from a missing review) is
        reported as ``"unknown"`` without calling langdetect.

    Returns
    -------
    str
        The code returned by ``langdetect.detect`` (e.g. ``"id"``), or
        ``"unknown"`` when the text is missing, blank, or langdetect cannot
        extract any features from it.
    """
    # langdetect only accepts str; a NaN/None value would raise TypeError,
    # which the LangDetectException handler below does not catch.
    if not isinstance(text, str) or not text.strip():
        return "unknown"

    # NOTE: langdetect is non-deterministic on short or ambiguous text unless
    # `DetectorFactory.seed` is set before the first detection.
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"

In [None]:
# Add a 'language' column holding the detect_language result for each review's 'content'
dataframe['language'] = dataframe['content'].apply(detect_language)

In [None]:
dataframe['language'].value_counts()

In [None]:
# Keep only the rows whose detected language is 'id' and renumber the index from 0
dataframe = dataframe[dataframe['language'] == 'id'].reset_index(drop=True)

In [None]:
# Keep only the columns needed for sentiment labeling. Selecting the columns
# first and copying afterwards gives `df` its own data, so columns added to
# `df` later neither raise SettingWithCopyWarning nor affect `dataframe`.
df = dataframe[['content', 'score']].copy()
df.head(10)

In [None]:
# Map a numeric review score to a sentiment label
def label_score(score):
    """Translate a review score into a sentiment class.

    A score of exactly 3 is 'neutral', a score of 4 or more is 'positive',
    and every other value is 'negative'.
    """
    if score == 3:
        return 'neutral'
    return 'positive' if score >= 4 else 'negative'

In [None]:
# Derive the sentiment label from each score, then show how many rows fall in each class
df['sentiment'] = df['score'].apply(label_score)
df['sentiment'].value_counts()

## Cleaning Data (basic version + Stopwords + Stemming)

Implement comprehensive text cleaning pipeline that includes:

- <b>Basic Cleaning:</b> Remove special characters, URLs, email addresses, and unwanted symbols
- <b>Stopwords Removal:</b> Eliminate common words that don't contribute to sentiment or meaning
- <b>Stemming:</b> Reduce words to their root form using the Sastrawi stemmer for Indonesian to normalize word variations and reduce dimensionality

Key Features:

- <b>Case normalization </b> (converting to lowercase)
- <b>Punctuation removal and handling</b>
- <b>Number and digit processing</b>
- <b>HTML tag removal if present</b>
- <b>Whitespace normalization</b>
- <b>Language-specific stopword filtering</b>
- <b>Root word extraction through stemming algorithms</b>

In [None]:
# initialize stemmer and stopword remover
# Sastrawi stemmer instance, reused for every token in clean_text
stemmer = StemmerFactory().create_stemmer()
stop_factory = StopWordRemoverFactory()
# Only the factory's stopword list is used; it is stored as a set for
# `word not in stopwords` membership tests.
stopwords = set(stop_factory.get_stop_words())

In [None]:
# Loader for the slang -> formal word mapping
def load_slang_dict(file_path='slangs.txt'):
    """Build a slang-to-formal dictionary from a ``slang: formal`` text file.

    Each line is split on its first colon; both sides are stripped and
    lowercased. Lines without a colon are ignored. A summary (or a warning
    when the file is missing / an error message on any other failure) is
    printed, and whatever entries were collected so far are returned.
    """
    slang_dict = {}
    try:
        with open(file_path, 'r', encoding='utf-8', errors='replace') as handle:
            for raw_line in handle:
                # partition() yields an empty separator when no colon is present
                left, separator, right = raw_line.strip().partition(':')
                if not separator:
                    continue
                slang_dict[left.strip().lower()] = right.strip().lower()
        print(f"Loaded {len(slang_dict)} slang words from {file_path}")
    except FileNotFoundError:
        print(f"Warning: {file_path} not found. Slang normalization will be skipped.")
    except Exception as e:
        print(f"Error loading slang dictionary: {e}")
    return slang_dict


In [None]:
# Load slang dictionary
# An empty dict is returned if the file is missing, in which case
# normalize_slang leaves the text unchanged.
slang_dict = load_slang_dict('slangs.txt')

In [None]:
def normalize_slang(text, slang_dict):
    """Swap slang tokens in ``text`` for their formal equivalents.

    The text is split on whitespace and each token is looked up
    (case-insensitively) in ``slang_dict``. Matching tokens are replaced by
    the mapped value, or dropped when that value is blank; other tokens are
    kept as they are. With an empty dictionary the text is returned untouched.
    """
    if not slang_dict:
        return text

    result = []
    for token in text.split():
        key = token.lower()
        if key not in slang_dict:
            # Not slang: keep the token exactly as written
            result.append(token)
            continue
        replacement = slang_dict[key]
        # A blank replacement means the slang token is removed entirely
        if replacement.strip():
            result.append(replacement)

    return ' '.join(result)

In [None]:
def clean_text(text):
    """
    Comprehensive text cleaning with slang normalization.

    Steps, in order: strip mentions/hashtags/retweet markers, URLs and digits;
    lowercase; replace slang via ``normalize_slang`` and the module-level
    ``slang_dict``; turn punctuation into spaces; drop remaining non a-z
    characters; collapse whitespace; remove stopwords and words of three
    characters or fewer; stem each remaining word with the module-level
    ``stemmer``.

    Parameters
    ----------
    text : object
        Raw review text. Non-string values (e.g. NaN/None) yield ``''``.

    Returns
    -------
    str
        Space-joined stemmed tokens; may be an empty string when every token
        is filtered out.
    """
    # Missing reviews arrive as NaN/None; re.sub would raise TypeError on them
    if not isinstance(text, str):
        return ''

    # Remove mentions, hashtags, RT. The \b keeps "RT" from matching the tail
    # of an uppercase word (e.g. "SMART phone" must not become "SMAphone").
    text = re.sub(r'@[A-Za-z0-9_]+|#[A-Za-z0-9_]+|\bRT\s+', '', text)

    # Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Remove numbers
    text = re.sub(r'[0-9]+', '', text)

    # Lowercase
    text = text.lower()

    # Normalize slang words BEFORE removing punctuation
    # NOTE(review): normalize_slang splits on whitespace, so a slang word with
    # punctuation attached (e.g. "gk,") is not matched at this point.
    text = normalize_slang(text, slang_dict)

    # Replace punctuation with a space so words joined only by punctuation
    # ("bagus,cepat") stay separate tokens; apostrophes are simply deleted so
    # words such as "jum'at" are not split apart.
    spaced_punct = string.punctuation.replace("'", "")
    text = text.translate(str.maketrans(spaced_punct, ' ' * len(spaced_punct), "'"))

    # Remove non-alphabetic characters, keep only letters and spaces
    text = re.sub(r'[^a-z\s]', '', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize and remove stopwords + short words (length <= 3)
    tokens = [word for word in text.split()
              if word not in stopwords and len(word) > 3]

    # Stemming
    stemmed_tokens = [stemmer.stem(word) for word in tokens]

    return ' '.join(stemmed_tokens)

In [None]:
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Run clean_text on every review; progress_apply (registered by tqdm.pandas())
# behaves like apply but shows a progress bar.
df['clean_content'] = df['content'].progress_apply(clean_text)

In [None]:
df['clean_content'].head(10)

## Save Clean Data

In [None]:
# Write the labeled, cleaned reviews to CSV without the index column.
# Rows whose clean_content is an empty string are written as empty fields.
df.to_csv('traveloka_clean.csv', index=False)

In [None]:
# Reload the saved file. With pandas' default NA parsing, an empty
# clean_content (and literal tokens such as "null", "nan" or "none") would
# come back as NaN floats instead of strings. Default NA parsing is therefore
# switched off, and only an empty 'score' field is treated as missing.
cleandf = pd.read_csv(
    'traveloka_clean.csv',
    keep_default_na=False,
    na_values={'score': ['']},
)

In [None]:
cleandf.head()