Data Cleaning 
This notebook cleans the review data by converting the text to lowercase, removing special characters and numbers, and dropping duplicate and incomplete reviews. The cleaned file is then saved to data/intermediate.

In [None]:
import pandas as pd
import os
import sys

# --- Project Directory Setup ---
# Append the absolute path of the parent of the current working directory to
# sys.path so that the `src1` package imported below can be resolved.
# NOTE(review): this assumes the notebook is run from a folder directly under
# the project root that contains `src1` — confirm.
sys.path.append(os.path.abspath(".."))

from src1 import load_json_lines, clean_text

In [None]:
# --- File Paths Setup ---
# Base directory: one level above the current working directory.
# NOTE(review): presumably this is the project root when the notebook is run
# from its own folder — confirm.
base_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Input/output directories, resolved relative to the base directory
input_dir = os.path.join(base_dir, "data", "raw")
output_dir = os.path.join(base_dir, "data", "intermediate")

# Make sure the output directory exists before any cell writes into it:
# DataFrame.to_json does not create missing parent directories, so on a fresh
# checkout (no data/intermediate yet) the first save would otherwise fail.
os.makedirs(output_dir, exist_ok=True)

# Raw input file to clean
input_filename = "reviews_2021-01.json"
input_file = os.path.join(input_dir, input_filename)

# Output files: the cleaned data plus audit files holding the rows that were
# removed for having missing values / duplicated review ids.
cleaned_file = os.path.join(output_dir, f"cleaned_{input_filename}")
na_file = os.path.join(output_dir, "NAs.json")
duplicates_file = os.path.join(output_dir, "Duplicates.json")

In [None]:
# --- Check if input file exists ---
# Fail fast with an explicit message before attempting to read anything.
if not os.path.exists(input_file):
    raise FileNotFoundError(f"Error: File '{input_file}' not found.")

# --- Load Data ---
# Read the raw file with the project's load_json_lines helper and build the
# DataFrame from its result exactly once (the frame was previously constructed
# twice from the same `reviews` object, which only wasted time and memory).
reviews = load_json_lines(input_file)
df = pd.DataFrame(reviews)

# --- Initial Checks ---
# Quick overview of the raw data: size, available columns, and null counts.
print(f"Initial data shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Missing values per column:\n{df.isnull().sum()}")

In [None]:
# --- Text Cleaning ---
# Skip this step entirely when the frame has no "text" column.
if "text" in df.columns:
    # Run the project's clean_text helper over every review, then collapse
    # each run of whitespace into a single space and trim both ends.
    df["text"] = (
        df["text"]
        .apply(clean_text)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )

In [None]:
# --- Remove Duplicates ---
# keep=False marks every row whose review_id occurs more than once (including
# the first occurrence), so the audit file contains all rows involved.
duplicates = df.loc[df.duplicated(subset=["review_id"], keep=False)]
if len(duplicates.index) > 0:
    print(
        f"Found {duplicates.shape[0]} duplicate reviews. Saving to '{duplicates_file}'."
    )
    # Write the flagged rows as JSON Lines, leaving non-ASCII characters as-is
    duplicates.to_json(
        duplicates_file,
        orient="records",
        lines=True,
        force_ascii=False,
    )

# Keep only the first row for each review_id and discard the later repeats
df = df.loc[~df.duplicated(subset=["review_id"], keep="first")]

In [None]:
# --- Remove Empty Reviews ---
# Drop only the rows whose text is an empty string. Rows whose text is missing
# (NaN/None) are deliberately kept at this point: `.str.len() > 0` evaluates to
# False for missing values, so filtering on it alone would silently discard
# those rows before the missing-value audit below could record them.
text_is_missing = df["text"].isna()
text_is_non_empty = df["text"].str.len() > 0
df = df[text_is_missing | text_is_non_empty]

# --- Handle Missing Values ---
# Collect every row that has a missing value in any column (now including rows
# with missing text) and save them for inspection before removing them.
nas = df[df.isnull().any(axis=1)]
if not nas.empty:
    print(f"Found {nas.shape[0]} rows with missing values. Saving them to '{na_file}'.")
    nas.to_json(na_file, orient="records", lines=True, force_ascii=False)

df = df.dropna()  # Drop rows with any missing values
print(f"Data shape after handling NAs: {df.shape}")

In [None]:
# --- Save Cleaned Data ---
# Write the cleaned frame as JSON Lines (one record per line), leaving
# non-ASCII characters unescaped, then report where the file was written.
df.to_json(
    cleaned_file,
    orient="records",
    lines=True,
    force_ascii=False,
)
print(f"Cleaned data saved to '{cleaned_file}'")