In [5]:
import pandas as pd
import re
import contractions
import html
import spacy
from nltk.corpus import stopwords

nlp = spacy.load('en_core_web_sm')

In [6]:
df = pd.read_csv(r"../data/raw_reviews.csv")
df['at'] = pd.to_datetime(df['at'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30946 entries, 0 to 30945
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              30946 non-null  object        
 1   userName              30946 non-null  object        
 2   userImage             30946 non-null  object        
 3   content               30937 non-null  object        
 4   score                 30946 non-null  int64         
 5   thumbsUpCount         30946 non-null  int64         
 6   reviewCreatedVersion  27933 non-null  object        
 7   at                    30946 non-null  datetime64[ns]
 8   replyContent          8529 non-null   object        
 9   repliedAt             8529 non-null   object        
 10  appVersion            27933 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(8)
memory usage: 2.6+ MB


In [7]:
# Drop missing content
df = df.dropna(subset=["content"])
print(f"After dropping missing review: {len(df)}")

# Drop duplicates based on review
df = df.drop_duplicates(subset=["content"])
print(f"After dropping duplicates: {len(df)}")

# Keep only rows where 'review' is an actual string
df = df[df["content"].apply(lambda x: isinstance(x, str))]
print(f"After keeping only string-type reviews: {len(df)}")

After dropping missing review: 30937
After dropping duplicates: 28280
After keeping only string-type reviews: 28280


In [8]:
stop_words = set(stopwords.words("english"))
negation_words = {"no", "not", "nor", "never", "n't", "dont"}
stop_words = stop_words - negation_words
post_lemmatization_corrections = {
    "datum": "data",
    "cannot": "can_not",
    "dont": "do_not",
    "doesnt": "does_not",
    "wont": "will_not",
    "cant": "can_not",
    "isnt": "is_not",
    "wasnt": "was_not",
    "arent": "are_not"
}

def sentiment_preprocessing(text: str) -> str:
    # 1. Decode HTML entities: &amp; → &, etc.
    text = html.unescape(text)

    # 2. Normalize curly quotes to straight quotes
    text = re.sub(r'[“”]', '"', text)        # curly double quotes
    text = re.sub(r"[‘’]", "'", text)        # curly single quotes

    # 3. Collapse duplicate quotes ("" → ")
    text = re.sub(r'""', '"', text)
    text = re.sub(r"''", "'", text)

    # 4. Remove literal \n, \t, \r from escaped strings
    text = re.sub(r'\\[nrt]+', ' ', text)

    # 5. Remove URLs and mentions
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)

    # 6. Expand contractions (can't → can not)
    text = contractions.fix(text)

    # 7. Lowercase the text
    text = text.lower()

    # 8. Keep only useful punctuation: ! ? % '
    #    Remove: . , : ; ( ) etc.
    text = re.sub(r"[^\w\s!?%']", " ", text)

    # 9. Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    # Tokenize & lemmatize
    doc = nlp(text)

    tokens = []
    skip_next = False

    for i, token in enumerate(doc):
        if skip_next:
            skip_next = False
            continue

        lemma = token.lemma_.lower()
        lemma = post_lemmatization_corrections.get(lemma, lemma)

        # Preserve negation + meaningful word (negation tagging)
        if lemma in negation_words and i + 1 < len(doc):
            next_token = doc[i + 1]
            if next_token.pos_ in {"ADJ", "VERB", "ADV", "NOUN"}:
                next_lemma = next_token.lemma_.lower()
                next_lemma = post_lemmatization_corrections.get(next_lemma, next_lemma)
                tokens.append(f"{lemma}_{next_lemma}")
                skip_next = True
                continue
            else:
                tokens.append(lemma)
        elif lemma not in stop_words and token.is_alpha and len(lemma) > 1:
            tokens.append(lemma)

    return " ".join(tokens)

df["cleaned_content"] = df["content"].apply(sentiment_preprocessing)

In [9]:
# Drop missing content
df = df.dropna(subset=["cleaned_content"])
print(f"After dropping missing review: {len(df)}")

# Drop duplicates based on review
df = df.drop_duplicates(subset=["cleaned_content"])
print(f"After dropping duplicates: {len(df)}")

# Keep only rows where 'review' is an actual string
df = df[df["cleaned_content"].apply(lambda x: isinstance(x, str))]
print(f"After keeping only string-type reviews: {len(df)}")

After dropping missing review: 28280
After dropping duplicates: 26500
After keeping only string-type reviews: 26500


In [10]:
df.to_csv(r"../data/cleaned_reviews.csv", index=False)
df[["content", "cleaned_content"]].head()

Unnamed: 0,content,cleaned_content
0,EDIT - Komoot is the best app for walks!.. Rea...,edit komoot good app walk really good app sham...
1,the offline maps and route tracking are very u...,offline map route tracking useful
2,Nice,nice
3,are you insane? 800$ yearly for the app and I ...,insane yearly app not_use buy campgroud trail ...
4,sooooo many cool trails :),sooooo many cool trail
