In [1]:
# Data Wrangling
import pandas as pd
import numpy as np

# Eventual Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Text Processing
import emoji
import re
import string

# NLP
import spacy

# Language Detection
from transformers import pipeline

# Spell Checking
import enchant
from autocorrect import Speller
from enchant.checker import SpellChecker

2024-03-11 10:05:30.881489: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Ready scraped data
# Load the merged scrape output into `df`; every later cell derives from this frame.
# NOTE(review): a later cleaning cell drops rows whose "entry_date" equals the literal
# string "entry_date" and casts "entry_id"/"offer_ref" to int, which suggests the CSV
# contains repeated header lines and that the ID columns are not numeric yet at this
# point — confirm before filtering on integer IDs.
df = pd.read_csv("./data/merged_output.csv")

In [3]:
# Preview the hand-picked reviews that the next cell deletes.
# The ID columns are only cast to int further down and the raw frame can still hold
# stray header rows, so the raw values may be strings; compare on a numeric view of
# the IDs so the integer lists below actually match.
mixed_script_offer_refs = [119352308, 126556830, 97049775]
mixed_script_entry_ids = [16919388, 16453304, 16368349]
is_mixed_script = (
    pd.to_numeric(df["offer_ref"], errors="coerce").isin(mixed_script_offer_refs)
    & pd.to_numeric(df["entry_id"], errors="coerce").isin(mixed_script_entry_ids)
)
df.loc[is_mixed_script, "review_text"]

Series([], Name: review_text, dtype: object)

In [4]:
# Delete reviews written in Cyrillic with a mix of Polish characters
# (those cases are not handled by the Cyrillic fix later on).
# The ID columns are only cast to int further down and the raw frame can still hold
# stray header rows, so the raw values may be strings; compare on a numeric view of
# the IDs so the integer lists below actually match.
# NOTE(review): the two ID lists are matched independently, not as (offer, entry)
# pairs — confirm that is intended.
mixed_script_offer_refs = [119352308, 126556830, 97049775]
mixed_script_entry_ids = [16919388, 16453304, 16368349]
is_mixed_script = (
    pd.to_numeric(df["offer_ref"], errors="coerce").isin(mixed_script_offer_refs)
    & pd.to_numeric(df["entry_id"], errors="coerce").isin(mixed_script_entry_ids)
)
df = df[~is_mixed_script]

In [5]:
# Load the list of reviews to exclude for being written in another language.
# The following cells read its "entry_id" and "offer_ref" columns to filter `df`.
# NOTE(review): presumably one row per foreign-language review — confirm.
other_langs = pd.read_csv("./data/foreign_languages.csv")

In [6]:
# Preview the foreign-language reviews that the next cell deletes.
# A row of `other_langs` names one review by its (entry_id, offer_ref) pair, so match
# on the pair: two independent isin() checks would also hit any review whose entry_id
# and offer_ref each appear somewhere in the list but belong to different rows.
# IDs are compared numerically because `df` has not had its dtypes fixed yet.
review_key_cols = ["entry_id", "offer_ref"]
foreign_pairs = set(
    other_langs[review_key_cols]
    .apply(pd.to_numeric, errors="coerce")
    .dropna()
    .itertuples(index=False, name=None)
)
is_foreign = np.array(
    [
        pair in foreign_pairs
        for pair in df[review_key_cols]
        .apply(pd.to_numeric, errors="coerce")
        .itertuples(index=False, name=None)
    ],
    dtype=bool,
)
df.loc[is_foreign, "review_text"].head(5)

Series([], Name: review_text, dtype: object)

In [7]:
# Delete reviews written in other languages.
# A row of `other_langs` names one review by its (entry_id, offer_ref) pair, so match
# on the pair: two independent isin() checks would also drop any review whose entry_id
# and offer_ref each appear somewhere in the list but belong to different rows.
# IDs are compared numerically because `df` has not had its dtypes fixed yet.
review_key_cols = ["entry_id", "offer_ref"]
foreign_pairs = set(
    other_langs[review_key_cols]
    .apply(pd.to_numeric, errors="coerce")
    .dropna()  # rows without a usable numeric key cannot identify a review
    .itertuples(index=False, name=None)
)
is_foreign = np.array(
    [
        pair in foreign_pairs
        for pair in df[review_key_cols]
        .apply(pd.to_numeric, errors="coerce")
        .itertuples(index=False, name=None)
    ],
    dtype=bool,
)
df = df.loc[~is_foreign]

In [8]:
# "review_text" nan are to be deleted. Other can stay.
def _sentiment_from_score(score):
    """Map a numeric score to a label: >= 4 Positive, <= 2 Negative, anything else Neutral."""
    if score >= 4:
        return "Positive"
    if score <= 2:
        return "Negative"
    return "Neutral"


# Row-level clean-up, in this order: drop reviews without text, drop stray header
# lines (rows whose entry_date is the literal column name), drop exact repeats of the
# same offer/entry/text.
df = (
    df.dropna(subset=["review_text"])
    .loc[lambda frame: frame["entry_date"] != "entry_date"]
    .drop_duplicates(subset=["offer_ref", "entry_id", "review_text"])
)

# Fix data types, then derive the sentiment label from the (now numeric) score.
df = df.assign(
    entry_date=lambda frame: pd.to_datetime(frame["entry_date"]),
    purchase_date=lambda frame: pd.to_datetime(frame["purchase_date"]),
    entry_id=lambda frame: frame["entry_id"].astype(int),
    offer_ref=lambda frame: frame["offer_ref"].astype(int),
    score=lambda frame: frame["score"].astype(float),
    sentiment=lambda frame: frame["score"].apply(_sentiment_from_score),
)

In [9]:
# Working frame for text cleaning: one row per distinct review text, with its sentiment.
text_and_label = df[["review_text", "sentiment"]]
to_clean = text_and_label.drop_duplicates(subset="review_text").copy()

In [10]:
# Reviews containing a tab, line break, vertical tab, form feed or BOM
# (4877 rows at the last recorded run).
has_control_whitespace = to_clean["review_text"].str.contains("[\t\r\n\v\f\ufeff]")
to_clean[has_control_whitespace].shape

(4877, 2)

In [11]:
# Swap every tab / line break / vertical tab / form feed / BOM for a plain space.
to_clean["review_text"] = to_clean["review_text"].replace(
    to_replace="[\t\r\n\v\f\ufeff]",
    value=" ",
    regex=True,
)

In [12]:
# Collapse the runs of spaces introduced by the previous step into a single space,
# then list any review that still contains a double space (expected: none).
to_clean["review_text"] = to_clean["review_text"].replace(
    to_replace=" +", value=" ", regex=True
)
still_double_spaced = to_clean["review_text"].str.contains("  ")
to_clean[still_double_spaced]

Unnamed: 0,review_text,sentiment


In [13]:
# Remove Cyrillic passages. Some reviews are written half in Cyrillic and half in
# Polish, so only the Cyrillic part (plus the spaces, punctuation and digits wrapped
# around it) is cut out and the Polish text is kept.
# The pattern needs at least two runs of Cyrillic letters (U+0400-U+04FF), optionally
# separated by spaces/punctuation/digits.
# string.punctuation must be escaped before it goes inside [...]: unescaped, its "\]"
# is read as an escaped bracket (so the backslash itself is never matched) and its "["
# triggers a "possible nested set" FutureWarning.
_cyrillic_separators = f"[ {re.escape(string.punctuation)}0-9]*"
_cyrillic_letters = "[\u0400-\u04FF]+"
cyrylic_regex = (
    _cyrillic_separators
    + _cyrillic_letters
    + _cyrillic_separators
    + _cyrillic_letters
    + _cyrillic_separators
)
to_clean["review_text"] = (
    to_clean["review_text"]
    .transform(lambda x: re.sub(cyrylic_regex, "", x))
    .replace("", np.nan)  # reviews that were entirely Cyrillic are now empty
)
to_clean = to_clean.dropna(subset=["review_text"])

In [14]:
# model_ckpt = "papluca/xlm-roberta-base-language-detection"
# pipe = pipeline("text-classification", model=model_ckpt)
# language_classification = pipe(to_clean["review_text"].transform(lambda x: x[:100]).to_list(), top_k=1, truncation=True)
# to_clean = pd.concat([to_clean.reset_index(), pd.DataFrame(np.array(language_classification).reshape(-1).tolist())], axis=1).set_index("index")

In [15]:
# # Lot of Wrong predictions. I will fix them manually
# to_clean[(to_clean["label"] != "pl")].iloc[2400:]

In [16]:
# Flag emoji usage per review. "all_emoji" rows are later excluded entirely, while
# "has_emoji" rows only get their emojis stripped.
# NOTE(review): the check runs character by character, so helper code points inside an
# emoji sequence (e.g. a variation selector) may count as "not an emoji" — verify
# against the emoji library version in use.
to_clean["has_emoji"] = to_clean["review_text"].apply(
    lambda text: any(emoji.is_emoji(char) for char in text)
)
to_clean["all_emoji"] = to_clean["review_text"].apply(
    lambda text: all(emoji.is_emoji(char) for char in text)
)

In [17]:
# Counts at the last recorded run: 676 reviews contain at least one emoji,
# 55 reviews consist of emojis only.
print(to_clean["has_emoji"].sum())
print(to_clean["all_emoji"].sum())

676
55


In [18]:
# Emoji-only reviews hint at the sentiment, but not consistently
# (compare the emoji with the label in each row).
emoji_only_reviews = to_clean.loc[to_clean["all_emoji"]]
emoji_only_reviews.head(10)

Unnamed: 0,review_text,sentiment,has_emoji,all_emoji
0,😑😑,Negative,True,True
1091,😐,Negative,True,True
9421,👍,Neutral,True,True
10053,👍👍👍👍👍,Positive,True,True
10958,🤗,Positive,True,True
11179,🥳🎉,Positive,True,True
13728,👍🏻👍🏻,Negative,True,True
14711,👍👍,Neutral,True,True
24204,🙂,Positive,True,True
25324,😫,Negative,True,True


In [19]:
# Drop reviews made of one character repeated (e.g. "aaaa"), unless that character
# is an emoji: keep a row when it has more than one distinct character OR is emoji-only.
is_one_repeated_char = to_clean["review_text"].apply(lambda text: len(set(text)) == 1)
to_clean = to_clean.loc[~is_one_repeated_char | to_clean["all_emoji"]]

In [20]:
# Remove reviews with fewer than 3 unique characters that are not meaningful
# (decided by hand). The "meaningful" pattern keeps emoticons such as :) ;( ;/,
# "ok", "kk", "bdb", "x", "*" and anything containing a digit (e.g. 5/5).
# Raw string + non-capturing group: same matches as before, but without the invalid
# escape sequences and without the pandas "pattern has match groups" warning.
meaningful_short_pattern = r"(?:ok|[0-9]|:[)(]|;[)(/]|\*|bdb|kk|x)"
is_short = to_clean["review_text"].transform(lambda x: len(set(x)) < 3)
is_meaningful = to_clean["review_text"].str.lower().str.contains(meaningful_short_pattern)
to_clean = to_clean[~(is_short & ~to_clean["has_emoji"] & ~is_meaningful)]

  & ~to_clean["review_text"].str.lower().str.contains("(ok|[0-9]|:[\)\(]|;[\)\()\/]|\*|bdb|kk|x)"))


In [58]:
# Reviews in which lowercase "ok" (optionally followed by one character) precedes a
# number more than once — candidates where "ok" means "circa".
circa_hits = to_clean["review_text"].str.count(r"\bok\b.? [0-9]+")
to_clean.loc[circa_hits > 1, "review_text"]

146       Odradzam ze zwględu na funkcjonalność. Bardzo ...
428       Wykonanie bardo dobre. Co do baterii, tu trzeb...
804        Konto Nie polecam Ocena: 1/5 Wystawiono 2 mie...
1085      Jedyna zaleta to wysokość. . Odkurzanie niedok...
1798      [...] A teraz krótko o baterii: przestrzegam i...
                                ...                        
140820    Omron 3 (zakupiony w maju 2013 r.; w aptece) b...
142821    Po pomalowaniu (na bardzo jasnej ścianie) i wy...
159334    Kocioł użytkuję od 2 lat. Trzeba dobrze wyregu...
166906    Zapach podoba się każdemu :) Osobiscie używam ...
172943    Telefon posiadałem 2 miesiące i z przykrością ...
Name: review_text, Length: 73, dtype: object

In [56]:
# "ok" in any casing, as a whole word, unless it is followed by an optional period,
# a space and a digit ("ok. 40zł", "ok 5 kg") — that form is the abbreviation of
# the Polish "około" (circa) and must stay untouched.
_OK_NOT_CIRCA = re.compile(r"\b[oO][kK]\b(?!\.? [0-9])")


def fix_ok_in_string(text):
    """Normalise every "ok" that means "good" to "Ok", leaving "ok" = circa alone.

    "Ok" gets its own fix because it is very common in reviews and carries positive
    sentiment, while the look-alike "ok." / "ok" before a number is the short form
    of "około" (circa) in Polish.

    Each occurrence is judged on its own: a circa usage somewhere in the text no
    longer changes how the other occurrences are treated, punctuation following
    "ok" is preserved, and "ok" at the end of the text or before "!" / "," is
    normalised too.

    Parameters
    ----------
    text : str
        Review text.

    Returns
    -------
    str
        The text with sentiment-bearing "ok"/"OK"/"oK" rewritten as "Ok".
    """
    return _OK_NOT_CIRCA.sub("Ok", text)

In [57]:
# Take the first review with more than two circa-style "ok <number>" hits and plant
# two extra sentiment-style "ok " tokens in it (after the 12th character and before
# the last 10 characters) to see how the fixer treats both meanings side by side.
circa_hit_counts = to_clean["review_text"].str.count(r"\bok\b.? [0-9]+")
test_text = to_clean.loc[circa_hit_counts > 2, "review_text"].iloc[0]

test_text = "".join(
    [test_text[:12], "ok ", test_text[12:-10], "ok ", test_text[-10:]]
)

fix_ok_in_string(test_text)

'Oczyszczacz Ok kupiony w RTVEuroAGD za 299pln miał być "okazją stulecia". Na stronie polskiego importera widniała cena ponad 1000pln !!!!! Po zakupie wyszło to: - lampa UV ma żywotność 600h, co kwalifikuje ją do wymiany co ok. 2 m-ce; koszt ok. 40zł plus transport (swoją drogą ciężko ocenić jej działanie; instrukcja zabrania też używanie oczyszczacza bez lub z niedziałającą lampą) - filtr ściemniał i pomimo czyszczenia już się praktycznie nie nadaje do użytku po 13 miesiącach - ma dziwny zapach, więc pewnie powinien być wymieniony nieco wcześniej; koszt ok. 70pln plus pewnie jakiś transport - denerwowało mnie również, że oczyszczacz stojąc na podłodze i w trybie nadmuchu "2" lub "3" ODGANIA OD SIEBIE brud zamiast go wciągać !!!! Sprzedałem niedawno na licytacji Allegro za 77zł (słabe zainteresowanie). Rok korzystania za ok. 280pln. Może opłacałoby się dalej inwestować w filtry i lampy UV, ale doszedłem do wniosku, że ten sprzęt jest po prostu słaby. Szybko nie zdecyduję się na nic Ok 

In [59]:
# Mark Ok's in data.
# Word boundaries (rather than requiring a non-word character on both sides) also
# catch an "ok" at the very start or end of a review, and the raw string avoids the
# invalid "\W" escape sequence.
to_clean["has_ok"] = to_clean["review_text"].str.lower().str.contains(r"\bok\b")
# to_clean[to_clean["has_ok"]] = to_clean[to_clean["has_ok"]].replace(r"\bok\b", "Ok", regex=True)

In [15]:
# Spot-check on a string mixing both meanings: "ok. 100kg" is the circa usage,
# the first and last "ok" are the sentiment usage.
fix_ok_in_string("ok. lalala ok. 100kg ok!")

'Ok. lalala ok. 100kg Ok!'

In [61]:
# Run the "ok" normaliser over every review.
normalised_reviews = to_clean["review_text"].apply(fix_ok_in_string)
to_clean["review_text"] = normalised_reviews

In [17]:
# Not Currently Used
# Remove all numbers from data.
# to_clean["review_text"] = to_clean["review_text"].transform(lambda x: re.sub("[0-9]+", "", x))

In [62]:
# Largest default Polish language model
nlp = spacy.load("pl_core_news_lg")

# Autocorrect speller
spell = Speller("pl", only_replacements=True)

# Enchant spellcheckers. Both use the same "pl_PL" tag so they are backed by the
# same dictionary (the checker previously asked for "pl_Pl").
chkr = SpellChecker("pl_PL")
d_typo = enchant.Dict("pl_PL")

In [69]:
# During cleaning new duplicate reviews were created.
# .copy() makes the result an independent frame, so the column assignments in the
# following cells no longer raise SettingWithCopyWarning.
# NOTE(review): rows are compared on every column, so the same text with a different
# sentiment or flag is kept — confirm that is intended.
to_clean = to_clean.drop_duplicates().copy()

In [70]:
# Parse every review with the spaCy pipeline; "review_text" holds the parsed result
# of nlp(...) from here on instead of a plain string.
parsed_reviews = to_clean["review_text"].apply(lambda review: nlp(review))
to_clean["review_text"] = parsed_reviews

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_clean["review_text"] = to_clean["review_text"].transform(lambda x: nlp(x))


In [93]:
def _typo_records(doc):
    """Collect one record per token of a parsed review that the dictionary rejects.

    A token is reported when it is not punctuation, whitespace, number-like or an
    emoji, and the enchant dictionary rejects it as written, lower-cased and
    capitalised.

    Each record is [token text, enchant suggestions, token index in the doc,
    character offset of the token in the doc text, token length].
    """
    records = []
    for token_index, token in enumerate(doc):
        if token.is_punct or token.is_space or token.like_num:
            continue
        word = token.text
        if emoji.is_emoji(word):
            continue
        if (
            d_typo.check(word)
            or d_typo.check(word.lower())
            or d_typo.check(word.capitalize())
        ):
            continue
        # token.idx is this token's own character offset. Searching the text with
        # str.find() returned the first occurrence of the same characters, which is
        # wrong for repeated words and for words contained in an earlier, longer word.
        records.append([word, d_typo.suggest(word), token_index, token.idx, len(word)])
    return records


to_clean["Typo Data"] = to_clean["review_text"].transform(_typo_records)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_clean["Typo Data"] = (


In [94]:
# Persist the annotated frame (including the "Typo Data" column) for later steps.
# Pickle is used rather than CSV because the columns hold Python objects; only load
# this file back from a trusted location, since unpickling can execute arbitrary code.
to_clean.to_pickle("./data/typos_annotated.pkl")

In [None]:
# Will run 3 mins

# Get subset of data.
# checker_test = to_clean["review_text"].iloc[:2000].transform(lambda x: nlp(x))

# 42% of first 2000 reviews contain typos
# pyenchant seems good at detecting typos, but it seems bad at fixing them, so we'll have to be careful with it.
# checker_test[checker_test.transform(lambda x: all([d_typo.check(token.text.lower()) for token in x if not token.is_punct])) == False]

In [20]:
## Takes 44 mins.
## Fix all typos. Write 2 csv files, with and without typo fixing. Used for comparison in another script.
# to_clean["review_text"].transform(lambda x: [d_typo.suggest(token.text)[0] if (not token.is_punct) and (not d_typo.check(token.text)) and (chkr.suggest(token.text))  else token for token in x]).to_csv("Testing Typo Checking.csv")
# to_clean["review_text"].to_csv("Comparison to a Typo fix.csv")

In [21]:
# I've forgotten about sentiment.
# to_clean["sentiment"].to_csv("sentiment provision.csv")