# Pre-processing Data

In [1]:
# =======================================
# Pre-Processing of all Datasets
# =======================================

In [2]:
# 1. Package Installation
# -----------------------
# Install of pandas to handle and manipulate structured data
# Install langdetect to automatically detect the language of a given text.
# Install nltk for a wide array of NLP tasks like tokenization, stopword removal, stemming etc.
!pip install pandas langdetect nltk spacy scikit-learn
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 330.3 kB/s eta 0:00:39
     --------------------------------------- 0.0/12.8 MB 495.5 kB/s eta 0:00:26
     - -------------------------------------- 0.4/12.8 MB 2.9 MB/s eta 0:00:05
     -- ------------------------------------- 0.9/12.8 MB 5.2 MB/s eta 0:00:03
     ----- ---------------------------------- 1.7/12.8 MB 7.9 MB/s eta 0:00:02
     ------- -------------------------------- 2.5/12.8 MB 9.3 MB/s eta 0:00:02
     --------- ------------------------------ 3.1/12.8 MB 10.0 MB/s eta 0:00:01
     ----------- ---------------------------- 3.8/12.8 MB 10.6 MB/s eta 0:00:01
     -------------- ------------------------- 4.6/12.8 MB 11.3 MB/s eta 0:00:01
     ----------------- -------------

In [3]:
# === Install Required Libraries (for notebook environments) ===
import pandas as pd
import re
import nltk
import spacy
from langdetect import detect
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer


In [4]:
# Download NLTK stopwords
nltk.download('stopwords')
nltk.download('wordnet')

# Load SpaCy's English model for lemmatization
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nklom\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nklom\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Extra Knab reviews added

In [6]:
# === Load Data ===
# Load pre-collected reviews from various fintech apps 
df_Merged_extra_Reddit_Reviews = pd.read_excel("C:/Users/.../Merged_extra_Reddit_Reviews.xlsx")  

In [7]:
display(df_Merged_extra_Reddit_Reviews)

Unnamed: 0,post_title,comment,score,author,permalink,app_name
0,The credit card: when you can only pay with this,"N26, digital and free. You dump the purchase ...",5,b4ttleduck,https://www.reddit.com/r/nederlands/comments/...,N26
1,The credit card: when you can only pay with this,"Take a debit card via N26, for example.",6,Diederik-NL,https://www.reddit.com/r/nederlands/comments/...,N26
2,The credit card: when you can only pay with this,Very often when you can choose Mastercsrd/Visa...,1,TheHazardOfLife,https://www.reddit.com/r/nederlands/comments/...,N26
3,The credit card: when you can only pay with this,Revolut and N26 offer free debit cards which I...,1,Interesting-Loan507,https://www.reddit.com/r/nederlands/comments/...,N26
4,The credit card: when you can only pay with this,Free account at N26 or Revolut?,1,Square_Dimension6763,https://www.reddit.com/r/nederlands/comments/...,N26
...,...,...,...,...,...,...
1467,Best bank for joint accounts?,I have Knab. Very affordable. You only pay fo...,1,ZoneProfessional8202,https://www.reddit.com/r/nederlands/comments/...,boy
1468,Best bank for joint accounts?,"would stick with one of the 4 big banks, then...",1,chartnoob,https://www.reddit.com/r/nederlands/comments/...,boy
1469,Best bank for joint accounts?,"We also went for Knab, excellent service and ...",1,Velum_In_1716,https://www.reddit.com/r/nederlands/comments/...,boy
1470,Switching banks,"I have had quite good experiences with knab, ...",2,MikeWazowski2-2-2,https://www.reddit.com/r/nederlands/comments/...,boy


In [8]:
print(df_Merged_extra_Reddit_Reviews.columns)


Index(['post_title', ' comment', ' score', ' author', ' permalink',
       ' app_name'],
      dtype='object')


In [9]:
# === Language Filtering ===
### 1) Remove Non-English Reviews and Filter Short Reviews ###
def is_english(text):
    try:
        return detect(text) == "en"
    except:
        return False  

df_Merged_extra_Reddit_Reviews = df_Merged_extra_Reddit_Reviews[df_Merged_extra_Reddit_Reviews[" comment"].apply(is_english)]  # Keep only English reviews
df_Merged_extra_Reddit_Reviews = df_Merged_extra_Reddit_Reviews[df_Merged_extra_Reddit_Reviews[" comment"].str.split().apply(len) >= 10]  # Keep reviews with ≥10 words

In [10]:
# Define words to remove
words_to_remove = ["bunq", "n26", "revolut", "google", "mollie", "ing", "abn amro", "netherlands", "knab"]
pattern = r'\b(?:' + '|'.join(words_to_remove) + r')\b'

# Remove target words (case-insensitive)
df_Merged_extra_Reddit_Reviews[" comment"] = df_Merged_extra_Reddit_Reviews[" comment"].str.replace(pattern, '', case=False, regex=True)

# Removing extra spaces
df_Merged_extra_Reddit_Reviews[" comment"] = df_Merged_extra_Reddit_Reviews[" comment"].str.replace(r"\s+", " ", regex=True).str.strip()

### 2) Convert to Lowercase
df_Merged_extra_Reddit_Reviews[" comment"] = df_Merged_extra_Reddit_Reviews[" comment"].str.lower()



In [11]:
### 3) Removing Punctuation, Extra Spaces, Numbers, Emojis, and Special Characters ###
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII (e.g. emojis)
    return text

df_Merged_extra_Reddit_Reviews[" comment"] = df_Merged_extra_Reddit_Reviews[" comment"].apply(clean_text)



In [12]:
### 4) Removing Common Stop Words
stop_words = set(stopwords.words("english"))
df_Merged_extra_Reddit_Reviews[" comment"] = df_Merged_extra_Reddit_Reviews[" comment"].apply(
    lambda x: " ".join([word for word in x.split() if word not in stop_words])
)

In [13]:
### 5) Tokenization 
df_Merged_extra_Reddit_Reviews["review_tokens"] = df_Merged_extra_Reddit_Reviews[" comment"].apply(
    lambda x: [token.text for token in nlp(x)]
)


In [14]:
### 6) Lemmatization
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if token.lemma_ not in stop_words])

df_Merged_extra_Reddit_Reviews[" comment"] = df_Merged_extra_Reddit_Reviews[" comment"].apply(lemmatize_text)

# Display final preprocessed DataFrame
display(df_Merged_extra_Reddit_Reviews)

Unnamed: 0,post_title,comment,score,author,permalink,app_name,review_tokens
0,The credit card: when you can only pay with this,digital free dump purchase amount card pay,5,b4ttleduck,https://www.reddit.com/r/nederlands/comments/...,N26,"[digital, free, dump, purchase, amount, card, ..."
2,The credit card: when you can only pay with this,often choose mastercsrdvisa payment screen mat...,1,TheHazardOfLife,https://www.reddit.com/r/nederlands/comments/...,N26,"[often, choose, mastercsrdvisa, payment, scree..."
3,The credit card: when you can only pay with this,offer free debit card actually use possible in...,1,Interesting-Loan507,https://www.reddit.com/r/nederlands/comments/...,N26,"[offer, free, debit, cards, actually, use, pos..."
6,The credit card: when you can only pay with this,free bank account germany free debit mastercar...,1,Langkampo,https://www.reddit.com/r/nederlands/comments/...,N26,"[free, bank, account, germany, free, debit, ma..."
7,The credit card: when you can only pay with this,account german bank apply virtual credit card ...,1,Fallaquenta,https://www.reddit.com/r/nederlands/comments/...,N26,"[account, german, bank, apply, virtual, credit..."
...,...,...,...,...,...,...,...
1466,Can you open two student accounts?,pay fix amount unlimited number account card,1,ZoneProfessional8202,https://www.reddit.com/r/nederlands/comments/...,boy,"[pay, fixed, amount, unlimited, number, accoun..."
1467,Best bank for joint accounts?,affordable pay check account also open many ch...,1,ZoneProfessional8202,https://www.reddit.com/r/nederlands/comments/...,boy,"[affordable, pay, checking, account, also, ope..."
1468,Best bank for joint accounts?,would stick one big bank always good rabo sns ...,1,chartnoob,https://www.reddit.com/r/nederlands/comments/...,boy,"[would, stick, one, big, banks, always, good, ..."
1469,Best bank for joint accounts?,also go excellent service good interest rate,1,Velum_In_1716,https://www.reddit.com/r/nederlands/comments/...,boy,"[also, went, excellent, service, good, interes..."


In [15]:
df_knab_reviews = pd.read_excel(r'C:/Users/.../knab_GooglePlay_reviews.xlsx')

In [16]:
# Load English language model for spaCy
nlp = spacy.load("en_core_web_sm")

### 1) Remove Non-English Reviews and Filter Short Reviews ###
def is_english(text):
    try:
        return detect(text) == "en"
    except:
        return False  # Handle errors for very short texts

# Keep only English reviews
df_knab_reviews = df_knab_reviews[
    df_knab_reviews["review_translated"].apply(is_english)
]

# Keep reviews with ≥15 words
df_knab_reviews = df_knab_reviews[
    df_knab_reviews["review_translated"].str.split().apply(len) >= 15
]

# Define words to remove
words_to_remove = ["bunq", "n26", "revolut", "google", "mollie", "ing", "abn amro", "netherlands", "knab"]
pattern = r'\b(?:' + '|'.join(words_to_remove) + r')\b'

# Remove target words (case-insensitive)
df_knab_reviews["review_translated"] = df_knab_reviews["review_translated"].str.replace(pattern, '', case=False, regex=True)

# Remove extra spaces
df_knab_reviews["review_translated"] = df_knab_reviews["review_translated"].str.replace(r"\s+", " ", regex=True).str.strip()



In [17]:
### 2) Convert to Lowercase 
df_knab_reviews["review_translated"] = df_knab_reviews["review_translated"].str.lower()

### 3) Remove Punctuation, Extra Spaces, Numbers, Emojis, and Special Characters ###
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII (e.g., emojis)
    return text

df_knab_reviews["review_translated"] = df_knab_reviews["review_translated"].apply(clean_text)

### 4) Remove Common Stop Words 
stop_words = set(stopwords.words("english"))
df_knab_reviews["review_translated"] = df_knab_reviews["review_translated"].apply(
    lambda x: " ".join([word for word in x.split() if word not in stop_words])
)


In [18]:
### 5) Tokenization 
df_knab_reviews["review_tokens"] = df_knab_reviews["review_translated"].apply(
    lambda x: [token.text for token in nlp(x)]
)


In [19]:
### 6) Lemmatization
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if token.lemma_ not in stop_words])

df_knab_reviews["review_translated"] = df_knab_reviews["review_translated"].apply(lemmatize_text)

# Display final preprocessed DataFrame
display(df_knab_reviews)

Unnamed: 0,reviewId,UserName,userImage,review_translated,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,review_tokens
0,49a7182b-4309-42b0-a154-c3b93ccc523a,Aad Dis,https://play-lh.googleusercontent.com/a/ACg8oc...,app support contactless payment must go pay ra...,1,1,3.67.0,2025-04-16 17:48:06,Our app supports contactless payments exclusiv...,2025-04-17 09:58:10,3.67.0,"[app, does, nt, support, contactless, payment,..."
1,258aec90-93bc-4a79-bf05-9435b276ab64,Joey hard,https://play-lh.googleusercontent.com/a-/ALV-U...,recommend many update hassle little possible w...,1,0,3.67.0,2025-04-04 11:04:10,It is understandable that this is frustrating....,2025-04-07 08:35:24,3.67.0,"[recommend, many, updates, hassle, little, pos..."
3,9a8e1929-7871-4871-97e9-d888531f2d50,Finance CV,https://play-lh.googleusercontent.com/a-/ALV-U...,bad acceptance dutch legal form would make wel...,4,0,3.67.0,2025-04-04 01:36:52,"Good morning, we are aware of the desire to ac...",2025-04-07 08:30:59,3.67.0,"[bad, acceptance, dutch, legal, forms, would, ..."
5,7a8bb717-f34a-4e26-b34e-57d84ac6352f,Thea Boelens,https://play-lh.googleusercontent.com/a/ACg8oc...,customer aegon transfer scan everything still get,2,0,3.67.0,2025-03-29 11:33:19,"Good afternoon, former Aegon Bank customers ha...",2025-04-01 17:51:54,3.67.0,"[customer, aegon, transferred, scanned, everyt..."
8,12a043ec-5e2b-44e2-a3f1-665c00cef89b,emre Akkus,https://play-lh.googleusercontent.com/a/ACg8oc...,work every weekend saturday night sunday night...,1,0,3.67.0,2025-03-23 04:00:53,"Good afternoon, maintenance is very important ...",2025-04-01 13:45:05,3.67.0,"[work, every, weekend, saturday, night, sunday..."
...,...,...,...,...,...,...,...,...,...,...,...,...
6219,2ab6d308-500d-425d-a7d6-70617b94fdeb,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,installing say save plenty room app instal wit...,2,1,1.0.5.1,2014-06-17 21:18:26,,NaT,1.0.5.1,"[installing, says, ca, nt, save, plenty, room,..."
6221,2acaf0c7-1ff1-4f21-bfad-b4fb3b6b53a4,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,install fail install usb storage sd card app p...,2,1,1.0.5.1,2014-06-13 09:12:43,,NaT,1.0.5.1,"[ca, nt, install, failed, install, usb, storag..."
6224,7a863c99-d3d6-49fb-8b78-7a8a85ad0869,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,good reliable plenty function overload app acc...,5,0,1.0.5,2014-04-27 21:00:36,,NaT,1.0.5,"[good, reliable, plenty, functions, overloaded..."
6226,cb1aee6f-cc86-492d-b040-eb9be281edef,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,need phone connection active work wifi mode st...,1,1,1.0.3,2014-04-14 16:34:08,,NaT,1.0.3,"[need, phone, connection, active, does, nt, wo..."


Reddit

In [20]:
df_Reddit = pd.read_excel(r'C:/Users/.../Merged_Reddit_Reviews.xlsx')

In [21]:
### 1) Remove Non-English Reviews and Filter Short Reviews ###
def is_english(text):
    try:
        return detect(text) == "en"
    except:
        return False  # Handle errors for very short texts

df_Reddit = df_Reddit[df_Reddit["review_translated"].apply(is_english)]  # Keep only English reviews
df_Reddit = df_Reddit[df_Reddit["review_translated"].str.split().apply(len) >= 10]  # Keep reviews with ≥10 words

In [22]:
display(df_Reddit)

Unnamed: 0,post_title,comment,review_translated,score,author,permalink,app_name
0,When catastrophe strikes: iDeal & PayPal not s...,There are some online credit cards that are pe...,There are some online credit cards that are pe...,2,Pacpav,https://www.reddit.com/r/Netherlands/comments/...,N26
1,When catastrophe strikes: iDeal & PayPal not s...,"I use N26, they offer a free debit card from M...","use N26, they offer a free debit card from Mas...",3,faabmaster,https://www.reddit.com/r/Netherlands/comments/...,N26
5,When catastrophe strikes: iDeal & PayPal not s...,"Either get a credit card from your bank, or si...","Either get a credit card from your bank, or si...",92,DaHaunter,https://www.reddit.com/r/Netherlands/comments/...,N26
6,When catastrophe strikes: iDeal & PayPal not s...,"i use n26 myself, never really had an issue so...","use n26 myself, never really had an issue so far",2,Useful-Position-4445,https://www.reddit.com/r/Netherlands/comments/...,N26
7,When catastrophe strikes: iDeal & PayPal not s...,I use N26. Only issue I have is that transferr...,use N26. Only issue I have is that transferrin...,2,MicrochippedByGates,https://www.reddit.com/r/Netherlands/comments/...,N26
...,...,...,...,...,...,...,...
7276,Transferring funds to America,Apps like transferwise generally let you save ...,Apps like transferwise generally let you save ...,9,NinjaInUnitard,https://www.reddit.com/r/Netherlands/comments/...,Revolut
7277,Transferring funds to America,Obviously you can transfer funds from NL to US...,Obviously you can transfer funds from NL to US...,3,rex-ac,https://www.reddit.com/r/Netherlands/comments/...,Revolut
7278,Transferring funds to America,I use US Forex to transfer funds from the US t...,use US Forex to transfer funds from the US to ...,2,aliblue225,https://www.reddit.com/r/Netherlands/comments/...,Revolut
7279,Transferring funds to America,I’ve used transferwise for the reverse directi...,I’ve used transferwise for the reverse directi...,2,purple_pandaface,https://www.reddit.com/r/Netherlands/comments/...,Revolut


In [23]:
# Define words to remove
words_to_remove = ["bunq", "n26", "revolut", "google", "mollie", "ing", "abn amro", "netherlands"]

# Create regex pattern for word boundaries and join with |
pattern = r'\b(?:' + '|'.join(words_to_remove) + r')\b'

# Remove the words (case-insensitive)
df_Reddit["review_translated"] = df_Reddit["review_translated"].str.replace(pattern, '', case=False, regex=True)

# Optional: Remove extra spaces after removal
df_Reddit["review_translated"] = df_Reddit["review_translated"].str.replace(r"\s+", " ", regex=True).str.strip()


In [24]:
### 2) Convert to Lowercase 
df_Reddit["review_translated"] = df_Reddit["review_translated"].str.lower()

In [25]:
display(df_Reddit)

Unnamed: 0,post_title,comment,review_translated,score,author,permalink,app_name
0,When catastrophe strikes: iDeal & PayPal not s...,There are some online credit cards that are pe...,there are some online credit cards that are pe...,2,Pacpav,https://www.reddit.com/r/Netherlands/comments/...,N26
1,When catastrophe strikes: iDeal & PayPal not s...,"I use N26, they offer a free debit card from M...","use , they offer a free debit card from master...",3,faabmaster,https://www.reddit.com/r/Netherlands/comments/...,N26
5,When catastrophe strikes: iDeal & PayPal not s...,"Either get a credit card from your bank, or si...","either get a credit card from your bank, or si...",92,DaHaunter,https://www.reddit.com/r/Netherlands/comments/...,N26
6,When catastrophe strikes: iDeal & PayPal not s...,"i use n26 myself, never really had an issue so...","use myself, never really had an issue so far",2,Useful-Position-4445,https://www.reddit.com/r/Netherlands/comments/...,N26
7,When catastrophe strikes: iDeal & PayPal not s...,I use N26. Only issue I have is that transferr...,use . only issue i have is that transferring m...,2,MicrochippedByGates,https://www.reddit.com/r/Netherlands/comments/...,N26
...,...,...,...,...,...,...,...
7276,Transferring funds to America,Apps like transferwise generally let you save ...,apps like transferwise generally let you save ...,9,NinjaInUnitard,https://www.reddit.com/r/Netherlands/comments/...,Revolut
7277,Transferring funds to America,Obviously you can transfer funds from NL to US...,obviously you can transfer funds from nl to us...,3,rex-ac,https://www.reddit.com/r/Netherlands/comments/...,Revolut
7278,Transferring funds to America,I use US Forex to transfer funds from the US t...,use us forex to transfer funds from the us to ...,2,aliblue225,https://www.reddit.com/r/Netherlands/comments/...,Revolut
7279,Transferring funds to America,I’ve used transferwise for the reverse directi...,i’ve used transferwise for the reverse directi...,2,purple_pandaface,https://www.reddit.com/r/Netherlands/comments/...,Revolut


In [26]:
### 3) Remove Punctuation, Extra Spaces, Numbers, Emojis, and Special Characters ###
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII (emojis, 
    return text

df_Reddit["review_translated"] = df_Reddit["review_translated"].apply(clean_text)

In [27]:
### 4) Remove Common Stop Words 
stop_words = set(stopwords.words("english"))
df_Reddit["review_translated"] = df_Reddit["review_translated"].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words]))

In [28]:
### 7) Tokenization 
df_Reddit["review_tokens"] = df_Reddit["comment"].apply(lambda x: [token.text for token in nlp(x)])

In [29]:
### 6) Lemmatization 
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if token.lemma_ not in stop_words])

df_Reddit["review_translated"] = df_Reddit["review_translated"].apply(lemmatize_text)

In [30]:
display(df_Reddit)

Unnamed: 0,post_title,comment,review_translated,score,author,permalink,app_name,review_tokens
0,When catastrophe strikes: iDeal & PayPal not s...,There are some online credit cards that are pe...,online credit card perfect type infrequent pur...,2,Pacpav,https://www.reddit.com/r/Netherlands/comments/...,N26,"[There, are, some, online, credit, cards, that..."
1,When catastrophe strikes: iDeal & PayPal not s...,"I use N26, they offer a free debit card from M...",use offer free debit card mastercard nearly be...,3,faabmaster,https://www.reddit.com/r/Netherlands/comments/...,N26,"[I, use, N26, ,, they, offer, a, free, debit, ..."
5,When catastrophe strikes: iDeal & PayPal not s...,"Either get a credit card from your bank, or si...",either get credit card bank sign free transfer...,92,DaHaunter,https://www.reddit.com/r/Netherlands/comments/...,N26,"[Either, get, a, credit, card, from, your, ban..."
6,When catastrophe strikes: iDeal & PayPal not s...,"i use n26 myself, never really had an issue so...",use never really issue far,2,Useful-Position-4445,https://www.reddit.com/r/Netherlands/comments/...,N26,"[i, use, n26, myself, ,, never, really, had, a..."
7,When catastrophe strikes: iDeal & PayPal not s...,I use N26. Only issue I have is that transferr...,use issue transfer money take bit time day wou...,2,MicrochippedByGates,https://www.reddit.com/r/Netherlands/comments/...,N26,"[I, use, N26, ., Only, issue, I, have, is, tha..."
...,...,...,...,...,...,...,...,...
7276,Transferring funds to America,Apps like transferwise generally let you save ...,app like transferwise generally let save inter...,9,NinjaInUnitard,https://www.reddit.com/r/Netherlands/comments/...,Revolut,"[Apps, like, transferwise, generally, let, you..."
7277,Transferring funds to America,Obviously you can transfer funds from NL to US...,obviously transfer fund nl guess want know fas...,3,rex-ac,https://www.reddit.com/r/Netherlands/comments/...,Revolut,"[Obviously, you, can, transfer, funds, from, N..."
7278,Transferring funds to America,I use US Forex to transfer funds from the US t...,use forex transfer fund check see work opposit...,2,aliblue225,https://www.reddit.com/r/Netherlands/comments/...,Revolut,"[I, use, US, Forex, to, transfer, funds, from,..."
7279,Transferring funds to America,I’ve used transferwise for the reverse directi...,I use transferwise reverse direction cheap use...,2,purple_pandaface,https://www.reddit.com/r/Netherlands/comments/...,Revolut,"[I, ’ve, used, transferwise, for, the, reverse..."


App Store

In [31]:
df_AppStore = pd.read_excel(r'C:/Users/.../Merged_AppStore_Reviews.xlsx')

In [32]:
### 1) Remove Non-English Reviews and Filter Short Reviews ###
def is_english(text):
    try:
        return detect(text) == "en"
    except:
        return False  # Handle errors for very short texts

df_AppStore = df_AppStore[df_AppStore["review_translated"].apply(is_english)]  # Keep only English reviews
df_AppStore = df_AppStore[df_AppStore["review_translated"].str.split().apply(len) >= 10]  # Keep reviews with ≥30 words

In [33]:
display(df_AppStore)

Unnamed: 0,date,userName,review,review_translated,rating,developerResponse,app_name
0,2020-04-15 13:31:45,olan.acc,valeriol7860 de code om 15 euro op de kaart te...,valeriol7860 the code to get 15 euros on the c...,5,,N26
2,2020-09-11 14:06:24,Virgie1234,Eigenlijk wilde ik N26 gebruiken als vaste las...,Actually I wanted to use N26 as a fixed costs ...,3,"{'id': 21541639, 'body': ""Hey, thank you for y...",N26
3,2018-02-19 19:45:25,meeljeme,Of you guys want to take off here in The Nethe...,Of you guys want to take off here in The Nethe...,4,,N26
4,2018-05-15 18:01:27,Rumbl3Shot,Help me! I want to verify my account but when ...,Help me! I want to verify my account but when ...,1,"{'id': 3541420, 'body': ""Hey! We're sorry abou...",N26
5,2023-02-07 03:57:37,DVA013,N26 and Revolut are the reason why more and mo...,N26 and Revolut are the reason why more and mo...,1,"{'id': 34968254, 'body': ""Hey! We'd love to le...",N26
...,...,...,...,...,...,...,...
2550,2020-02-11 21:00:56,user20120,Na update van vandaag. Kan ik niet neer inlog...,After today's update. I can't log in. Now I'm ...,1,,Revolut
2551,2020-02-08 17:59:13,vlad.82.82.82,Account locked 2 month ago. Any information wh...,Account locked 2 months ago. Any information w...,1,,Revolut
2552,2020-02-07 14:13:59,Unnamed Resource,"Fix this, there are no other ways to add money...","Fix this, there are no other ways to add money...",2,,Revolut
2553,2020-01-29 12:35:37,joelielie,The app has a better lay-out than N26 to be ho...,The app has a better layout than N26 to be hon...,5,,Revolut


In [34]:
# Define words to remove
words_to_remove = ["bunq", "n26", "revolut", "google", "mollie", "ing", "abn amro", "netherlands"]
# Create regex pattern for word boundaries and join with |
pattern = r'\b(?:' + '|'.join(words_to_remove) + r')\b'

# Remove the words (case-insensitive)
df_AppStore["review_translated"] = df_AppStore["review_translated"].str.replace(pattern, '', case=False, regex=True)

# Optional: Remove extra spaces after removal
df_AppStore["review_translated"] = df_AppStore["review_translated"].str.replace(r"\s+", " ", regex=True).str.strip()

In [35]:
### 2) Convert to Lowercase 
df_AppStore["review_translated"] = df_AppStore["review_translated"].str.lower()

In [36]:
### 3) Remove Punctuation, Extra Spaces, Numbers, Emojis, and Special Characters ###
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII (emojis, symbols)
    return text

df_AppStore["review_translated"] = df_AppStore["review_translated"].apply(clean_text)


In [37]:
### 4) Remove Common Stop Words ###
stop_words = set(stopwords.words("english"))
df_AppStore["review_translated"] = df_AppStore["review_translated"].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words]))

In [38]:
### 6) Tokenization 
df_AppStore["review_tokens"] = df_AppStore["review"].apply(lambda x: [token.text for token in nlp(x)])

In [39]:
### 7) Lemmatization 
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if token.lemma_ not in stop_words])

df_AppStore["review_translated"] = df_AppStore["review_translated"].apply(lemmatize_text)

In [40]:
display(df_AppStore)

Unnamed: 0,date,userName,review,review_translated,rating,developerResponse,app_name,review_tokens
0,2020-04-15 13:31:45,olan.acc,valeriol7860 de code om 15 euro op de kaart te...,valeriol code get euros card every transaction...,5,,N26,"[valeriol7860, de, code, om, 15, euro, op, de,..."
2,2020-09-11 14:06:24,Virgie1234,Eigenlijk wilde ik N26 gebruiken als vaste las...,actually want use fix cost account mortgage pr...,3,"{'id': 21541639, 'body': ""Hey, thank you for y...",N26,"[Eigenlijk, wilde, ik, N26, gebruiken, als, va..."
3,2018-02-19 19:45:25,meeljeme,Of you guys want to take off here in The Nethe...,guy want take need support dutch ideal payment...,4,,N26,"[Of, you, guys, want, to, take, off, here, in,..."
4,2018-05-15 18:01:27,Rumbl3Shot,Help me! I want to verify my account but when ...,help want verify account open camera snap card...,1,"{'id': 3541420, 'body': ""Hey! We're sorry abou...",N26,"[Help, me, !, I, want, to, verify, my, account..."
5,2023-02-07 03:57:37,DVA013,N26 and Revolut are the reason why more and mo...,reason people choose decentralized finance ban...,1,"{'id': 34968254, 'body': ""Hey! We'd love to le...",N26,"[N26, and, Revolut, are, the, reason, why, mor..."
...,...,...,...,...,...,...,...,...
2550,2020-02-11 21:00:56,user20120,Na update van vandaag. Kan ik niet neer inlog...,todays update log I force set access code phon...,1,,Revolut,"[Na, update, van, vandaag, ., , Kan, ik, niet..."
2551,2020-02-08 17:59:13,vlad.82.82.82,Account locked 2 month ago. Any information wh...,account lock month ago information unlocked su...,1,,Revolut,"[Account, locked, 2, month, ago, ., Any, infor..."
2552,2020-02-07 14:13:59,Unnamed Resource,"Fix this, there are no other ways to add money...",fix way add money credit card,2,,Revolut,"[Fix, this, ,, there, are, no, other, ways, to..."
2553,2020-01-29 12:35:37,joelielie,The app has a better lay-out than N26 to be ho...,app well layout honest customer service top no...,5,,Revolut,"[The, app, has, a, better, lay, -, out, than, ..."


Google play

In [80]:
#Importing file
df = pd.read_excel(r'C:/Users/.../Merged_GooglePlay_Review_cleaned.xlsx')

In [81]:
display(df)

Unnamed: 0,userName,content,review_translated,score,at,app_name,language
0,antonius oosterwaal,hello my n 26 always worked great but now it w...,Hello my N 26 always worked great but now it w...,1,2025-03-13 19:48:08,N26,nl
1,jan uit de bulten,"zeer slecht, in geval van fraude willen ze nie...","Very bad, in case of fraud they do not want to...",1,2025-02-28 07:07:42,N26,nl
2,ryudo 300 (ryudo300),compleet drama om te installeren. bij het gede...,Complete drama to install. During the part of ...,1,2025-02-26 21:27:51,N26,nl
3,roger van alphen,"snel, professioneel en goedkoop. uitstekende app","Fast, professional and cheap. Excellent app",5,2025-02-20 17:14:34,N26,nl
4,sammy koleilat eldelbi,good service,Good service,5,2025-02-16 21:00:19,N26,nl
...,...,...,...,...,...,...,...
100996,Een Google-gebruiker,"De rekening is gratis, alleen een pinpas kost ...","The account is free, only a debit card costs 9...",1,2015-11-25 16:49:25,Bunq,mixed
100997,Een Google-gebruiker,Lekker snel geïnstalleerd. Eindelijk een bank ...,Nice and quick installation. Finally a bank th...,5,2015-11-25 13:42:33,Bunq,mixed
100998,Een Google-gebruiker,"super handige app, lekker overzichtelijk ook","super handy app, nice and clear too",4,2015-11-25 12:55:37,Bunq,mixed
100999,Een Google-gebruiker,Mooie start bunqers! Registratie was appeltje-...,Great start bunqers! Registration was a piece ...,4,2015-11-25 12:16:11,Bunq,mixed


In [82]:
### 1) Remove Non-English Reviews and Filter Short Reviews ###
def is_english(text):
    try:
        return detect(text) == "en"
    except:
        return False  # Handle errors for very short texts

df = df[df["review_translated"].apply(is_english)]  # Keep only English reviews
df = df[df["review_translated"].str.split().apply(len) >= 10]  # Keep reviews with ≥30 words

In [83]:
display(df)

Unnamed: 0,userName,content,review_translated,score,at,app_name,language
0,antonius oosterwaal,hello my n 26 always worked great but now it w...,Hello my N 26 always worked great but now it w...,1,2025-03-13 19:48:08,N26,nl
1,jan uit de bulten,"zeer slecht, in geval van fraude willen ze nie...","Very bad, in case of fraud they do not want to...",1,2025-02-28 07:07:42,N26,nl
2,ryudo 300 (ryudo300),compleet drama om te installeren. bij het gede...,Complete drama to install. During the part of ...,1,2025-02-26 21:27:51,N26,nl
5,tijs past,"very bad, no contact possible, i sent all need...","very bad, no contact possible, i sent all need...",1,2025-02-16 14:15:35,N26,nl
7,leo kroonenburg,"new customer since january, really good so far...","New customer since January, really good so far...",5,2025-02-05 10:27:58,N26,nl
...,...,...,...,...,...,...,...
100995,Een Google-gebruiker,"Goeie uitleg, en het lijkt me zeer fijn om te ...","Good explanation, and I think it will be very ...",5,2015-11-25 19:16:25,Bunq,mixed
100996,Een Google-gebruiker,"De rekening is gratis, alleen een pinpas kost ...","The account is free, only a debit card costs 9...",1,2015-11-25 16:49:25,Bunq,mixed
100997,Een Google-gebruiker,Lekker snel geïnstalleerd. Eindelijk een bank ...,Nice and quick installation. Finally a bank th...,5,2015-11-25 13:42:33,Bunq,mixed
100999,Een Google-gebruiker,Mooie start bunqers! Registratie was appeltje-...,Great start bunqers! Registration was a piece ...,4,2015-11-25 12:16:11,Bunq,mixed


In [84]:
# Define words to remove
words_to_remove = ["bunq", "n26", "revolut", "google", "mollie", "ing", "abn amro", "netherlands"]
# Create regex pattern for word boundaries and join with |
pattern = r'\b(?:' + '|'.join(words_to_remove) + r')\b'

# Remove the words (case-insensitive)
df["review_translated"] = df["review_translated"].str.replace(pattern, '', case=False, regex=True)

# Optional: Remove extra spaces after removal
df["review_translated"] = df["review_translated"].str.replace(r"\s+", " ", regex=True).str.strip()

In [85]:
### 2) Convert to Lowercase
df["review_translated"] = df["review_translated"].str.lower()

In [86]:
### 3) Remove Punctuation, Extra Spaces, Numbers, Emojis, and Special Characters ###
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII (emojis, symbols)
    return text

df["review_translated"] = df["review_translated"].apply(clean_text)


In [87]:
### 4) Remove Common Stop Words 
stop_words = set(stopwords.words("english"))
df["review_translated"] = df["review_translated"].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words]))

In [88]:
### 6) Tokenization 
df["review_tokens"] = df["content"].apply(lambda x: [token.text for token in nlp(x)])

In [89]:
### 7) Lemmatization 
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if token.lemma_ not in stop_words])

df["review_translated"] = df["review_translated"].apply(lemmatize_text)

In [90]:
display(df)

Unnamed: 0,userName,content,review_translated,score,at,app_name,language,review_tokens
0,antonius oosterwaal,hello my n 26 always worked great but now it w...,hello n always work great open phone doiti sto...,1,2025-03-13 19:48:08,N26,nl,"[hello, my, n, 26, always, worked, great, but,..."
1,jan uit de bulten,"zeer slecht, in geval van fraude willen ze nie...",bad case fraud want help ask provide evidence ...,1,2025-02-28 07:07:42,N26,nl,"[zeer, slecht, ,, in, geval, van, fraude, will..."
2,ryudo 300 (ryudo300),compleet drama om te installeren. bij het gede...,complete drama install part take photo app kee...,1,2025-02-26 21:27:51,N26,nl,"[compleet, drama, om, te, installeren, ., bij,..."
5,tijs past,"very bad, no contact possible, i sent all need...",bad contact possible send need document never ...,1,2025-02-16 14:15:35,N26,nl,"[very, bad, ,, no, contact, possible, ,, i, se..."
7,leo kroonenburg,"new customer since january, really good so far...",new customer since january really good far sti...,5,2025-02-05 10:27:58,N26,nl,"[new, customer, since, january, ,, really, goo..."
...,...,...,...,...,...,...,...,...
100995,Een Google-gebruiker,"Goeie uitleg, en het lijkt me zeer fijn om te ...",good explanation think nice use thank ceo ali,5,2015-11-25 19:16:25,Bunq,mixed,"[Goeie, uitleg, ,, en, het, lijkt, me, zeer, f..."
100996,Een Google-gebruiker,"De rekening is gratis, alleen een pinpas kost ...",account free debit card cost euro per year cas...,1,2015-11-25 16:49:25,Bunq,mixed,"[De, rekening, is, gratis, ,, alleen, een, pin..."
100997,Een Google-gebruiker,Lekker snel geïnstalleerd. Eindelijk een bank ...,nice quick installation finally bank really pu...,5,2015-11-25 13:42:33,Bunq,mixed,"[Lekker, snel, geïnstalleerd, ., Eindelijk, ee..."
100999,Een Google-gebruiker,Mooie start bunqers! Registratie was appeltje-...,great start bunqer registration piece cake sup...,4,2015-11-25 12:16:11,Bunq,mixed,"[Mooie, start, bunqers, !, Registratie, was, a..."


# Sentiment Analysis

In [None]:
# =======================================
# Pre-Processing of all Datasets
# =======================================
# This section marks the beginning of the data cleaning and preparation pipeline.
# Pre-processing ensures raw text data is in a consistent and analyzable format for sentiment analysis.

In [91]:
pip install transformers




In [92]:
# === Install Required Libraries (for notebook environments) ===
# These are the core Python libraries imported for data preprocessing and sentiment analysis.
# Load necessary modules for sentiment analysis and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
plt.style.use('ggplot')
from scipy.special import softmax
from tqdm import tqdm
import torch
from scipy.special import softmax
from IPython.display import display
import nltk
import os

# Roberta model (Sentiment Analysis)

In [93]:
# Load Pre-trained RoBERTa Model for Sentiment
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
roberta_results = []

Google Play Store

In [94]:
# Initialize an empty list to store sentiment scores for each review
roberta_results = []

for review in tqdm(df['review_translated'], total=len(df)):
    # Ensuring the review is a string
    if not isinstance(review, str):
        # Converting non-string values to string
        review = str(review) if review is not None else ''
    
    # Tokenizing the text
    encoded_text = tokenizer(review, return_tensors='pt')
    with torch.no_grad():  
        output = model(**encoded_text)
    
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    # Store the sentiment probabilities (negative, neutral, positive) into a dictionary
    roberta_results.append({
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2]
    })

100%|██████████████████████████████████████████████████████████████████████████| 31093/31093 [1:35:36<00:00,  5.42it/s]


In [95]:
# Converting RoBERTa results to DataFrame
roberta_df = pd.DataFrame(roberta_results)
roberta_df

Unnamed: 0,roberta_neg,roberta_neu,roberta_pos
0,0.018334,0.197735,0.783931
1,0.592961,0.393666,0.013373
2,0.836563,0.153653,0.009784
3,0.789655,0.198438,0.011907
4,0.004365,0.079399,0.916236
...,...,...,...
31088,0.004399,0.046016,0.949586
31089,0.058850,0.877014,0.064135
31090,0.008843,0.086688,0.904469
31091,0.005239,0.033344,0.961417


In [96]:
merged_df_2 = pd.concat([df.reset_index(drop=True), roberta_df.reset_index(drop=True)], axis=1)

In [97]:
merged_df_2.head()

Unnamed: 0,userName,content,review_translated,score,at,app_name,language,review_tokens,roberta_neg,roberta_neu,roberta_pos
0,antonius oosterwaal,hello my n 26 always worked great but now it w...,hello n always work great open phone doiti sto...,1,2025-03-13 19:48:08,N26,nl,"[hello, my, n, 26, always, worked, great, but,...",0.018334,0.197735,0.783931
1,jan uit de bulten,"zeer slecht, in geval van fraude willen ze nie...",bad case fraud want help ask provide evidence ...,1,2025-02-28 07:07:42,N26,nl,"[zeer, slecht, ,, in, geval, van, fraude, will...",0.592961,0.393666,0.013373
2,ryudo 300 (ryudo300),compleet drama om te installeren. bij het gede...,complete drama install part take photo app kee...,1,2025-02-26 21:27:51,N26,nl,"[compleet, drama, om, te, installeren, ., bij,...",0.836563,0.153653,0.009784
3,tijs past,"very bad, no contact possible, i sent all need...",bad contact possible send need document never ...,1,2025-02-16 14:15:35,N26,nl,"[very, bad, ,, no, contact, possible, ,, i, se...",0.789655,0.198438,0.011907
4,leo kroonenburg,"new customer since january, really good so far...",new customer since january really good far sti...,5,2025-02-05 10:27:58,N26,nl,"[new, customer, since, january, ,, really, goo...",0.004365,0.079399,0.916236


In [98]:
display(merged_df_2)

Unnamed: 0,userName,content,review_translated,score,at,app_name,language,review_tokens,roberta_neg,roberta_neu,roberta_pos
0,antonius oosterwaal,hello my n 26 always worked great but now it w...,hello n always work great open phone doiti sto...,1,2025-03-13 19:48:08,N26,nl,"[hello, my, n, 26, always, worked, great, but,...",0.018334,0.197735,0.783931
1,jan uit de bulten,"zeer slecht, in geval van fraude willen ze nie...",bad case fraud want help ask provide evidence ...,1,2025-02-28 07:07:42,N26,nl,"[zeer, slecht, ,, in, geval, van, fraude, will...",0.592961,0.393666,0.013373
2,ryudo 300 (ryudo300),compleet drama om te installeren. bij het gede...,complete drama install part take photo app kee...,1,2025-02-26 21:27:51,N26,nl,"[compleet, drama, om, te, installeren, ., bij,...",0.836563,0.153653,0.009784
3,tijs past,"very bad, no contact possible, i sent all need...",bad contact possible send need document never ...,1,2025-02-16 14:15:35,N26,nl,"[very, bad, ,, no, contact, possible, ,, i, se...",0.789655,0.198438,0.011907
4,leo kroonenburg,"new customer since january, really good so far...",new customer since january really good far sti...,5,2025-02-05 10:27:58,N26,nl,"[new, customer, since, january, ,, really, goo...",0.004365,0.079399,0.916236
...,...,...,...,...,...,...,...,...,...,...,...
31088,Een Google-gebruiker,"Goeie uitleg, en het lijkt me zeer fijn om te ...",good explanation think nice use thank ceo ali,5,2015-11-25 19:16:25,Bunq,mixed,"[Goeie, uitleg, ,, en, het, lijkt, me, zeer, f...",0.004399,0.046016,0.949586
31089,Een Google-gebruiker,"De rekening is gratis, alleen een pinpas kost ...",account free debit card cost euro per year cas...,1,2015-11-25 16:49:25,Bunq,mixed,"[De, rekening, is, gratis, ,, alleen, een, pin...",0.058850,0.877014,0.064135
31090,Een Google-gebruiker,Lekker snel geïnstalleerd. Eindelijk een bank ...,nice quick installation finally bank really pu...,5,2015-11-25 13:42:33,Bunq,mixed,"[Lekker, snel, geïnstalleerd, ., Eindelijk, ee...",0.008843,0.086688,0.904469
31091,Een Google-gebruiker,Mooie start bunqers! Registratie was appeltje-...,great start bunqer registration piece cake sup...,4,2015-11-25 12:16:11,Bunq,mixed,"[Mooie, start, bunqers, !, Registratie, was, a...",0.005239,0.033344,0.961417


In [99]:
# Calculate compound
def calculate_roberta_compound(row):
    return row['roberta_pos'] - row['roberta_neg']

# Apply the function
merged_df_2['compound'] = merged_df_2.apply(calculate_roberta_compound, axis=1)

# rename columns
merged_df_2 = merged_df_2.rename(columns={'roberta_neg': 'neg', 'roberta_pos': 'pos'})

# select only the relevant columns
df_selected_2 = merged_df_2[['review_translated', 'score', 'neg', 'pos', 'compound']]

# Display the result
display(df_selected_2)


Unnamed: 0,review_translated,score,neg,pos,compound
0,hello n always work great open phone doiti sto...,1,0.018334,0.783931,0.765597
1,bad case fraud want help ask provide evidence ...,1,0.592961,0.013373,-0.579587
2,complete drama install part take photo app kee...,1,0.836563,0.009784,-0.826779
3,bad contact possible send need document never ...,1,0.789655,0.011907,-0.777749
4,new customer since january really good far sti...,5,0.004365,0.916236,0.911870
...,...,...,...,...,...
31088,good explanation think nice use thank ceo ali,5,0.004399,0.949586,0.945187
31089,account free debit card cost euro per year cas...,1,0.058850,0.064135,0.005285
31090,nice quick installation finally bank really pu...,5,0.008843,0.904469,0.895627
31091,great start bunqer registration piece cake sup...,4,0.005239,0.961417,0.956178


In [100]:
# Add sentiment label column: only positive or negative
df_selected_2 = df_selected_2.copy()
df_selected_2['sentiment_label'] = df_selected_2['compound'].apply(
    lambda x: 'positive' if x >= 0 else 'negative'
)

# Display the updated DataFrame
display(df_selected_2)



Unnamed: 0,review_translated,score,neg,pos,compound,sentiment_label
0,hello n always work great open phone doiti sto...,1,0.018334,0.783931,0.765597,positive
1,bad case fraud want help ask provide evidence ...,1,0.592961,0.013373,-0.579587,negative
2,complete drama install part take photo app kee...,1,0.836563,0.009784,-0.826779,negative
3,bad contact possible send need document never ...,1,0.789655,0.011907,-0.777749,negative
4,new customer since january really good far sti...,5,0.004365,0.916236,0.911870,positive
...,...,...,...,...,...,...
31088,good explanation think nice use thank ceo ali,5,0.004399,0.949586,0.945187,positive
31089,account free debit card cost euro per year cas...,1,0.058850,0.064135,0.005285,positive
31090,nice quick installation finally bank really pu...,5,0.008843,0.904469,0.895627,positive
31091,great start bunqer registration piece cake sup...,4,0.005239,0.961417,0.956178,positive


AppStore

In [101]:
display(df_AppStore)

Unnamed: 0,date,userName,review,review_translated,rating,developerResponse,app_name,review_tokens
0,2020-04-15 13:31:45,olan.acc,valeriol7860 de code om 15 euro op de kaart te...,valeriol code get euros card every transaction...,5,,N26,"[valeriol7860, de, code, om, 15, euro, op, de,..."
2,2020-09-11 14:06:24,Virgie1234,Eigenlijk wilde ik N26 gebruiken als vaste las...,actually want use fix cost account mortgage pr...,3,"{'id': 21541639, 'body': ""Hey, thank you for y...",N26,"[Eigenlijk, wilde, ik, N26, gebruiken, als, va..."
3,2018-02-19 19:45:25,meeljeme,Of you guys want to take off here in The Nethe...,guy want take need support dutch ideal payment...,4,,N26,"[Of, you, guys, want, to, take, off, here, in,..."
4,2018-05-15 18:01:27,Rumbl3Shot,Help me! I want to verify my account but when ...,help want verify account open camera snap card...,1,"{'id': 3541420, 'body': ""Hey! We're sorry abou...",N26,"[Help, me, !, I, want, to, verify, my, account..."
5,2023-02-07 03:57:37,DVA013,N26 and Revolut are the reason why more and mo...,reason people choose decentralized finance ban...,1,"{'id': 34968254, 'body': ""Hey! We'd love to le...",N26,"[N26, and, Revolut, are, the, reason, why, mor..."
...,...,...,...,...,...,...,...,...
2550,2020-02-11 21:00:56,user20120,Na update van vandaag. Kan ik niet neer inlog...,todays update log I force set access code phon...,1,,Revolut,"[Na, update, van, vandaag, ., , Kan, ik, niet..."
2551,2020-02-08 17:59:13,vlad.82.82.82,Account locked 2 month ago. Any information wh...,account lock month ago information unlocked su...,1,,Revolut,"[Account, locked, 2, month, ago, ., Any, infor..."
2552,2020-02-07 14:13:59,Unnamed Resource,"Fix this, there are no other ways to add money...",fix way add money credit card,2,,Revolut,"[Fix, this, ,, there, are, no, other, ways, to..."
2553,2020-01-29 12:35:37,joelielie,The app has a better lay-out than N26 to be ho...,app well layout honest customer service top no...,5,,Revolut,"[The, app, has, a, better, lay, -, out, than, ..."


In [102]:
roberta_results_AppStore = []

for review in tqdm(df_AppStore['review_translated'], total=len(df_AppStore)):
    if not isinstance(review, str):
        review = str(review) if review is not None else ''
    
    encoded_text = tokenizer(review, return_tensors='pt')
    with torch.no_grad():
        output = model(**encoded_text)
    
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    roberta_results_AppStore.append({
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2]
    })


100%|██████████████████████████████████████████████████████████████████████████████| 1997/1997 [05:58<00:00,  5.57it/s]


In [103]:
# convert to DataFrame
roberta_df_AppStore = pd.DataFrame(roberta_results_AppStore)
display(roberta_df_AppStore)

Unnamed: 0,roberta_neg,roberta_neu,roberta_pos
0,0.004308,0.053282,0.942410
1,0.106861,0.682844,0.210295
2,0.010568,0.250077,0.739355
3,0.456006,0.509097,0.034896
4,0.048452,0.800474,0.151075
...,...,...,...
1992,0.043151,0.923262,0.033588
1993,0.669514,0.317507,0.012980
1994,0.157278,0.764027,0.078695
1995,0.014470,0.118039,0.867491


In [104]:
# Merging RoBERTa results with original Dataset
merged_df_AppStore = pd.concat([df_AppStore.reset_index(drop=True), roberta_df_AppStore.reset_index(drop=True)], axis=1)

In [105]:
merged_df_AppStore.head()

Unnamed: 0,date,userName,review,review_translated,rating,developerResponse,app_name,review_tokens,roberta_neg,roberta_neu,roberta_pos
0,2020-04-15 13:31:45,olan.acc,valeriol7860 de code om 15 euro op de kaart te...,valeriol code get euros card every transaction...,5,,N26,"[valeriol7860, de, code, om, 15, euro, op, de,...",0.004308,0.053282,0.94241
1,2020-09-11 14:06:24,Virgie1234,Eigenlijk wilde ik N26 gebruiken als vaste las...,actually want use fix cost account mortgage pr...,3,"{'id': 21541639, 'body': ""Hey, thank you for y...",N26,"[Eigenlijk, wilde, ik, N26, gebruiken, als, va...",0.106861,0.682844,0.210295
2,2018-02-19 19:45:25,meeljeme,Of you guys want to take off here in The Nethe...,guy want take need support dutch ideal payment...,4,,N26,"[Of, you, guys, want, to, take, off, here, in,...",0.010568,0.250077,0.739355
3,2018-05-15 18:01:27,Rumbl3Shot,Help me! I want to verify my account but when ...,help want verify account open camera snap card...,1,"{'id': 3541420, 'body': ""Hey! We're sorry abou...",N26,"[Help, me, !, I, want, to, verify, my, account...",0.456006,0.509097,0.034896
4,2023-02-07 03:57:37,DVA013,N26 and Revolut are the reason why more and mo...,reason people choose decentralized finance ban...,1,"{'id': 34968254, 'body': ""Hey! We'd love to le...",N26,"[N26, and, Revolut, are, the, reason, why, mor...",0.048452,0.800474,0.151075


In [106]:
# Function to calculate compound score
def calculate_roberta_compound(row):
    return row['roberta_pos'] - row['roberta_neg']

In [107]:
# Apply the function to create the compound score column
merged_df_AppStore['compound'] = merged_df_AppStore.apply(calculate_roberta_compound, axis=1)

# Rename columns for consistency with requested format
merged_df_AppStore = merged_df_AppStore.rename(columns={'roberta_neg': 'neg', 'roberta_pos': 'pos'})

# Select only the required columns
df_selected_AppStore = merged_df_AppStore[['review_translated', 'rating', 'neg', 'pos', 'compound']]

# Display the DataFrame with selected columns
display(df_selected_AppStore)

Unnamed: 0,review_translated,rating,neg,pos,compound
0,valeriol code get euros card every transaction...,5,0.004308,0.942410,0.938102
1,actually want use fix cost account mortgage pr...,3,0.106861,0.210295,0.103435
2,guy want take need support dutch ideal payment...,4,0.010568,0.739355,0.728786
3,help want verify account open camera snap card...,1,0.456006,0.034896,-0.421110
4,reason people choose decentralized finance ban...,1,0.048452,0.151075,0.102623
...,...,...,...,...,...
1992,todays update log I force set access code phon...,1,0.043151,0.033588,-0.009563
1993,account lock month ago information unlocked su...,1,0.669514,0.012980,-0.656534
1994,fix way add money credit card,2,0.157278,0.078695,-0.078583
1995,app well layout honest customer service top no...,5,0.014470,0.867491,0.853021


In [108]:
# Make sure you're working with a copy to avoid warnings
df_selected_AppStore = df_selected_AppStore.copy()

# Add sentiment label column: only positive or negative
df_selected_AppStore['sentiment_label'] = df_selected_AppStore['compound'].apply(
    lambda x: 'positive' if x >= 0 else 'negative'
)

# Display the updated DataFrame
display(df_selected_AppStore)


Unnamed: 0,review_translated,rating,neg,pos,compound,sentiment_label
0,valeriol code get euros card every transaction...,5,0.004308,0.942410,0.938102,positive
1,actually want use fix cost account mortgage pr...,3,0.106861,0.210295,0.103435,positive
2,guy want take need support dutch ideal payment...,4,0.010568,0.739355,0.728786,positive
3,help want verify account open camera snap card...,1,0.456006,0.034896,-0.421110,negative
4,reason people choose decentralized finance ban...,1,0.048452,0.151075,0.102623,positive
...,...,...,...,...,...,...
1992,todays update log I force set access code phon...,1,0.043151,0.033588,-0.009563,negative
1993,account lock month ago information unlocked su...,1,0.669514,0.012980,-0.656534,negative
1994,fix way add money credit card,2,0.157278,0.078695,-0.078583,negative
1995,app well layout honest customer service top no...,5,0.014470,0.867491,0.853021,positive


Reddit

In [109]:
roberta_results_Reddit = []

for review in tqdm(df_Reddit['review_translated'], total=len(df_Reddit)):
    # Ensuring the review is a string
    if not isinstance(review, str):
        # Converting non-string values to string
        review = str(review) if review is not None else ''
    
    # Tokenizing the text
    encoded_text = tokenizer(review, return_tensors='pt')
    with torch.no_grad():  
        output = model(**encoded_text)
    
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    
    roberta_results_Reddit.append({
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2]
    })

100%|██████████████████████████████████████████████████████████████████████████████| 5638/5638 [17:32<00:00,  5.36it/s]


In [110]:
# Converting RoBERTa results to DataFrame
roberta_df_Reddit = pd.DataFrame(roberta_results_Reddit)
roberta_df_Reddit

Unnamed: 0,roberta_neg,roberta_neu,roberta_pos
0,0.008064,0.396560,0.595376
1,0.004009,0.288746,0.707245
2,0.028936,0.839906,0.131158
3,0.230623,0.701298,0.068078
4,0.013187,0.108549,0.878264
...,...,...,...
5633,0.027489,0.769657,0.202853
5634,0.014738,0.543055,0.442206
5635,0.036221,0.644896,0.318883
5636,0.016112,0.818925,0.164964


In [111]:
# Merging RoBERTa results with original Dataset
merged_df_Reddit = pd.concat([df_Reddit.reset_index(drop=True), roberta_df_Reddit.reset_index(drop=True)], axis=1)

In [112]:
merged_df_Reddit.head()

Unnamed: 0,post_title,comment,review_translated,score,author,permalink,app_name,review_tokens,roberta_neg,roberta_neu,roberta_pos
0,When catastrophe strikes: iDeal & PayPal not s...,There are some online credit cards that are pe...,online credit card perfect type infrequent pur...,2,Pacpav,https://www.reddit.com/r/Netherlands/comments/...,N26,"[There, are, some, online, credit, cards, that...",0.008064,0.39656,0.595376
1,When catastrophe strikes: iDeal & PayPal not s...,"I use N26, they offer a free debit card from M...",use offer free debit card mastercard nearly be...,3,faabmaster,https://www.reddit.com/r/Netherlands/comments/...,N26,"[I, use, N26, ,, they, offer, a, free, debit, ...",0.004009,0.288746,0.707245
2,When catastrophe strikes: iDeal & PayPal not s...,"Either get a credit card from your bank, or si...",either get credit card bank sign free transfer...,92,DaHaunter,https://www.reddit.com/r/Netherlands/comments/...,N26,"[Either, get, a, credit, card, from, your, ban...",0.028936,0.839906,0.131158
3,When catastrophe strikes: iDeal & PayPal not s...,"i use n26 myself, never really had an issue so...",use never really issue far,2,Useful-Position-4445,https://www.reddit.com/r/Netherlands/comments/...,N26,"[i, use, n26, myself, ,, never, really, had, a...",0.230623,0.701298,0.068078
4,When catastrophe strikes: iDeal & PayPal not s...,I use N26. Only issue I have is that transferr...,use issue transfer money take bit time day wou...,2,MicrochippedByGates,https://www.reddit.com/r/Netherlands/comments/...,N26,"[I, use, N26, ., Only, issue, I, have, is, tha...",0.013187,0.108549,0.878264


In [113]:
# Function to calculate compound score
def calculate_roberta_compound(row):
    return row['roberta_pos'] - row['roberta_neg']

In [114]:
# Apply the function to create the compound score column
merged_df_Reddit['compound'] = merged_df_Reddit.apply(calculate_roberta_compound, axis=1)

# Rename columns for consistency with requested format
merged_df_Reddit = merged_df_Reddit.rename(columns={'roberta_neg': 'neg', 'roberta_pos': 'pos'})

# Select only the required columns
df_selected_Reddit = merged_df_Reddit[['review_translated', 'neg', 'pos', 'compound']]

# Display the DataFrame with selected columns
display(df_selected_Reddit)

Unnamed: 0,review_translated,neg,pos,compound
0,online credit card perfect type infrequent pur...,0.008064,0.595376,0.587312
1,use offer free debit card mastercard nearly be...,0.004009,0.707245,0.703236
2,either get credit card bank sign free transfer...,0.028936,0.131158,0.102221
3,use never really issue far,0.230623,0.068078,-0.162545
4,use issue transfer money take bit time day wou...,0.013187,0.878264,0.865077
...,...,...,...,...
5633,app like transferwise generally let save inter...,0.027489,0.202853,0.175364
5634,obviously transfer fund nl guess want know fas...,0.014738,0.442206,0.427468
5635,use forex transfer fund check see work opposit...,0.036221,0.318883,0.282662
5636,I use transferwise reverse direction cheap use...,0.016112,0.164964,0.148852


In [115]:
# Make sure you're working with a copy to avoid warnings
df_selected_Reddit = df_selected_Reddit.copy()

# Add sentiment label column: only positive or negative
df_selected_Reddit['sentiment_label'] = df_selected_Reddit['compound'].apply(
    lambda x: 'positive' if x >= 0 else 'negative'
)

# Display the updated DataFrame
display(df_selected_Reddit)

Unnamed: 0,review_translated,neg,pos,compound,sentiment_label
0,online credit card perfect type infrequent pur...,0.008064,0.595376,0.587312,positive
1,use offer free debit card mastercard nearly be...,0.004009,0.707245,0.703236,positive
2,either get credit card bank sign free transfer...,0.028936,0.131158,0.102221,positive
3,use never really issue far,0.230623,0.068078,-0.162545,negative
4,use issue transfer money take bit time day wou...,0.013187,0.878264,0.865077,positive
...,...,...,...,...,...
5633,app like transferwise generally let save inter...,0.027489,0.202853,0.175364,positive
5634,obviously transfer fund nl guess want know fas...,0.014738,0.442206,0.427468,positive
5635,use forex transfer fund check see work opposit...,0.036221,0.318883,0.282662,positive
5636,I use transferwise reverse direction cheap use...,0.016112,0.164964,0.148852,positive


Knab

In [116]:
# Store results here
roberta_results_knab = []

# Loop through each review
for review in tqdm(df_knab_reviews['review_translated'], total=len(df_knab_reviews)):
    if not isinstance(review, str):
        review = str(review) if review is not None else ''
    
    encoded_text = tokenizer(review, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        output = model(**encoded_text)
    
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    roberta_results_knab.append({
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2]
    })


100%|██████████████████████████████████████████████████████████████████████████████| 1744/1744 [03:55<00:00,  7.41it/s]


In [117]:
# Converting RoBERTa results to DataFrame
roberta_results_knab = pd.DataFrame(roberta_results_knab)
roberta_results_knab

Unnamed: 0,roberta_neg,roberta_neu,roberta_pos
0,0.285222,0.664464,0.050315
1,0.299579,0.556666,0.143755
2,0.091356,0.602384,0.306260
3,0.123865,0.809424,0.066711
4,0.479393,0.485736,0.034871
...,...,...,...
1739,0.152290,0.637719,0.209991
1740,0.782278,0.206731,0.010991
1741,0.072449,0.636687,0.290864
1742,0.547671,0.414486,0.037843


In [118]:
# Merging RoBERTa results with original Dataset
merged_df_knab = pd.concat([df_knab_reviews.reset_index(drop=True), roberta_results_knab.reset_index(drop=True)], axis=1)

In [119]:
# Function to calculate compound score
def calculate_roberta_compound(row):
    return row['roberta_pos'] - row['roberta_neg']

In [120]:
# Apply the function to create the compound score column
merged_df_knab['compound'] = merged_df_knab.apply(calculate_roberta_compound, axis=1)

# Rename columns for consistency with requested format
merged_df_knab = merged_df_knab.rename(columns={'roberta_neg': 'neg', 'roberta_pos': 'pos'})

# Select only the required columns
df_selected_knab = merged_df_knab[['review_translated', 'neg', 'pos', 'compound']]

# Display the DataFrame with selected columns
display(df_selected_knab)

Unnamed: 0,review_translated,neg,pos,compound
0,app support contactless payment must go pay ra...,0.285222,0.050315,-0.234907
1,recommend many update hassle little possible w...,0.299579,0.143755,-0.155824
2,bad acceptance dutch legal form would make wel...,0.091356,0.306260,0.214904
3,customer aegon transfer scan everything still get,0.123865,0.066711,-0.057154
4,work every weekend saturday night sunday night...,0.479393,0.034871,-0.444522
...,...,...,...,...
1739,installing say save plenty room app instal wit...,0.152290,0.209991,0.057700
1740,install fail install usb storage sd card app p...,0.782278,0.010991,-0.771287
1741,good reliable plenty function overload app acc...,0.072449,0.290864,0.218415
1742,need phone connection active work wifi mode st...,0.547671,0.037843,-0.509828


In [121]:
# Make sure you're working with a copy to avoid warnings
df_selected_knab = df_selected_knab.copy()

# Add sentiment label column: only positive or negative
df_selected_knab['sentiment_label'] = df_selected_knab['compound'].apply(
    lambda x: 'positive' if x >= 0 else 'negative'
)

# Display the updated DataFrame
display(df_selected_knab)

Unnamed: 0,review_translated,neg,pos,compound,sentiment_label
0,app support contactless payment must go pay ra...,0.285222,0.050315,-0.234907,negative
1,recommend many update hassle little possible w...,0.299579,0.143755,-0.155824,negative
2,bad acceptance dutch legal form would make wel...,0.091356,0.306260,0.214904,positive
3,customer aegon transfer scan everything still get,0.123865,0.066711,-0.057154,negative
4,work every weekend saturday night sunday night...,0.479393,0.034871,-0.444522,negative
...,...,...,...,...,...
1739,installing say save plenty room app instal wit...,0.152290,0.209991,0.057700,positive
1740,install fail install usb storage sd card app p...,0.782278,0.010991,-0.771287,negative
1741,good reliable plenty function overload app acc...,0.072449,0.290864,0.218415,positive
1742,need phone connection active work wifi mode st...,0.547671,0.037843,-0.509828,negative


Extra Reddit

In [122]:
# Store results here
roberta_extra_reddit = []

# Loop through each review
for review in tqdm(df_Merged_extra_Reddit_Reviews[' comment'], total=len(df_Merged_extra_Reddit_Reviews)):
    if not isinstance(review, str):
        review = str(review) if review is not None else ''
    
    encoded_text = tokenizer(review, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        output = model(**encoded_text)
    
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    roberta_extra_reddit.append({
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2]
    })


100%|██████████████████████████████████████████████████████████████████████████████| 1164/1164 [02:37<00:00,  7.37it/s]


In [123]:
# Converting RoBERTa results to DataFrame
roberta_extra_reddit = pd.DataFrame(roberta_extra_reddit)
roberta_extra_reddit

Unnamed: 0,roberta_neg,roberta_neu,roberta_pos
0,0.136019,0.814478,0.049503
1,0.006998,0.188941,0.804061
2,0.020025,0.406317,0.573657
3,0.030967,0.424845,0.544188
4,0.319773,0.607174,0.073053
...,...,...,...
1159,0.056071,0.856287,0.087642
1160,0.054435,0.776239,0.169326
1161,0.004969,0.142675,0.852356
1162,0.006711,0.070999,0.922290


In [124]:
# Merging RoBERTa results with original Dataset
df_Merged_extra_Reddit_Reviews = pd.concat([df_Merged_extra_Reddit_Reviews.reset_index(drop=True), roberta_extra_reddit.reset_index(drop=True)], axis=1)

In [125]:
# Function to calculate compound score
def calculate_roberta_compound(row):
    return row['roberta_pos'] - row['roberta_neg']

In [126]:
# Apply the function to create the compound score column
df_Merged_extra_Reddit_Reviews['compound'] = df_Merged_extra_Reddit_Reviews.apply(calculate_roberta_compound, axis=1)

# Rename columns for consistency with requested format
df_Merged_extra_Reddit_Reviews = df_Merged_extra_Reddit_Reviews.rename(columns={'roberta_neg': 'neg', 'roberta_pos': 'pos'})

# Select only the required columns
df_selected_ex_reddit = df_Merged_extra_Reddit_Reviews[[' comment', 'neg', 'pos', 'compound']]

# Display the DataFrame with selected columns
display(df_selected_ex_reddit)

Unnamed: 0,comment,neg,pos,compound
0,digital free dump purchase amount card pay,0.136019,0.049503,-0.086517
1,often choose mastercsrdvisa payment screen mat...,0.006998,0.804061,0.797063
2,offer free debit card actually use possible in...,0.020025,0.573657,0.553632
3,free bank account germany free debit mastercar...,0.030967,0.544188,0.513222
4,account german bank apply virtual credit card ...,0.319773,0.073053,-0.246719
...,...,...,...,...
1159,pay fix amount unlimited number account card,0.056071,0.087642,0.031571
1160,affordable pay check account also open many ch...,0.054435,0.169326,0.114891
1161,would stick one big bank always good rabo sns ...,0.004969,0.852356,0.847387
1162,also go excellent service good interest rate,0.006711,0.922290,0.915579


In [127]:
# Make sure you're working with a copy to avoid warnings
df_selected_ex_reddit = df_selected_ex_reddit.copy()

# Add sentiment label column: only positive or negative
df_selected_ex_reddit['sentiment_label'] = df_selected_ex_reddit['compound'].apply(
    lambda x: 'positive' if x >= 0 else 'negative'
)

# Display the updated DataFrame
display(df_selected_knab)

Unnamed: 0,review_translated,neg,pos,compound,sentiment_label
0,app support contactless payment must go pay ra...,0.285222,0.050315,-0.234907,negative
1,recommend many update hassle little possible w...,0.299579,0.143755,-0.155824,negative
2,bad acceptance dutch legal form would make wel...,0.091356,0.306260,0.214904,positive
3,customer aegon transfer scan everything still get,0.123865,0.066711,-0.057154,negative
4,work every weekend saturday night sunday night...,0.479393,0.034871,-0.444522,negative
...,...,...,...,...,...
1739,installing say save plenty room app instal wit...,0.152290,0.209991,0.057700,positive
1740,install fail install usb storage sd card app p...,0.782278,0.010991,-0.771287,negative
1741,good reliable plenty function overload app acc...,0.072449,0.290864,0.218415,positive
1742,need phone connection active work wifi mode st...,0.547671,0.037843,-0.509828,negative


In [128]:
df_df_selected_ex_reddit.rename(columns={" comment": "review_translated"}, inplace=True)
display(df_selected_ex_reddit)

NameError: name 'df_df_selected_ex_reddit' is not defined

Merge tables:

In [129]:
#Combine all three datasets
df_all = pd.concat([df_selected_2, df_selected_AppStore, df_selected_Reddit, df_selected_ex_reddit, df_selected_knab], ignore_index=True)

In [130]:
display(df_all)

Unnamed: 0,review_translated,score,neg,pos,compound,sentiment_label,rating,comment
0,hello n always work great open phone doiti sto...,1,0.018334,0.783931,0.765597,positive,,
1,bad case fraud want help ask provide evidence ...,1,0.592961,0.013373,-0.579587,negative,,
2,complete drama install part take photo app kee...,1,0.836563,0.009784,-0.826779,negative,,
3,bad contact possible send need document never ...,1,0.789655,0.011907,-0.777749,negative,,
4,new customer since january really good far sti...,5,0.004365,0.916236,0.911870,positive,,
...,...,...,...,...,...,...,...,...
41631,installing say save plenty room app instal wit...,,0.152290,0.209991,0.057700,positive,,
41632,install fail install usb storage sd card app p...,,0.782278,0.010991,-0.771287,negative,,
41633,good reliable plenty function overload app acc...,,0.072449,0.290864,0.218415,positive,,
41634,need phone connection active work wifi mode st...,,0.547671,0.037843,-0.509828,negative,,


In [85]:
# Save to CSV
df_all_2.to_csv("all_Public_reviews_sentiment_2.csv", index=False)

In [86]:
# Define your folder path (change this to your desired folder)
folder_path_Public = r"C:\Users\nklom\OneDrive\Pictures\Documents\Master\Thesis\Data"  # For Windows
# folder_path = "/Users/YourName/Documents/SentimentResults"  # For Mac/Linux

# Make sure the folder exists (optional, creates it if it doesn't)
os.makedirs(folder_path_Public, exist_ok=True)

# Define full file path
file_path_Public = os.path.join(folder_path_Public, "all_Public_reviews_sentiment_2.csv")

# Save the DataFrame to the specified location
df_all_2.to_csv(file_path_Public, index=False)

print(f"File saved to: {file_path_Public}")


File saved to: C:\Users\nklom\OneDrive\Pictures\Documents\Master\Thesis\Data\all_Public_reviews_sentiment_2.csv


# Visualize results Roberta

In [117]:
# Combine all three into one DataFrame
df_merged_ALL = pd.concat(
    [df_selected_AppStore, df_selected_2, df_selected_Reddit],
    axis=0,           # Stack rows (not columns)
    ignore_index=True # Reset index to avoid duplication
)

# Optional: preview result
print(df_merged_ALL.shape)
df_merged_ALL.head()


(30638, 7)


Unnamed: 0,review_translated,rating,neg,pos,compound,sentiment_label,score
0,valeriol code get euros card every transaction...,5.0,0.004194,0.941279,0.937084,positive,
1,actually want use n fix cost account mortgage ...,3.0,0.097076,0.203502,0.106427,positive,
2,guy want take netherlands need support dutch i...,4.0,0.01129,0.723509,0.71222,positive,
3,help want verify account open camera snap card...,1.0,0.456006,0.034896,-0.42111,negative,
4,n revolut reason people choose decentralized f...,1.0,0.069676,0.09269,0.023014,positive,


In [119]:
merged_df.columns

Index(['date', 'userName', 'review', 'rating_x', 'developerResponse',
       'review_translated_x', 'Sentiment_Score', 'negV', 'neuV', 'posV',
       'compoundV', 'neg_x', 'roberta_neu', 'pos_x', 'roberta_compound',
       'roberta_sentiment_dict', 'compound_x', 'review_translated_y',
       'rating_y', 'neg_y', 'pos_y', 'compound_y'],
      dtype='object')