In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0  # make detection deterministic

## Reddit Data

In [18]:
reddit_df = pd.read_csv('../data/reddit_crypto_data.csv')
reddit_df

Unnamed: 0,id,title,text,score,created_utc,author,num_comments,subreddit,permalink,type,platform,sentiment,parent_id
0,n9cby0,Not every new coin is a shitcoin: How to spot ...,"A few days ago, I made a post titled *""Rugpull...",20345,2021-05-11 02:29:24,hazelvelvet,2467.0,CryptoCurrency,https://reddit.com/r/CryptoCurrency/comments/n...,submission,binance,positive,
1,gxomc6o,,Great post. I have a few questions. \n\n1) How...,12,2021-05-11 10:15:20,hoti0101,,CryptoCurrency,https://reddit.com/r/CryptoCurrency/comments/n...,comment,binance,positive,t3_n9cby0
2,gxnksae,,> Start trading on the BSC (Binance) Smart Cha...,59,2021-05-11 05:05:51,fakesteez,,CryptoCurrency,https://reddit.com/r/CryptoCurrency/comments/n...,comment,binance,positive,t3_n9cby0
3,gxsnol3,,"okay so i bought into 3 coins, around $10 tota...",3,2021-05-12 08:13:54,still_alive11,,CryptoCurrency,https://reddit.com/r/CryptoCurrency/comments/n...,comment,binance,positive,t3_n9cby0
4,gxnazsz,,Isn’t the minimum purchase set to $15 on binance?,7,2021-05-11 03:55:45,BadAssPleb,,CryptoCurrency,https://reddit.com/r/CryptoCurrency/comments/n...,comment,binance,neutral,t3_n9cby0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40523,dsx2lhp,,in my experience the Airbnbs were never that b...,2,2018-01-19 21:42:20,WintheGym,,Ethereum,https://reddit.com/r/ethereum/comments/7rgomk/...,comment,binance,negative,t1_dsx2a7l
40524,dsxh96e,,"""what is stopping AirBnB from integrating it, ...",3,2018-01-20 01:58:00,cat-gun,,Ethereum,https://reddit.com/r/ethereum/comments/7rgomk/...,comment,binance,positive,t1_dsx773j
40525,dsxpept,,"There's much more to it than accepting crypto,...",2,2018-01-20 04:11:02,softestcore,,Ethereum,https://reddit.com/r/ethereum/comments/7rgomk/...,comment,binance,positive,t1_dsxmaqn
40526,dsyo6mm,,Are these some of the same reasons hosts might...,2,2018-01-20 18:10:51,CyrilsJungleHat,,Ethereum,https://reddit.com/r/ethereum/comments/7rgomk/...,comment,binance,neutral,t1_dsxh96e


In [19]:
# Lets see the unique values in the column 'type'
print(reddit_df['type'].unique())

['submission' 'comment']


In [20]:
# Filter if necessary, on second thought I think its good to keep submissions.
# Submissions are generally more insightful with more information.
# df = df[df['type'] == 'comment']

# Drop rows with missing text of sentiment values
reddit_df = reddit_df.dropna(subset=['text', 'sentiment'])

# Combine title and text if both exist
reddit_df['text'] = reddit_df.apply(
    lambda row: (str(row['title']) + " " + str(row['text']))
    if pd.notna(row['title']) and str(row['title']).strip() != str(row['text']).strip()
    else str(row['text']),
    axis=1
)

# Next lets drop unnecessary columns
reddit_df = reddit_df.drop(columns=['title', 'id', 'author', 'num_comments', 'permalink', 'type', 'parent_id'])

# Drop duplicates
reddit_df = reddit_df.drop_duplicates()
reddit_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reddit_df['text'] = reddit_df.apply(


Unnamed: 0,text,score,created_utc,subreddit,platform,sentiment
0,Not every new coin is a shitcoin: How to spot ...,20345,2021-05-11 02:29:24,CryptoCurrency,binance,positive
1,Great post. I have a few questions. \n\n1) How...,12,2021-05-11 10:15:20,CryptoCurrency,binance,positive
2,> Start trading on the BSC (Binance) Smart Cha...,59,2021-05-11 05:05:51,CryptoCurrency,binance,positive
3,"okay so i bought into 3 coins, around $10 tota...",3,2021-05-12 08:13:54,CryptoCurrency,binance,positive
4,Isn’t the minimum purchase set to $15 on binance?,7,2021-05-11 03:55:45,CryptoCurrency,binance,neutral
...,...,...,...,...,...,...
40523,in my experience the Airbnbs were never that b...,2,2018-01-19 21:42:20,Ethereum,binance,negative
40524,"""what is stopping AirBnB from integrating it, ...",3,2018-01-20 01:58:00,Ethereum,binance,positive
40525,"There's much more to it than accepting crypto,...",2,2018-01-20 04:11:02,Ethereum,binance,positive
40526,Are these some of the same reasons hosts might...,2,2018-01-20 18:10:51,Ethereum,binance,neutral


In [21]:
# Datetime handling
reddit_df['created_utc'] = pd.to_datetime(reddit_df['created_utc']).dt.date
reddit_df

Unnamed: 0,text,score,created_utc,subreddit,platform,sentiment
0,Not every new coin is a shitcoin: How to spot ...,20345,2021-05-11,CryptoCurrency,binance,positive
1,Great post. I have a few questions. \n\n1) How...,12,2021-05-11,CryptoCurrency,binance,positive
2,> Start trading on the BSC (Binance) Smart Cha...,59,2021-05-11,CryptoCurrency,binance,positive
3,"okay so i bought into 3 coins, around $10 tota...",3,2021-05-12,CryptoCurrency,binance,positive
4,Isn’t the minimum purchase set to $15 on binance?,7,2021-05-11,CryptoCurrency,binance,neutral
...,...,...,...,...,...,...
40523,in my experience the Airbnbs were never that b...,2,2018-01-19,Ethereum,binance,negative
40524,"""what is stopping AirBnB from integrating it, ...",3,2018-01-20,Ethereum,binance,positive
40525,"There's much more to it than accepting crypto,...",2,2018-01-20,Ethereum,binance,positive
40526,Are these some of the same reasons hosts might...,2,2018-01-20,Ethereum,binance,neutral


In [24]:
CRYPTO_EXCHANGES = {
    'binance': ['binance', 'bnb', 'binance us', 'binance app', 'binance exchange'],
    'coinbase': ['coinbase', 'coinbase pro', 'coinbase wallet', 'cb wallet'],
    'kraken': ['kraken', 'kraken exchange', 'kraken pro'],
    'okx': ['okx', 'okex'],
    'kucoin': ['kucoin', 'kucoin exchange'],
    'crypto.com': ['crypto.com', 'crypto.com app'],
    'bybit': ['bybit', 'bybit app']
}

# Function to detect all mentioned exchanges in a text
def detect_all_exchanges(text):
    text = text.lower()
    found = set()
    for exchange, keywords in CRYPTO_EXCHANGES.items():
        for keyword in keywords:
            if keyword in text:
                found.add(exchange)
                break
    return ', '.join(sorted(found)) if found else 'general'

# Apply across all rows and overwrite the 'platform' column
reddit_df['platform'] = reddit_df['text'].apply(detect_all_exchanges)
print(reddit_df['platform'].unique())

['binance' 'coinbase' 'kraken' 'binance, coinbase' 'crypto.com'
 'coinbase, kraken' 'binance, crypto.com' 'binance, coinbase, crypto.com'
 'binance, coinbase, kraken' 'binance, kraken' 'crypto.com, kraken'
 'kucoin' 'binance, kucoin' 'kraken, kucoin'
 'binance, bybit, coinbase, kraken' 'binance, coinbase, kucoin'
 'coinbase, crypto.com, kraken' 'binance, coinbase, kraken, kucoin'
 'binance, crypto.com, kraken' 'coinbase, crypto.com'
 'binance, kraken, kucoin' 'coinbase, kraken, kucoin'
 'binance, coinbase, crypto.com, kraken' 'crypto.com, kucoin'
 'coinbase, crypto.com, kucoin' 'bybit' 'binance, bybit, coinbase'
 'binance, crypto.com, kucoin' 'binance, crypto.com, kraken, kucoin'
 'binance, bybit' 'binance, okx' 'coinbase, kucoin'
 'bybit, coinbase, kraken' 'binance, coinbase, crypto.com, kucoin'
 'binance, coinbase, okx' 'binance, kraken, kucoin, okx'
 'binance, crypto.com, kraken, kucoin, okx' 'bybit, coinbase, kucoin'
 'coinbase, okx' 'binance, coinbase, crypto.com, kraken, kucoin' 

## App Store Data

In [25]:
appstore_df = pd.read_csv('../data/app_crypto_data.csv')
appstore_df

Unnamed: 0,exchange,platform,user,rating,date,text,sentiment
0,binance,ios,James4all,5,2023-10-04 08:23:28,How can someone set stop lose and take profit ...,negative
1,binance,ios,Zionsb,1,2023-05-24 00:13:10,"I have been using Binance for a little bit, ev...",positive
2,binance,ios,Username: Courtney,2,2021-04-11 18:25:51,The only thing good about the app is it’s inte...,positive
3,binance,ios,joshdammit,1,2023-06-15 10:57:52,Got an email last week saying binance is shutt...,positive
4,binance,ios,King Banks M,1,2021-05-05 17:13:25,I’ve been trying to buy crypto for a while now...,positive
...,...,...,...,...,...,...,...
1724,bybit,android,Ahsan Ali,5,2025-03-03 17:27:34,Never seen such a fast and reliable exchange w...,positive
1725,bybit,android,Bala M Sarki,3,2025-03-03 17:13:58,Why is it that opay payment reduce our money,neutral
1726,bybit,android,Jesu Ponippase,5,2025-03-03 17:13:10,Supper easy trading platform,positive
1727,bybit,android,Susan Metalor,5,2025-03-03 16:04:41,Great platform.,positive


In [26]:
# Drop unnecessary columns
appstore_df = appstore_df.drop(columns=['user'])

# Drop rows with missing text or sentiment values
appstore_df = appstore_df.dropna(subset=['text', 'sentiment'])

# Drop rows with text length <20
appstore_df = appstore_df[appstore_df['text'].str.split().str.len() >= 20]

In [27]:
# Datetime handling
appstore_df['date'] = pd.to_datetime(appstore_df['date']).dt.date
appstore_df

Unnamed: 0,exchange,platform,rating,date,text,sentiment
0,binance,ios,5,2023-10-04,How can someone set stop lose and take profit ...,negative
1,binance,ios,1,2023-05-24,"I have been using Binance for a little bit, ev...",positive
2,binance,ios,2,2021-04-11,The only thing good about the app is it’s inte...,positive
3,binance,ios,1,2023-06-15,Got an email last week saying binance is shutt...,positive
4,binance,ios,1,2021-05-05,I’ve been trying to buy crypto for a while now...,positive
...,...,...,...,...,...,...
1709,bybit,android,1,2025-03-04,"Bybit lacks many things, fast support customer...",positive
1712,bybit,android,4,2025-03-04,"I liked the app, it's really amazing, there's ...",positive
1714,bybit,android,3,2025-03-04,Bybit is a game changer in the world of crypto...,positive
1715,bybit,android,3,2025-03-04,"Not too good, not too bad, try to make it more...",positive


## TrustPilot Reviews Data

In [28]:
tpilot_df = pd.read_csv('../data/trustpilot_crypto_data.csv')
tpilot_df

Unnamed: 0,Exchange,Username,Date,Review,Rating,Sentiment
0,binance,Akm LvL,"March 20, 2025",your account with your balance will disappear ...,1,neutral
1,binance,Rubeek,"March 20, 2025",It should improve for retailers instead of jus...,3,neutral
2,binance,Sophia,"March 21, 2025",I have been using this platform for few years ...,3,positive
3,binance,jaco antony,"March 01, 2025",man i had headache to withdrawl,3,neutral
4,binance,John Haines,"January 01, 2025","Getting money in is easy, but out is ridiculou...",1,positive
...,...,...,...,...,...,...
12008,bybit,Jckscott,"May 21, 2022",I loved my time trading on this website everyt...,5,positive
12009,bybit,Luca,"October 13, 2022",I've Lost more than 900$ for a long order clos...,1,positive
12010,bybit,GAME CHANGER,"October 11, 2022",Scam exchange has scammed me 114$ on p2p and n...,1,neutral
12011,bybit,Mircea,"August 24, 2022",My money were lost via P2P transaction.The sel...,1,negative


In [29]:
# Drop unnecessary columns
tpilot_df = tpilot_df.drop(columns=['Username'])

# Drop rows with missing review or sentiment values
tpilot_df = tpilot_df.dropna(subset=['Review', 'Sentiment'])

# Drop rows where the review has <20 words
tpilot_df = tpilot_df[tpilot_df['Review'].str.split().str.len() >= 20]
tpilot_df

Unnamed: 0,Exchange,Date,Review,Rating,Sentiment
2,binance,"March 21, 2025",I have been using this platform for few years ...,3,positive
4,binance,"January 01, 2025","Getting money in is easy, but out is ridiculou...",1,positive
5,binance,"March 20, 2025",Account disappeared. Circular nightmare trying...,1,negative
6,binance,"March 19, 2025",Do not recommend to anybody!They got celphone ...,1,negative
8,binance,"March 24, 2024",There are those people they can't make single ...,5,negative
...,...,...,...,...,...
12007,bybit,"October 21, 2022","hello, my bybit account is restricted or banne...",1,positive
12008,bybit,"May 21, 2022",I loved my time trading on this website everyt...,5,positive
12009,bybit,"October 13, 2022",I've Lost more than 900$ for a long order clos...,1,positive
12010,bybit,"October 11, 2022",Scam exchange has scammed me 114$ on p2p and n...,1,neutral


In [30]:
# Datetime handling
tpilot_df['Date'] = pd.to_datetime(tpilot_df['Date']).dt.date
tpilot_df

Unnamed: 0,Exchange,Date,Review,Rating,Sentiment
2,binance,2025-03-21,I have been using this platform for few years ...,3,positive
4,binance,2025-01-01,"Getting money in is easy, but out is ridiculou...",1,positive
5,binance,2025-03-20,Account disappeared. Circular nightmare trying...,1,negative
6,binance,2025-03-19,Do not recommend to anybody!They got celphone ...,1,negative
8,binance,2024-03-24,There are those people they can't make single ...,5,negative
...,...,...,...,...,...
12007,bybit,2022-10-21,"hello, my bybit account is restricted or banne...",1,positive
12008,bybit,2022-05-21,I loved my time trading on this website everyt...,5,positive
12009,bybit,2022-10-13,I've Lost more than 900$ for a long order clos...,1,positive
12010,bybit,2022-10-11,Scam exchange has scammed me 114$ on p2p and n...,1,neutral


In [31]:
# It has been verified that all is in English, when crawling I forgot to only accept data if its in English. But all data is in English.
'''
def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False  # in case of empty or corrupt input

# Apply to the DataFrame
tpilot_df = tpilot_df[tpilot_df['Review'].apply(is_english)]
tpilot_df
'''

"\ndef is_english(text):\n    try:\n        return detect(text) == 'en'\n    except:\n        return False  # in case of empty or corrupt input\n\n# Apply to the DataFrame\ntpilot_df = tpilot_df[tpilot_df['Review'].apply(is_english)]\ntpilot_df\n"

## Aggregation

In [32]:
reddit_df['subreddit'] = 'r/' + reddit_df['subreddit'].astype(str)
reddit_df = reddit_df.rename(columns={
    'created_utc': 'Date',
    'subreddit': 'Source', # subreddit name
    'platform': 'Exchange',  # exchange or 'general'
    'text': 'Text',
    'score': 'Reddit Score',
    'sentiment': 'Sentiment'
})
reddit_df['Platform'] = 'web'
reddit_df['Rating'] = None
reddit_df

Unnamed: 0,Text,Reddit Score,Date,Source,Exchange,Sentiment,Platform,Rating
0,Not every new coin is a shitcoin: How to spot ...,20345,2021-05-11,r/CryptoCurrency,binance,positive,web,
1,Great post. I have a few questions. \n\n1) How...,12,2021-05-11,r/CryptoCurrency,binance,positive,web,
2,> Start trading on the BSC (Binance) Smart Cha...,59,2021-05-11,r/CryptoCurrency,binance,positive,web,
3,"okay so i bought into 3 coins, around $10 tota...",3,2021-05-12,r/CryptoCurrency,binance,positive,web,
4,Isn’t the minimum purchase set to $15 on binance?,7,2021-05-11,r/CryptoCurrency,binance,neutral,web,
...,...,...,...,...,...,...,...,...
40523,in my experience the Airbnbs were never that b...,2,2018-01-19,r/Ethereum,binance,negative,web,
40524,"""what is stopping AirBnB from integrating it, ...",3,2018-01-20,r/Ethereum,binance,positive,web,
40525,"There's much more to it than accepting crypto,...",2,2018-01-20,r/Ethereum,binance,positive,web,
40526,Are these some of the same reasons hosts might...,2,2018-01-20,r/Ethereum,binance,neutral,web,


In [33]:
appstore_df = appstore_df.rename(columns={
    'exchange': 'Exchange',
    'platform': 'Platform',  # already okay
    'rating': 'Rating',
    'date': 'Date',
    'text': 'Text',
    'sentiment': 'Sentiment'
})
appstore_df['Source'] = appstore_df['Platform'].apply(
    lambda x: 'app store' if x == 'ios' else 'play store'
)
appstore_df['Reddit Score'] = None
appstore_df

Unnamed: 0,Exchange,Platform,Rating,Date,Text,Sentiment,Source,Reddit Score
0,binance,ios,5,2023-10-04,How can someone set stop lose and take profit ...,negative,app store,
1,binance,ios,1,2023-05-24,"I have been using Binance for a little bit, ev...",positive,app store,
2,binance,ios,2,2021-04-11,The only thing good about the app is it’s inte...,positive,app store,
3,binance,ios,1,2023-06-15,Got an email last week saying binance is shutt...,positive,app store,
4,binance,ios,1,2021-05-05,I’ve been trying to buy crypto for a while now...,positive,app store,
...,...,...,...,...,...,...,...,...
1709,bybit,android,1,2025-03-04,"Bybit lacks many things, fast support customer...",positive,play store,
1712,bybit,android,4,2025-03-04,"I liked the app, it's really amazing, there's ...",positive,play store,
1714,bybit,android,3,2025-03-04,Bybit is a game changer in the world of crypto...,positive,play store,
1715,bybit,android,3,2025-03-04,"Not too good, not too bad, try to make it more...",positive,play store,


In [34]:
tpilot_df = tpilot_df.rename(columns={
    'Review': 'Text',
})
tpilot_df['Platform'] = 'web'
tpilot_df['Source'] = 'trustpilot'
tpilot_df['Reddit Score'] = None
tpilot_df

Unnamed: 0,Exchange,Date,Text,Rating,Sentiment,Platform,Source,Reddit Score
2,binance,2025-03-21,I have been using this platform for few years ...,3,positive,web,trustpilot,
4,binance,2025-01-01,"Getting money in is easy, but out is ridiculou...",1,positive,web,trustpilot,
5,binance,2025-03-20,Account disappeared. Circular nightmare trying...,1,negative,web,trustpilot,
6,binance,2025-03-19,Do not recommend to anybody!They got celphone ...,1,negative,web,trustpilot,
8,binance,2024-03-24,There are those people they can't make single ...,5,negative,web,trustpilot,
...,...,...,...,...,...,...,...,...
12007,bybit,2022-10-21,"hello, my bybit account is restricted or banne...",1,positive,web,trustpilot,
12008,bybit,2022-05-21,I loved my time trading on this website everyt...,5,positive,web,trustpilot,
12009,bybit,2022-10-13,I've Lost more than 900$ for a long order clos...,1,positive,web,trustpilot,
12010,bybit,2022-10-11,Scam exchange has scammed me 114$ on p2p and n...,1,neutral,web,trustpilot,


In [35]:
combined_df = pd.concat([reddit_df, appstore_df, tpilot_df], ignore_index=True)
combined_df = combined_df.drop_duplicates()
combined_df

Unnamed: 0,Text,Reddit Score,Date,Source,Exchange,Sentiment,Platform,Rating
0,Not every new coin is a shitcoin: How to spot ...,20345,2021-05-11,r/CryptoCurrency,binance,positive,web,
1,Great post. I have a few questions. \n\n1) How...,12,2021-05-11,r/CryptoCurrency,binance,positive,web,
2,> Start trading on the BSC (Binance) Smart Cha...,59,2021-05-11,r/CryptoCurrency,binance,positive,web,
3,"okay so i bought into 3 coins, around $10 tota...",3,2021-05-12,r/CryptoCurrency,binance,positive,web,
4,Isn’t the minimum purchase set to $15 on binance?,7,2021-05-11,r/CryptoCurrency,binance,neutral,web,
...,...,...,...,...,...,...,...,...
49650,"hello, my bybit account is restricted or banne...",,2022-10-21,trustpilot,bybit,positive,web,1
49651,I loved my time trading on this website everyt...,,2022-05-21,trustpilot,bybit,positive,web,5
49652,I've Lost more than 900$ for a long order clos...,,2022-10-13,trustpilot,bybit,positive,web,1
49653,Scam exchange has scammed me 114$ on p2p and n...,,2022-10-11,trustpilot,bybit,neutral,web,1


In [36]:
print(combined_df['Exchange'].unique())

['binance' 'coinbase' 'kraken' 'binance, coinbase' 'crypto.com'
 'coinbase, kraken' 'binance, crypto.com' 'binance, coinbase, crypto.com'
 'binance, coinbase, kraken' 'binance, kraken' 'crypto.com, kraken'
 'kucoin' 'binance, kucoin' 'kraken, kucoin'
 'binance, bybit, coinbase, kraken' 'binance, coinbase, kucoin'
 'coinbase, crypto.com, kraken' 'binance, coinbase, kraken, kucoin'
 'binance, crypto.com, kraken' 'coinbase, crypto.com'
 'binance, kraken, kucoin' 'coinbase, kraken, kucoin'
 'binance, coinbase, crypto.com, kraken' 'crypto.com, kucoin'
 'coinbase, crypto.com, kucoin' 'bybit' 'binance, bybit, coinbase'
 'binance, crypto.com, kucoin' 'binance, crypto.com, kraken, kucoin'
 'binance, bybit' 'binance, okx' 'coinbase, kucoin'
 'bybit, coinbase, kraken' 'binance, coinbase, crypto.com, kucoin'
 'binance, coinbase, okx' 'binance, kraken, kucoin, okx'
 'binance, crypto.com, kraken, kucoin, okx' 'bybit, coinbase, kucoin'
 'coinbase, okx' 'binance, coinbase, crypto.com, kraken, kucoin' 

In [37]:
combined_df.to_csv('../data/crypto_exchange_data_raw.csv', index=False)