In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint

In [2]:
# Read the saved exchange dataset (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='../data/crypto_exchange_data.csv')
# Bare last expression so the notebook renders the frame's rich preview
df

Unnamed: 0,text,reddit_score,date,source,type,exchange,parent_id,platform,rating,sentiment,...,user_interface,customer_service,security,coin_listings,performance,entities,keywords,is_duplicate,id,word_count
0,I'm really starting to get scared of keeping m...,8.0,2021-03-06,r/CryptoCurrency,comment,"['binance', 'coinbase']",t3_lye4n5,web,,negative,...,0.0000,0.0000,0.0000,-0.3241,-0.3241,[],"[""really"", ""good"", ""im"", ""exchange"", ""dont"", ""...",False,1171,15
1,I actually like Binance.com for the most part ...,1.0,2021-03-06,r/CryptoCurrency,comment,['binance'],t3_lye4n5,web,,negative,...,0.0000,0.0000,0.0000,0.0000,0.0000,[],"[""use"", ""like""]",False,1379,11
2,Wow what a horror story.\n\nI was just in the ...,1.0,2021-03-06,r/CryptoCurrency,comment,['binance'],t3_lye4n5,web,,negative,...,0.0000,0.0000,0.0000,0.0000,0.0000,[],"[""think"", ""binance"", ""account""]",False,1373,10
3,"Not your keys, not your coins. If you don’t li...",1.0,2021-12-18,r/CryptoCurrency,comment,['coinbase'],t3_ritaqq,web,,negative,...,0.0000,0.0000,0.0000,-0.2760,0.0000,[],"[""dont"", ""trading"", ""ive"", ""coin"", ""issue"", ""u...",False,1504,29
4,Coinbase has forzs twice this year I know of a...,8.0,2021-10-24,r/CryptoCurrency,comment,"['coinbase', 'kucoin']",t1_hhsmnxb,web,,negative,...,0.0000,-0.6488,-0.4770,0.0000,0.2705,"['coinbase forzs', 'twice year', 'daily', 'one']","[""dont"", ""year"", ""trading"", ""card"", ""scam"", ""k...",False,5337,76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47110,I’ve had a problem regarding my identity card ...,,2023-06-24,trustpilot,trustpilot review,['bybit'],,web,4.0,neutral,...,0.0000,0.7096,-0.2960,0.0000,0.0000,[],"[""problem"", ""customer"", ""support"", ""card"", ""us...",False,71785,24
47111,Seems people come here to write a review only ...,,2022-12-09,trustpilot,trustpilot review,['bybit'],,web,4.0,neutral,...,-0.0387,0.0000,0.0000,0.0000,0.0000,[],"[""didnt"", ""reason"", ""thats"", ""user"", ""month"", ...",False,71800,24
47112,Try to view my assets all I get are *****'s. W...,,2023-02-12,trustpilot,trustpilot review,['bybit'],,web,1.0,neutral,...,-0.0258,0.0000,0.0000,-0.0258,0.0000,[],"[""asset"", ""used"", ""trade"", ""platform"", ""fund""]",False,71819,17
47113,Bybit is wonderful to use and the overall func...,,2023-01-28,trustpilot,trustpilot review,['bybit'],,web,1.0,neutral,...,-0.3182,0.0000,0.3182,0.0000,0.0000,[],"[""order"", ""look"", ""bybit"", ""scam"", ""trade"", ""p...",False,71823,39


In [3]:
# Define Custom Feature Lexicons
#
# Maps a human-readable feature label to a list of lower-case keywords/phrases
# associated with that feature. Entries mix single words ('fee') and
# multi-word phrases ('trading fee').
#
# NOTE: the keys are Title Case labels ('User Interface'), whereas the
# per-feature columns printed by `df.columns` in this notebook are snake_case
# ('user_interface'). A key must be normalized before it is used as a column name.
# NOTE(review): how these keywords are matched against text (substring vs. token)
# is not shown in this part of the notebook — confirm before editing entries.
CRYPTO_FEATURES = {
    # Pricing and cost-related vocabulary
    'Fees': [
        'fee', 'fees', 'trading fee', 'withdrawal fee', 'deposit fee',
        'low fee', 'high fee', 'expensive', 'cheap', 'commission',
        'hidden fee', 'transparent pricing', 'zero fee', 'cost', 'charge',
        'markup', 'processing fee', 'transaction cost', 'gas fee'
    ],
    # App / website look-and-feel vocabulary ('slow' is also listed under 'Performance')
    'User Interface': [
        'app', 'website', 'ui', 'ux', 'interface', 'design', 'layout',
        'navigation', 'bug', 'glitch', 'slow', 'fast', 'responsive',
        'usability', 'mobile app', 'dashboard', 'update', 'dark mode',
        'intuitive', 'user-friendly', 'complicated', 'laggy', 'crashes',
        'loading', 'experience', 'accessibility'
    ],
    # Support-channel and responsiveness vocabulary
    'Customer Service': [
        'support', 'help', 'customer', 'service', 'response', 'ticket',
        'chat', 'email', 'reply', 'unresponsive', 'delay', 'resolved',
        'complaint', 'agent', 'representative', 'live chat', 'call',
        'waiting time', 'inquiry', 'feedback', 'escalation', 'not helpful',
        'ignored', 'follow-up'
    ],
    # Account safety, verification and fraud vocabulary
    'Security': [
        'secure', 'security', 'hack', 'breach', 'phishing', '2fa',
        'safety', 'account locked', 'withdrawal lock', 'verify',
        'verification', 'suspicious activity', 'identity theft', 'scam',
        'fraud', 'authentication', 'kyc', 'malware', 'cold wallet',
        'hot wallet', 'security token', 'ddos', 'data leak'
    ],
    # Asset availability and listing vocabulary
    'Coin Listings': [
        'listed', 'coin', 'token', 'listing', 'altcoin', 'available',
        'supported', 'delist', 'new coin', 'available pairs',
        'cryptocurrency', 'asset', 'stablecoin', 'pairing', 'market pair',
        'trading pair', 'not available', 'support for', 'removed',
        'launched', 'integrated'
    ],
    # Reliability and speed vocabulary ('slow' is also listed under 'User Interface')
    'Performance': [
        'crash', 'slow', 'lag', 'error', 'fail', 'stable', 'reliable',
        'outage', 'downtime', 'performance', 'uptime', 'maintenance',
        'server issue', 'connection lost', 'timeout', 'freeze', 'buggy',
        'high latency', 'stuck', 'reboot', 'real-time', 'speed'
    ]
}

# Name of the DataFrame column whose text all corpus statistics are computed from
text_column = 'cleaned_text'
if text_column not in df.columns:
    # Fail early with a readable message rather than a bare KeyError further down
    raise KeyError(f"Column {text_column!r} not found in df; available: {list(df.columns)}")

# Count total number of records
num_records = len(df)

# Word count per row: whitespace-separated tokens, missing text counts as 0.
# NOTE: the preview of the loaded CSV already shows a 'word_count' column;
# this assignment overwrites it with counts derived from `text_column`.
df['word_count'] = df[text_column].apply(lambda x: len(str(x).split()) if pd.notna(x) else 0)

# Total words in corpus. Cast to a built-in int so the summary (and the
# lexical-diversity ratio derived from it) pretty-prints as plain numbers
# regardless of how the installed NumPy version reprs its scalars.
total_words = int(df['word_count'].sum())

# Count unique words (types): concatenate every row's text, then split on whitespace
all_words = ' '.join(df[text_column].fillna('').astype(str)).split()
unique_words = set(all_words)
num_types = len(unique_words)

# Get categorical distributions (value -> count), keyed as "<column>_distribution".
# Columns that are absent from df are skipped silently.
distributions = {}
for column in ['source', 'sentiment', 'platform', 'exchange']:
    if column in df.columns:
        distributions[f"{column}_distribution"] = df[column].value_counts().to_dict()

# Get feature score stats.
# CRYPTO_FEATURES keys are display labels ('User Interface') while the score
# columns in df are snake_case ('user_interface'), so a label is normalized
# before the lookup. An exact-name match is still honoured first for
# backward compatibility. Results stay keyed by the display label.
feature_stats = {}
for feature in CRYPTO_FEATURES:
    score_column = feature if feature in df.columns else feature.lower().replace(' ', '_')
    if score_column not in df.columns:
        continue  # no score column for this feature in the loaded data
    scores = df[score_column]
    feature_stats[feature] = {
        'mean': round(scores.mean(), 4),
        'median': round(scores.median(), 4),
        'std': round(scores.std(), 4),
        'min': round(scores.min(), 4),
        'max': round(scores.max(), 4)
    }

# Get text length stats (in words per record)
text_length_stats = {
    'mean_word_count': df['word_count'].mean(),
    'median_word_count': df['word_count'].median(),
    'min_word_count': df['word_count'].min(),
    'max_word_count': df['word_count'].max()
}

# Compile full statistics summary
statistics = {
    'num_records': num_records,
    'total_words': total_words,
    'num_unique_words': num_types,
    # Type/token ratio; guarded so an empty corpus yields 0 instead of dividing by zero
    'lexical_diversity': num_types / total_words if total_words > 0 else 0,
    'distributions': distributions,
    'feature_stats': feature_stats,
    'text_length_stats': text_length_stats
}

In [4]:
# --- Display Output ---

# Headline corpus figures, pretty-printed as one dict
print("=== Dataset Overview ===")
overview = {
    'Total Records': statistics['num_records'],
    'Total Words': statistics['total_words'],
    'Unique Words': statistics['num_unique_words'],
    'Lexical Diversity': round(statistics['lexical_diversity'], 4)
}
pprint(overview)

# One section per categorical column: a heading, then "label: count" lines
print("\n=== Categorical Distributions ===")
for dist_name, counts in statistics['distributions'].items():
    heading = dist_name.replace('_', ' ').title()
    print(f"\n{heading}:")
    for label, count in counts.items():
        print(f"  {label}: {count}")

# Word-count summary, rounded to two decimals for display
print("\n=== Text Length Statistics ===")
for stat_name, stat_value in statistics['text_length_stats'].items():
    pretty_name = stat_name.replace('_', ' ').title()
    print(f"{pretty_name}: {round(stat_value, 2)}")

# Per-feature score table (one row per feature); skipped when no stats were collected
if feature_stats:
    stat_columns = ['mean', 'median', 'std', 'min', 'max']
    feature_df = pd.DataFrame(statistics['feature_stats']).T
    feature_df = feature_df[stat_columns].round(3)
    print("\n=== Feature Sentiment Scores ===")
    display(feature_df)

# Optional: plot sentiment distribution if available.
# Keys in statistics['distributions'] are built as f"{column}_distribution" from
# the lower-case column names, so the lookup key is 'sentiment_distribution'
# (a capitalised 'Sentiment_distribution' never matches and the plot is skipped).
sentiment_counts = statistics['distributions'].get('sentiment_distribution')
if sentiment_counts:
    # Explicit figure/axes so the chart is self-contained and reusable
    fig, ax = plt.subplots()
    pd.Series(sentiment_counts).plot(kind='bar', ax=ax, title='Sentiment Distribution', ylabel='Count')
    plt.show()

=== Dataset Overview ===
{'Lexical Diversity': 0.0295,
 'Total Records': 47115,
 'Total Words': 1675692,
 'Unique Words': 49486}

=== Categorical Distributions ===

Source Distribution:
  r/CoinBase: 18871
  trustpilot: 8174
  r/binance: 4415
  r/CryptoCurrency: 4221
  app store: 2833
  r/kucoin: 1390
  r/Crypto_com: 1267
  r/BitcoinBeginners: 1201
  play store: 1152
  r/Bybit: 1068
  r/Kraken: 1065
  r/Ethereum: 509
  r/CryptoScams: 396
  r/OKX: 304
  r/CryptoMarkets: 176
  r/CryptoTechnology: 73

Sentiment Distribution:
  negative: 20311
  positive: 20311
  neutral: 6493

Platform Distribution:
  web: 43130
  ios: 2833
  android: 1152

Exchange Distribution:
  ['coinbase']: 21293
  ['binance']: 8602
  ['kraken']: 4163
  ['crypto.com']: 3909
  ['kucoin']: 2639
  ['bybit']: 2513
  ['okx']: 1241
  ['binance', 'coinbase']: 897
  ['coinbase', 'kraken']: 475
  ['binance', 'kucoin']: 199
  ['binance', 'kraken']: 193
  ['coinbase', 'crypto.com']: 186
  ['binance', 'coinbase', 'kraken']: 175


In [6]:
# Print the DataFrame's column index to check which columns are present
print(df.columns)

Index(['text', 'reddit_score', 'date', 'source', 'type', 'exchange',
       'parent_id', 'platform', 'rating', 'sentiment', 'cleaned_text',
       'sentiment_score', 'fees', 'user_interface', 'customer_service',
       'security', 'coin_listings', 'performance', 'entities', 'keywords',
       'is_duplicate', 'id', 'word_count'],
      dtype='object')
