In [None]:
%%capture
# %%capture (which must remain the first line of this cell) hides the installer output.
# Upgrade the packaging toolchain first, then the NLP dependencies imported below.
!pip install --user -U pip setuptools wheel
!pip install --user -U nltk swifter spacy
# Fetch the small English spaCy pipeline that is loaded via spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

In [None]:
import re
import nltk
import json
import hashlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import swifter
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Do not truncate long text columns when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

# Shared spaCy pipeline; extract_entities() reads `doc.ents` from it.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Fetch every NLTK corpus / model requested by this notebook, in a fixed order.
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'vader_lexicon',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(_nltk_resource, quiet=True)

In [None]:
# Platform dictionary: canonical exchange name -> lowercase aliases / related terms.
# NOTE(review): no cell visible in this notebook reads CRYPTO_EXCHANGES; presumably it is
# consumed elsewhere (or is left over) — confirm before extending it.
CRYPTO_EXCHANGES = {
    'binance': ['binance', 'bnb', 'binance us', 'binance app', 'binance exchange'],
    'coinbase': ['coinbase', 'coinbase pro', 'coinbase wallet', 'cb wallet'],
    'kraken': ['kraken', 'kraken exchange', 'kraken pro'],
    'okx': ['okx', 'okex'],
    'kucoin': ['kucoin', 'kucoin exchange'],
    'crypto.com': ['crypto.com', 'cro', 'crypto.com app', 'cdc'],
    'bybit': ['bybit', 'bybit app']
}

In [None]:
# Custom feature lexicons: product aspect -> lowercase keywords / phrases.
# extract_features() tests each entry with a plain substring check (`kw in text`) and
# preprocess() adds one sentiment column per key of this dict.
# NOTE(review): preprocess() feeds extract_features() the *cleaned* text, from which
# clean_text() has already stripped punctuation, digits and stop words. Entries such as
# 'user-friendly', 'follow-up', 'real-time', '2fa', 'not helpful' or 'support for' can
# therefore never match that input as written — confirm whether that is intended.
CRYPTO_FEATURES = {
    'Fees': [
        'fee', 'fees', 'trading fee', 'withdrawal fee', 'deposit fee',
        'low fee', 'high fee', 'expensive', 'cheap', 'commission',
        'hidden fee', 'transparent pricing', 'zero fee', 'cost', 'charge',
        'markup', 'processing fee', 'transaction cost', 'gas fee'
    ],
    # 'slow' is listed here and under 'Performance', so it contributes to both scores.
    'User Interface': [
        'app', 'website', 'ui', 'ux', 'interface', 'design', 'layout',
        'navigation', 'bug', 'glitch', 'slow', 'fast', 'responsive',
        'usability', 'mobile app', 'dashboard', 'update', 'dark mode',
        'intuitive', 'user-friendly', 'complicated', 'laggy', 'crashes',
        'loading', 'experience', 'accessibility'
    ],
    'Customer Service': [
        'support', 'help', 'customer', 'service', 'response', 'ticket',
        'chat', 'email', 'reply', 'unresponsive', 'delay', 'resolved',
        'complaint', 'agent', 'representative', 'live chat', 'call',
        'waiting time', 'inquiry', 'feedback', 'escalation', 'not helpful',
        'ignored', 'follow-up'
    ],
    'Security': [
        'secure', 'security', 'hack', 'breach', 'phishing', '2fa',
        'safety', 'account locked', 'withdrawal lock', 'verify',
        'verification', 'suspicious activity', 'identity theft', 'scam',
        'fraud', 'authentication', 'kyc', 'malware', 'cold wallet',
        'hot wallet', 'security token', 'ddos', 'data leak'
    ],
    'Coin Listings': [
        'listed', 'coin', 'token', 'listing', 'altcoin', 'available',
        'supported', 'delist', 'new coin', 'available pairs',
        'cryptocurrency', 'asset', 'stablecoin', 'pairing', 'market pair',
        'trading pair', 'not available', 'support for', 'removed',
        'launched', 'integrated'
    ],
    'Performance': [
        'crash', 'slow', 'lag', 'error', 'fail', 'stable', 'reliable',
        'outage', 'downtime', 'performance', 'uptime', 'maintenance',
        'server issue', 'connection lost', 'timeout', 'freeze', 'buggy',
        'high latency', 'stuck', 'reboot', 'real-time', 'speed'
    ]
}

In [None]:
# Core NLP Components — module-level objects shared by the helper functions below
stop_words = set(stopwords.words('english'))  # tokens in this set are dropped by clean_text()
lemmatizer = WordNetLemmatizer()  # applied to every surviving token in clean_text()
sentiment_analyzer = SentimentIntensityAnalyzer()  # its 'compound' score is read by analyze_sentiment()

# Set processing parameters
min_text_length = 5  # preprocess() keeps rows whose cleaned text has MORE than this many tokens
duplicate_threshold = 0.8  # default cosine-similarity cut-off used by detect_duplicates()

In [None]:
# Misc Functions
def clean_text(text):
    """Normalise one raw text into a space-joined string of lemmatised tokens.

    Steps: lowercase, strip `http...` URLs, delete every character that is
    neither a word character nor whitespace, delete digit runs, tokenise with
    `word_tokenize`, drop tokens found in `stop_words`, lemmatise the rest.

    Args:
        text: The raw text. Anything that is not a `str` (None, NaN, numbers,
            lists, ...) is treated as missing.

    Returns:
        The cleaned text, or "" for missing / non-string input.
    """
    # A plain isinstance check already covers None and NaN. Unlike the previous
    # `pd.isna(text) or ...` test it cannot blow up on list-like input, for which
    # pd.isna() returns an array whose truth value is ambiguous.
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove special characters and numbers (deleted outright, not replaced by a space)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Tokenize, remove stopwords, lemmatize what is left
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join tokens back to text
    return ' '.join(tokens)

In [None]:
def analyze_sentiment(text):
    """Classify `text` with the shared VADER analyzer.

    Args:
        text: The text to score. Any falsy value (None, "") is treated as empty.

    Returns:
        A `(label, compound)` tuple. `label` is 'positive' when the compound
        score is >= 0.05, 'negative' when it is <= -0.05 and 'neutral'
        otherwise. Empty input yields `('neutral', 0.0)`.
    """
    if not text:
        return 'neutral', 0.0

    compound = sentiment_analyzer.polarity_scores(text)['compound']

    # Bucket the compound score into a sentiment category
    if compound >= 0.05:
        return 'positive', compound
    if compound <= -0.05:
        return 'negative', compound
    return 'neutral', compound

In [None]:
def extract_features(text, window=100):
    """Score the sentiment expressed around each CRYPTO_FEATURES aspect.

    For every keyword of a feature that occurs in the lowercased text, the
    surrounding context (up to `window` characters on each side, never
    crossing a newline) is collected; all contexts of a feature are
    concatenated and scored with `analyze_sentiment`.

    Keywords are matched as plain substrings, so a short keyword also matches
    inside longer words (e.g. 'app' inside 'happy', 'coin' inside 'coinbase').

    Args:
        text: The text to analyse. Any falsy value is treated as empty.
        window: Maximum number of context characters captured on each side of
            a keyword; must be a non-negative integer. Defaults to 100.

    Returns:
        dict mapping every key of CRYPTO_FEATURES to a compound sentiment
        score, 0.0 when none of the feature's keywords occur.
    """
    if not text:
        return {feature: 0.0 for feature in CRYPTO_FEATURES}

    text = text.lower()
    context = r'.{0,%d}' % window
    feature_scores = {}

    for feature, keywords in CRYPTO_FEATURES.items():
        # One snippet per matching keyword, collected in a list and joined once
        # instead of growing a string inside the loop.
        snippets = []
        for phrase in keywords:
            if phrase not in text:
                continue
            pattern = context + re.escape(phrase) + context
            snippets.append(' '.join(re.findall(pattern, text)) + ' ')

        if snippets:
            _, sentiment_score = analyze_sentiment(''.join(snippets))
            feature_scores[feature] = sentiment_score
        else:
            feature_scores[feature] = 0.0

    return feature_scores

In [None]:
def extract_entities(text):
    """Run the shared spaCy pipeline on `text` and return its named entities.

    Args:
        text: The text to analyse. Non-string values and strings shorter than
            20 characters are skipped.

    Returns:
        list of `{'text': <entity text>, 'type': <entity label>}` dicts, or []
        when the input is skipped or the pipeline raises an error.
    """
    # Non-strings (None, NaN, ...) are rejected here so that len() cannot raise.
    if not isinstance(text, str) or len(text) < 20:
        return []
    try:
        doc = nlp(text)
        return [{'text': ent.text, 'type': ent.label_} for ent in doc.ents]
    except Exception as e:
        # Still best-effort, but no longer a bare `except:`: KeyboardInterrupt and
        # SystemExit propagate, and the failure is reported instead of being hidden.
        print(f"Error extracting entities: {e}")
        return []

In [None]:
def extract_keywords(tfidf_row, feature_names, top_n=10):
    """Return the highest-weighted terms of one TF-IDF row.

    Args:
        tfidf_row: A single row of the TF-IDF matrix; must provide `.toarray()`.
        feature_names: Vocabulary terms, indexable by column position.
        top_n: Maximum number of terms to return.

    Returns:
        Up to `top_n` terms in descending weight order, zero-weight terms
        excluded. Any error is printed and [] is returned instead.
    """
    try:
        weights = tfidf_row.toarray().flatten()
        # Column positions from highest to lowest weight, truncated to top_n
        ranked_columns = weights.argsort()[::-1][:top_n]
        keywords = []
        for column in ranked_columns:
            if weights[column] > 0:
                keywords.append(feature_names[column])
        return keywords
    except Exception as e:
        print(f"Error extracting keywords: {e}")
        return []

In [None]:
# Enable tqdm's pandas integration; preprocess() calls `.progress_apply` on a Series.
tqdm.pandas() 
def preprocess(df):
    """Run the full NLP enrichment pipeline and return a new DataFrame.

    Stages, in order:
      1. 'Cleaned Text' is derived from 'Text' with clean_text() (skipped when
         there is no 'Text' column but 'Cleaned Text' already exists).
      2. Rows whose cleaned text has `min_text_length` tokens or fewer are dropped.
      3. 'Sentiment' and 'Sentiment Score' are added via analyze_sentiment().
      4. One sentiment column per CRYPTO_FEATURES key is added via extract_features().
      5. 'Entities' (extract_entities) and 'Keywords' (top TF-IDF terms) are
         added as JSON strings.

    NOTE(review): sentiment is scored on the cleaned text, from which stop words
    have been removed; NLTK's English stop-word list appears to include negations
    such as 'not' — confirm this is the intended input for VADER.

    Args:
        df: Input frame with a 'Text' column (or a ready-made 'Cleaned Text').

    Returns:
        A new DataFrame with the surviving rows and the added columns. The
        input frame is not modified.

    Raises:
        KeyError: if `df` has neither a 'Text' nor a 'Cleaned Text' column.
    """
    print("Starting comprehensive preprocessing...")

    # Work on a copy so the caller's frame does not silently gain a 'Cleaned Text' column.
    df = df.copy()

    # Clean text
    if 'Text' in df.columns:
        print("Cleaning text...")
        df['Cleaned Text'] = df['Text'].apply(clean_text)
    elif 'Cleaned Text' not in df.columns:
        raise KeyError("preprocess() requires a 'Text' or 'Cleaned Text' column")

    # Filter short texts
    print("Filtering out short texts...")
    df = df[df['Cleaned Text'].str.split().str.len() > min_text_length].copy()

    # Nothing survived: TF-IDF cannot be fitted on an empty corpus, so return
    # the empty frame with the same columns a normal run would produce.
    if df.empty:
        print("No records left after filtering; skipping the remaining stages.")
        empty_columns = (
            [('Sentiment', object), ('Sentiment Score', float)]
            + [(feature, float) for feature in CRYPTO_FEATURES]
            + [('Entities', object), ('Keywords', object)]
        )
        for column, dtype in empty_columns:
            df[column] = pd.Series(index=df.index, dtype=dtype)
        print(f"Preprocessing complete. {len(df)} records remain.")
        return df

    # Sentiment analysis
    print("Analyzing sentiment...")
    sentiment_results = df['Cleaned Text'].apply(analyze_sentiment)
    df['Sentiment'] = sentiment_results.apply(lambda x: x[0])
    df['Sentiment Score'] = sentiment_results.apply(lambda x: x[1])

    # Feature scores
    print("Extracting feature scores...")
    feature_scores = df['Cleaned Text'].swifter.apply(extract_features)
    for feature in CRYPTO_FEATURES.keys():
        df[feature] = feature_scores.apply(lambda x: x.get(feature, 0.0))

    # Extract keywords using TF-IDF (fit once)
    print("Fitting TF-IDF vectorizer...")
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=100)
    X = vectorizer.fit_transform(df['Cleaned Text'])
    feature_names = vectorizer.get_feature_names_out()

    print("Extracting entities with progress bar...")
    df['Entities'] = df['Cleaned Text'].progress_apply(lambda x: json.dumps(extract_entities(x)))

    print("Extracting keywords with progress bar...")
    # Row i of X corresponds to the i-th row of df (both come from the same filtered frame).
    df['Keywords'] = [json.dumps(extract_keywords(X[i], feature_names)) for i in tqdm(range(X.shape[0]))]

    print(f"Preprocessing complete. {len(df)} records remain.")
    return df

In [None]:
def detect_duplicates(df, text_column='Cleaned Text', threshold=None):
    """Flag and remove near-duplicate rows based on TF-IDF cosine similarity.

    Every pair of non-null texts whose cosine similarity exceeds `threshold`
    is treated as a duplicate pair; the strictly shorter text of the pair is
    flagged, and on equal length the later one is flagged.

    Side effect: a boolean 'is_duplicate' column is written to `df` itself,
    so the flagged rows can still be inspected on the input frame.

    Args:
        df: Frame containing `text_column`.
        text_column: Column holding the texts to compare.
        threshold: Similarity cut-off. None means the module-level
            `duplicate_threshold`; an explicit 0 is honoured.

    Returns:
        A copy of `df` without the flagged rows. When fewer than two texts are
        present, or the corpus is too small for the vectorizer, `df` is
        returned unchanged.
    """
    # `is None` rather than `or`, so that an explicit threshold of 0 is not discarded.
    if threshold is None:
        threshold = duplicate_threshold
    print(f"Detecting near-duplicate content with threshold {threshold}...")

    # Non-null texts, plus the row position in `df` that each of them came from.
    # Positions in `texts` are NOT index labels: the index may have gaps after
    # filtering and null rows are skipped, so they must be mapped back explicitly.
    texts = df[text_column].dropna().tolist()
    row_positions = np.flatnonzero(df[text_column].notna().to_numpy())
    if len(texts) < 2:
        return df

    # Create TF-IDF matrix
    try:
        tfidf_vectorizer = TfidfVectorizer(min_df=2, max_df=0.95)
        tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    except ValueError as e:
        # Raised for very small corpora (min_df/max_df cannot both be satisfied,
        # or no term survives pruning); there is nothing to compare in that case.
        print(f"Skipping duplicate detection: {e}")
        return df

    duplicate_positions = set()

    # For large datasets, process in batches to avoid memory issues
    batch_size = 1000
    for start in tqdm(range(0, len(texts), batch_size)):
        stop = min(start + batch_size, len(texts))

        # Cosine similarity between this batch and all documents
        similarities = cosine_similarity(tfidf_matrix[start:stop], tfidf_matrix)

        for offset, sim_scores in enumerate(similarities):
            doc_idx = start + offset
            # Only later documents are inspected, so each pair is judged once
            # and a document is never compared with itself.
            later_similar = np.flatnonzero(sim_scores[doc_idx + 1:] > threshold) + doc_idx + 1

            for similar_idx in later_similar:
                # Keep the document with more content
                if len(texts[doc_idx]) < len(texts[similar_idx]):
                    duplicate_positions.add(doc_idx)
                else:
                    duplicate_positions.add(int(similar_idx))

    # Translate positions within `texts` back to row positions of `df`
    is_duplicate = np.zeros(len(df), dtype=bool)
    if duplicate_positions:
        is_duplicate[row_positions[sorted(duplicate_positions)]] = True

    # Create a duplicate flag
    df['is_duplicate'] = is_duplicate

    # Filter out duplicates (copy, so later column assignments do not hit a view)
    df_no_duplicates = df[~is_duplicate].copy()

    print(f"Removed {len(duplicate_positions)} duplicate records. {len(df_no_duplicates)} records remaining.")
    return df_no_duplicates

In [None]:
# Load the raw export and discard its existing 'Sentiment' column:
# use Vader Sentiment Analysis (recomputed in preprocess) instead of TextBlob's.
df = pd.read_csv('crypto_exchange_data_raw.csv').drop(columns=['Sentiment'])
df

In [None]:
# Preprocess data (includes cleaning, sentiment analysis, etc.)
# Adds the cleaned-text, sentiment, per-feature, entity and keyword columns and drops short texts.
processed_df = preprocess(df)
processed_df

In [None]:
# Remove duplicates
# Uses the default text column ('Cleaned Text') and the module-level duplicate_threshold.
deduplicated_df = detect_duplicates(processed_df)
deduplicated_df

In [None]:
# Replace every remaining NaN with an empty string (applies to all columns)
deduplicated_df = deduplicated_df.fillna('')  # to avoid nulls when exporting to solr

# The ID is the string form of the frame's index label
deduplicated_df['ID'] = deduplicated_df.index.astype(str) # solr needs id column in str not int

In [None]:
# Write the final frame without the index column (the 'ID' column already carries it as a string).
# NOTE(review): assumes ../data exists relative to the notebook's working directory — confirm.
deduplicated_df.to_csv('../data/crypto_exchange_data_preprocessed.csv', index=False)