In [None]:
pip install numpy==1.24 scipy==1.13 smart_open


In [None]:
pip install gensim==4.3.3

In [None]:
pip install pandas

In [None]:
pip install nltk

In [None]:
pip install transformers

In [None]:
pip install torch

In [None]:
pip install scikit-learn

In [None]:
pip install imbalanced-learn


In [None]:
pip install nlpaug


In [None]:
pip install sentencepiece

In [None]:
#python 3.9
import pandas as pd
import numpy as np
import re
import torch
from transformers import AutoTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import nltk
import os
import pickle
from sklearn.utils.class_weight import compute_class_weight
import nlpaug.augmenter.word as naw
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords, opinion_lexicon

# Download required NLTK resources.
# `nltk` is installed unpinned above, so either generation of resource ids may
# be needed: recent NLTK releases look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng', older ones the legacy 'punkt' and
# 'averaged_perceptron_tagger'. Both are requested. nltk.download() reports an
# id it cannot fetch and returns False rather than raising, so the extra
# requests do not break the cell.
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'vader_lexicon',
    'opinion_lexicon',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource)


In [None]:
# Enhanced sentiment-aware text cleaning
def _mark_elongated_word(match):
    """Collapse runs of 3+ identical letters in one word to two and tag the word.

    Only letters count as elongation, so numbers such as "1000" are left
    alone. The marker is appended after the whole word so the word is not
    split in the middle.
    """
    word = match.group(0)
    shortened = re.sub(r'([^\W\d_])\1{2,}', r'\1\1', word)
    if shortened == word:
        return word
    return shortened + ' <ELONGATED>'


def enhanced_sentiment_clean(text):
    """Normalise a raw social-media text while keeping sentiment cues.

    Steps, in order:
      1. collect emoticons such as ``:)``, ``;-(``, ``=D`` from the raw text;
      2. strip URLs and @mentions, keep hashtag words without the ``#``;
      3. expand ``can't`` / ``won't`` / ``n't`` (case-insensitively, also with a
         typographic apostrophe);
      4. fuse a negation word with the following word (``not good`` ->
         ``not_good`` after lowercasing);
      5. lowercase, mark repeated ``!``/``?``/``.`` with ``<EMPHASIS>`` and
         elongated words with ``<ELONGATED>``;
      6. collapse whitespace and append the collected emoticons in their
         original case (lowercasing would otherwise turn ``:D`` into ``:d``).

    Args:
        text: the raw text; any non-string value (e.g. NaN) is accepted.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Save emoticons - they contain strong sentiment signals
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    # Remove URLs. 'www' must start a token and be followed by a dot so that
    # elongated interjections such as "awwww" are not mistaken for a URL.
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)
    # Remove user mentions
    text = re.sub(r'@\w+', '', text)
    # Keep hashtags content but remove # symbol
    text = re.sub(r'#(\w+)', r'\1', text)

    # Typographic apostrophes would otherwise bypass contraction handling.
    text = text.replace('\u2019', "'")

    # Handle contractions. Matching is case-insensitive because lowercasing
    # happens later; without it "Can't" fell through to the generic n't rule
    # and became "Ca not".
    text = re.sub(r"can't", "cannot", text, flags=re.IGNORECASE)
    text = re.sub(r"won't", "will not", text, flags=re.IGNORECASE)
    text = re.sub(r"n't", " not", text, flags=re.IGNORECASE)

    # Handle negations with special marking (case-insensitive so "NOT"/"Never"
    # are marked too). The marker is lowercased to "not_" by the next step.
    text = re.sub(r'\b(?:not|no|never)\b\s+(\w+)', r'NOT_\1', text, flags=re.IGNORECASE)

    text = text.lower().strip()

    # Mark repeated punctuation (often indicates intensity)
    text = re.sub(r'([!?.]){2,}', r'\1 <EMPHASIS>', text)

    # Reduce elongated words (they indicate emphasis) but mark them
    text = re.sub(r'\w+', _mark_elongated_word, text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Append emoticons in their original (case-preserved) form
    if emoticons:
        text += ' ' + ' '.join(emoticons)

    return text

# Process neutral samples specifically to improve class distinction
def process_neutral_samples(text):
    """Rewrite a text so that explicit sentiment vocabulary is neutralised.

    Three case-sensitive, whole-word substitutions are applied in sequence:
    intensifiers are deleted (the surrounding spaces are kept), a small set of
    positive adjectives becomes ``positive_term`` and a small set of negative
    adjectives becomes ``negative_term``.
    """
    substitutions = (
        # Remove intensifiers that might introduce sentiment bias
        (r'\b(very|really|extremely|quite)\b', ''),
        # Normalize sentiment-laden words
        (r'\b(good|great|excellent|amazing)\b', 'positive_term'),
        (r'\b(bad|terrible|awful|horrible)\b', 'negative_term'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Sentiment-preserving augmentation
sia = SentimentIntensityAnalyzer()
# Augmenters are created on first use and then reused, instead of being
# rebuilt for every text that is augmented.
_AUGMENTER_CACHE = {}


def _get_augmenters():
    """Return the (synonym, word-swap) nlpaug augmenters, building them once."""
    if not _AUGMENTER_CACHE:
        _AUGMENTER_CACHE['synonym'] = naw.SynonymAug(aug_src='wordnet')
        _AUGMENTER_CACHE['swap'] = naw.RandomWordAug(action="swap")
    return _AUGMENTER_CACHE['synonym'], _AUGMENTER_CACHE['swap']


def _as_text(augmented, fallback):
    """Normalise an nlpaug result to a single string.

    Depending on the nlpaug version, ``augment()`` returns either a string or
    a list of strings; callers here always need one string.
    """
    if isinstance(augmented, (list, tuple)):
        augmented = augmented[0] if augmented else fallback
    return augmented if isinstance(augmented, str) else fallback


def sentiment_preserving_augmentation(text, sentiment_class, max_attempts=5):
    """Augment text while preserving sentiment.

    The text is rewritten with WordNet synonym replacement followed by random
    word swapping. A candidate is accepted when its VADER compound score is
    within 0.2 of the original's; otherwise another candidate is drawn.

    Args:
        text: text to augment. Non-strings and texts shorter than 10
            characters are returned unchanged.
        sentiment_class: label of the text. Currently unused; kept so existing
            callers keep working.
        max_attempts: maximum number of candidates to try.

    Returns:
        The accepted augmented string, or ``text`` itself when no candidate
        was accepted (callers detect this by comparing with the input).
    """
    if not isinstance(text, str) or len(text) < 10:
        return text

    # Sentiment of the untouched text, used as the reference for every attempt
    original_scores = sia.polarity_scores(text)

    synonym_aug, random_aug = _get_augmenters()

    last_error = None
    for _ in range(max_attempts):
        try:
            # Apply augmentation - first synonyms, then word swapping.
            # Results are unwrapped to plain strings so that VADER and the
            # callers' equality checks never receive a list.
            augmented = _as_text(synonym_aug.augment(text), text)
            augmented = _as_text(random_aug.augment(augmented), augmented)

            # Check sentiment preservation
            new_scores = sia.polarity_scores(augmented)
        except Exception as exc:
            # Best effort: a failed attempt is retried, but remembered so a
            # systematic failure (e.g. a missing NLTK resource) is reported
            # instead of silently disabling augmentation.
            last_error = exc
            continue

        # If compound scores are within 0.2, sentiment is likely preserved
        if abs(original_scores['compound'] - new_scores['compound']) < 0.2:
            return augmented

    if last_error is not None:
        import warnings
        warnings.warn(
            "sentiment_preserving_augmentation: augmentation raised "
            f"{type(last_error).__name__}; returning the original text"
        )

    # If all attempts fail, return original
    return text


In [None]:
# Enhanced tokenization for preserving sentiment-relevant text portions
def tokenize_with_sentiment_focus(texts, tokenizer, max_length=128):
    """Tokenize with focus on beginning and end of text (where sentiment often appears).

    Texts that fit into ``max_length`` (including the two special tokens) are
    encoded by calling ``tokenizer`` directly. Longer texts are encoded as
    ``[CLS] head [SEP] tail [SEP]`` where ``head`` and ``tail`` are the first
    and last tokens of the text, sized so the row is exactly ``max_length``
    long and the closing separator is never truncated.

    Args:
        texts: iterable of strings.
        tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``__call__`` and the ``cls_token`` / ``sep_token`` /
            ``cls_token_id`` / ``sep_token_id`` / ``pad_token_id`` attributes
            (a Hugging Face tokenizer in this notebook).
        max_length: length of every returned row.

    Returns:
        dict with ``input_ids`` and ``attention_mask`` arrays, one row per
        input text. A text that cannot be tokenized is reported on stdout and
        encoded as an empty sequence (``[CLS] [SEP]`` followed by padding).
    """
    input_id_rows = []
    attention_rows = []

    # Three positions are reserved for [CLS] and the two [SEP] tokens; the
    # rest is split between head and tail (head gets the odd token).
    content_budget = max(max_length - 3, 0)
    head_len = (content_budget + 1) // 2
    tail_len = content_budget // 2

    for text in texts:
        try:
            tokens = tokenizer.tokenize(text)
            if len(tokens) <= max_length - 2:
                encoding = tokenizer(
                    text,
                    padding='max_length',
                    truncation=True,
                    max_length=max_length,
                    return_tensors='np'
                )
                ids = encoding['input_ids'][0]
                mask = encoding['attention_mask'][0]
            else:
                # For longer texts keep the beginning and the end
                first_chunk = tokens[:head_len]
                # Explicit start index: tokens[-0:] would be the whole list
                last_chunk = tokens[len(tokens) - tail_len:] if tail_len else []

                # Create combined tokens with special tokens
                combined_tokens = (
                    [tokenizer.cls_token] + first_chunk
                    + [tokenizer.sep_token] + last_chunk
                    + [tokenizer.sep_token]
                )
                combined_ids = tokenizer.convert_tokens_to_ids(combined_tokens)
                attention_mask = [1] * len(combined_ids)

                # Pad to max_length (no-op when the budget is fully used)
                pad_length = max_length - len(combined_ids)
                combined_ids = combined_ids + [tokenizer.pad_token_id] * pad_length
                attention_mask = attention_mask + [0] * pad_length

                # Safety truncation; only relevant for max_length < 3
                ids = np.array(combined_ids[:max_length])
                mask = np.array(attention_mask[:max_length])
        except Exception as e:
            # Fallback for errors: an empty sequence with the special tokens
            # first and padding (unattended) after them
            print(f"Error tokenizing text: {e}")
            ids = np.array(
                [tokenizer.cls_token_id, tokenizer.sep_token_id]
                + [tokenizer.pad_token_id] * (max_length - 2)
            )
            mask = np.array([1, 1] + [0] * (max_length - 2))

        input_id_rows.append(ids)
        attention_rows.append(mask)

    return {
        'input_ids': np.array(input_id_rows),
        'attention_mask': np.array(attention_rows)
    }

# LDA feature extraction with better convergence
def extract_lda_features(train_texts, val_texts, test_texts, n_topics=15, random_state=42,
                         model_path='lda_model.pkl', vectorizer_path='count_vectorizer.pkl'):
    """Extract LDA topic distributions from text data.

    A bag-of-words vocabulary and an LDA model are fitted on the training
    texts only; validation and test texts are merely transformed.

    Args:
        train_texts, val_texts, test_texts: lists of strings.
        n_topics: number of LDA topics (columns of the returned arrays).
        random_state: seed passed to the LDA model.
        model_path: where the fitted LDA model is pickled; ``None`` skips
            saving. The default matches the previously hard-coded name, so
            repeated calls with the default overwrite the same file.
        vectorizer_path: same, for the fitted ``CountVectorizer``.

    Returns:
        ``(train_topics, val_topics, test_topics, lda_model, count_vectorizer)``
        where each ``*_topics`` array has one row per text and ``n_topics``
        columns.
    """
    print(f"\nExtracting LDA topic features (n_topics={n_topics})...")
    print("Creating document-term matrix...")

    # Report the size of the opinion lexicon. This is informational only: the
    # lexicon is not used when building the vocabulary below.
    try:
        positive_words = set(opinion_lexicon.positive())
        negative_words = set(opinion_lexicon.negative())
        sentiment_words = positive_words.union(negative_words)
        print(f"Loaded {len(sentiment_words)} sentiment words from lexicon")
    except Exception as exc:
        # Best effort, but never swallow KeyboardInterrupt/SystemExit and
        # always say why the lexicon could not be read.
        print(f"Warning: Could not load sentiment lexicon ({exc})")

    # Create vectorizer with expanded vocabulary
    count_vectorizer = CountVectorizer(
        max_df=0.95,
        min_df=2,
        stop_words='english',
        max_features=2500
    )

    train_dtm = count_vectorizer.fit_transform(train_texts)

    # Transform validation and test texts with the training vocabulary
    val_dtm = count_vectorizer.transform(val_texts)
    test_dtm = count_vectorizer.transform(test_texts)

    # Fit LDA model on training data only with improved settings
    print("Fitting LDA model...")
    lda_model = LatentDirichletAllocation(
        n_components=n_topics,
        random_state=random_state,
        max_iter=50,
        n_jobs=-1,
        evaluate_every=5,
        learning_method='online',
        learning_offset=50.0,
        batch_size=128,
        verbose=1
    )
    lda_model.fit(train_dtm)

    # Transform data to get topic distributions
    print("Extracting topic distributions...")
    train_topic_dists = lda_model.transform(train_dtm)
    val_topic_dists = lda_model.transform(val_dtm)
    test_topic_dists = lda_model.transform(test_dtm)

    # Print top words for each topic
    feature_names = count_vectorizer.get_feature_names_out()
    print("\nTop 10 words for each topic:")
    for topic_idx, topic in enumerate(lda_model.components_):
        top_words_idx = topic.argsort()[:-11:-1]  # ten largest weights, descending
        top_words = [feature_names[i] for i in top_words_idx]
        print(f"Topic #{topic_idx}: {', '.join(top_words)}")

    # Print log-likelihood to evaluate convergence
    print(f"Final log-likelihood: {lda_model.score(train_dtm)}")

    # Save LDA model and vectorizer for later use
    if model_path is not None:
        with open(model_path, 'wb') as f:
            pickle.dump(lda_model, f)
    if vectorizer_path is not None:
        with open(vectorizer_path, 'wb') as f:
            pickle.dump(count_vectorizer, f)

    print(f"LDA extraction complete. Feature shape: {train_topic_dists.shape[1]} topics per document")

    return train_topic_dists, val_topic_dists, test_topic_dists, lda_model, count_vectorizer


In [None]:
def _load_combined_dataset(data_dir):
    """Load the three raw CSV files and return one cleaned text/sentiment frame.

    Labels are harmonised to 'Positive' / 'Negative' / 'Neutral', the text of
    every source is passed through ``enhanced_sentiment_clean``, very short
    texts, exact duplicates and rows without a label are dropped.
    """
    tweet_columns = ['Tweet id', 'topic', 'sentiment', 'Tweet content']

    # Load Twitter dataset
    twitter_train = pd.read_csv(os.path.join(data_dir, 'twitter_training.csv'),
                                header=None, names=tweet_columns)
    twitter_val = pd.read_csv(os.path.join(data_dir, 'twitter_validation.csv'),
                              header=None, names=tweet_columns)
    twitter = pd.concat([twitter_train, twitter_val], ignore_index=True)

    # Load climate dataset
    climate_text = pd.read_csv(os.path.join(data_dir, 'twitter_sentiment_data.csv'))

    # Load YouTube dataset
    youtube_comments = pd.read_csv(os.path.join(data_dir, 'YoutubeCommentsDataSet.csv'))

    # Drop irrelevant rows from Twitter (.copy() so columns can be added safely)
    twitter = twitter[twitter['sentiment'] != 'Irrelevant'].copy()

    # Map YouTube labels to standardized format
    youtube_comments['Sentiment'] = youtube_comments['Sentiment'].map({
        'positive': 'Positive',
        'negative': 'Negative',
        'neutral': 'Neutral'
    })

    # Remove problem entries (label 2 has no mapping). This must happen before
    # the mapping: afterwards the column holds strings/NaN and the comparison
    # with 2 would never match.
    climate_text = climate_text[climate_text['sentiment'] != 2].copy()
    climate_text['sentiment'] = climate_text['sentiment'].map({
        -1: 'Negative',
        0: 'Neutral',
        1: 'Positive'
    })

    print("Applying enhanced sentiment-aware text cleaning...")
    # Apply enhanced preprocessing to better maintain sentiment markers
    twitter['text'] = twitter['Tweet content'].apply(enhanced_sentiment_clean)
    youtube_comments['text'] = youtube_comments['Comment'].apply(enhanced_sentiment_clean)
    climate_text['text'] = climate_text['message'].apply(enhanced_sentiment_clean)

    # Prepare subsets with standardized column names
    twitter_subset = twitter[['text', 'sentiment']]
    youtube_subset = youtube_comments[['text', 'Sentiment']].rename(columns={'Sentiment': 'sentiment'})
    climate_subset = climate_text[['text', 'sentiment']]

    # Combine all datasets
    print("Concatenating datasets...")
    combined_data = pd.concat([twitter_subset, youtube_subset, climate_subset], ignore_index=True)

    # Remove empty texts and duplicates
    combined_data = combined_data[combined_data['text'].str.len() > 5].drop_duplicates()

    # Identify and drop rows with NaN sentiment values
    nan_sentiment = combined_data[combined_data['sentiment'].isna()]
    print(f"Found {len(nan_sentiment)} rows with NaN sentiment values")
    if len(nan_sentiment) > 0:
        print("Sample rows with NaN sentiment:")
        print(nan_sentiment.head())

    combined_data = combined_data.dropna(subset=['sentiment']).copy()
    print(f"After dropping NaNs: {len(combined_data)} rows remain")
    return combined_data


def _augment_minority_classes(train_data, lda_model, count_vectorizer, topic_columns,
                              target_ratio=0.8, random_state=42):
    """Return a list of frames with text-augmented rows for the minority classes.

    Each minority class is topped up towards ``target_ratio`` of the majority
    class size, but never by more than its own size (each sampled row is
    augmented once). Rows whose augmentation returned the original text are
    skipped. Topic columns are recomputed from the augmented text.
    """
    train_class_dist = train_data['sentiment_encoded'].value_counts()
    print("Train set class distribution before balancing:")
    print(train_class_dist)

    majority_class = train_class_dist.idxmax()
    target_count = int(train_class_dist[majority_class] * target_ratio)

    print("Applying sentiment-preserving text augmentation for minority classes...")
    augmented_frames = []
    for minority_class in train_class_dist.index:
        if minority_class == majority_class:
            continue

        minority_samples = train_data[train_data['sentiment_encoded'] == minority_class]
        samples_needed = max(0, min(target_count - len(minority_samples), len(minority_samples)))
        if samples_needed == 0:
            continue

        print(f"Class {minority_class}: Generating {samples_needed} augmented samples")

        # Randomly select samples to augment
        selected = minority_samples.sample(n=samples_needed, random_state=random_state)
        new_texts = [
            sentiment_preserving_augmentation(text, label)
            for text, label in zip(selected['text'], selected['sentiment'])
        ]

        # Skip rows where augmentation didn't change the text
        changed = np.array(
            [new != old for new, old in zip(new_texts, selected['text'])],
            dtype=bool,
        )
        if not changed.any():
            continue

        augmented = selected[changed].copy()
        augmented['text'] = [text for text, keep in zip(new_texts, changed) if keep]

        # Recompute LDA topics for the augmented texts in one batch so the
        # topic features stay consistent with the new text
        topic_dists = lda_model.transform(
            count_vectorizer.transform(augmented['text'].tolist())
        )
        for i, col in enumerate(topic_columns):
            augmented[col] = topic_dists[:, i]

        augmented_frames.append(augmented)

    return augmented_frames


def _smote_balance(augmented_train_data, topic_columns, random_state=42):
    """Balance classes with SMOTE on the topic features and attach text to the new rows.

    SMOTE only produces topic vectors. Every synthetic vector is paired with
    the (augmented) text and label of a randomly chosen real row of the same
    class; the choice is seeded so the output is reproducible.
    """
    X_topics = augmented_train_data[topic_columns].to_numpy(dtype=float)
    y = augmented_train_data['sentiment_encoded'].values

    X_resampled, y_resampled = SMOTE(random_state=random_state).fit_resample(X_topics, y)

    # Original samples come first in the SMOTE output; synthetic ones follow
    original_count = min(len(augmented_train_data), X_resampled.shape[0])
    smote_train_data = augmented_train_data.iloc[:original_count].copy()

    synthetic_count = len(y_resampled) - original_count
    if synthetic_count <= 0:
        return smote_train_data

    print(f"Creating {synthetic_count} synthetic samples with SMOTE topic distributions")

    # Row positions per class, computed once instead of filtering the frame
    # for every synthetic sample
    rng = np.random.default_rng(random_state)
    positions_by_class = {cls: np.flatnonzero(y == cls) for cls in np.unique(y)}
    donor_positions = [
        int(rng.choice(positions_by_class[cls]))
        for cls in y_resampled[original_count:]
    ]

    synthetic_df = augmented_train_data.iloc[donor_positions].copy()
    synthetic_df['text'] = [
        sentiment_preserving_augmentation(text, label)
        for text, label in zip(synthetic_df['text'], synthetic_df['sentiment'])
    ]

    # Add the SMOTE-generated topic distributions
    synthetic_topics = X_resampled[original_count:]
    for i, col in enumerate(topic_columns):
        synthetic_df[col] = synthetic_topics[:, i]

    return pd.concat([smote_train_data, synthetic_df], ignore_index=True)


def main():
    """Run the full preprocessing pipeline and write all artifacts to the working directory.

    Stages: load and clean the raw datasets, encode labels, split into
    train/val/test, fit LDA topic features on the training split, balance the
    training split (text augmentation, then SMOTE on topic features), compute
    class weights, and pre-tokenize every split. Any exception is printed with
    its traceback and re-raised.
    """
    # Define data directories using proper path handling
    DATA_DIR = 'raw data'
    print("Loading datasets...")
    try:
        combined_data = _load_combined_dataset(DATA_DIR)

        # Encode sentiment labels
        label_encoder = LabelEncoder()
        combined_data['sentiment_encoded'] = label_encoder.fit_transform(combined_data['sentiment'])
        sentiment_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
        print("Sentiment mapping:", sentiment_mapping)

        # Analyze class distribution
        class_distribution = combined_data['sentiment_encoded'].value_counts()
        print("\nClass distribution before handling imbalance:")
        print(class_distribution)
        print(f"Class ratios: {class_distribution / len(combined_data)}")

        # Split data into train/val/test sets (before augmentation to prevent data leakage)
        print("\nCreating data splits...")
        train_data, test_data = train_test_split(
            combined_data, test_size=0.2, random_state=42,
            stratify=combined_data['sentiment_encoded']
        )
        train_data, val_data = train_test_split(
            train_data, test_size=0.25, random_state=42,
            stratify=train_data['sentiment_encoded']
        )
        # Independent copies: columns are added to each split below
        train_data, val_data, test_data = train_data.copy(), val_data.copy(), test_data.copy()

        print(f"Original train set: {len(train_data)} samples")
        print(f"Validation set: {len(val_data)} samples")
        print(f"Test set: {len(test_data)} samples")

        # Apply class-specific preprocessing for neutral class.
        # NOTE(review): this rewrite is chosen by the *label*, so it is applied
        # to the training split only. Applying it to validation/test text (as
        # was done when it ran before the split) writes the label into the
        # evaluation inputs and inflates the reported metrics.
        print("Applying class-specific preprocessing for neutral class...")
        neutral_mask = train_data['sentiment'] == 'Neutral'
        train_data.loc[neutral_mask, 'text'] = train_data.loc[neutral_mask, 'text'].apply(process_neutral_samples)

        # Extract LDA features at multiple granularities
        print("\nExtracting LDA features at multiple granularities...")
        lda_topic_ranges = [15, 25]
        lda_n_topics = 15  # Default number of topics

        train_texts = train_data['text'].tolist()
        val_texts = val_data['text'].tolist()
        test_texts = test_data['text'].tolist()

        # Extract features for the default number of topics
        train_topic_dists, val_topic_dists, test_topic_dists, lda_model, count_vectorizer = extract_lda_features(
            train_texts, val_texts, test_texts, n_topics=lda_n_topics
        )

        # Store additional LDA features with different topic counts
        for n_topics in lda_topic_ranges:
            if n_topics == lda_n_topics:
                continue
            print(f"\nExtracting LDA features with {n_topics} topics...")
            train_topics_alt, val_topics_alt, test_topics_alt, _, _ = extract_lda_features(
                train_texts, val_texts, test_texts, n_topics=n_topics
            )
            # Save these additional features
            np.save(f'train_lda_topics_{n_topics}.npy', train_topics_alt)
            np.save(f'val_lda_topics_{n_topics}.npy', val_topics_alt)
            np.save(f'test_lda_topics_{n_topics}.npy', test_topics_alt)

        # The extra runs above write to the same pickle files; store the
        # default model again so the files on disk match the topic_* features
        # saved with the datasets.
        with open('lda_model.pkl', 'wb') as f:
            pickle.dump(lda_model, f)
        with open('count_vectorizer.pkl', 'wb') as f:
            pickle.dump(count_vectorizer, f)

        # Add LDA topic distributions as features to each dataset
        topic_columns = [f'topic_{i}' for i in range(lda_n_topics)]
        for i, col in enumerate(topic_columns):
            train_data[col] = train_topic_dists[:, i]
            val_data[col] = val_topic_dists[:, i]
            test_data[col] = test_topic_dists[:, i]

        # Save topic distributions separately
        np.save('train_lda_topics.npy', train_topic_dists)
        np.save('val_lda_topics.npy', val_topic_dists)
        np.save('test_lda_topics.npy', test_topic_dists)

        # Handle class imbalance
        print("\nHandling class imbalance...")
        augmented_frames = _augment_minority_classes(
            train_data, lda_model, count_vectorizer, topic_columns
        )

        # Concatenate augmented samples with the original training data
        if augmented_frames:
            augmented_train_data = pd.concat([train_data] + augmented_frames, ignore_index=True)
            print(f"After text augmentation, train set size: {len(augmented_train_data)} samples")

            # Check class distribution after text augmentation
            print("Class distribution after text augmentation:")
            print(augmented_train_data['sentiment_encoded'].value_counts())
        else:
            augmented_train_data = train_data.reset_index(drop=True)

        # Apply SMOTE to topic distributions, not raw text
        print("\nApplying SMOTE to topic distributions for better coherence...")
        train_data = _smote_balance(augmented_train_data, topic_columns)
        print(f"Final balanced train set size: {len(train_data)} samples")

        # Calculate final class distribution
        final_class_dist = train_data['sentiment_encoded'].value_counts()
        print("\nFinal class distribution after balancing:")
        print(final_class_dist)
        print(f"Final class ratios: {final_class_dist / len(train_data)}")

        # Calculate class weights for model training
        present_classes = np.unique(train_data['sentiment_encoded'])
        class_weights = compute_class_weight(
            'balanced',
            classes=present_classes,
            y=train_data['sentiment_encoded']
        )
        class_weight_dict = dict(zip(present_classes, class_weights))
        print("\nComputed class weights:", class_weight_dict)

        # Save class weights for model training
        with open('class_weights.pkl', 'wb') as f:
            pickle.dump(class_weight_dict, f)

        # Save datasets
        print("Saving preprocessed data...")
        train_data.to_csv('train_data_balanced.csv', index=False)
        val_data.to_csv('val_data.csv', index=False)
        test_data.to_csv('test_data.csv', index=False)

        # Pre-tokenize using Albert with enhanced tokenization
        print("Pre-tokenizing data with sentiment-aware tokenization...")
        tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")

        for name, dataset in [("train", train_data), ("val", val_data), ("test", test_data)]:
            print(f"Pre-tokenizing {name} set with sentiment focus...")

            # Use enhanced tokenization that preserves beginning and end portions
            encodings = tokenize_with_sentiment_focus(
                dataset['text'].tolist(),
                tokenizer,
                max_length=128
            )

            # Save tokenized data as numpy arrays
            np.save(f'{name}_input_ids.npy', encodings['input_ids'])
            np.save(f'{name}_attention_mask.npy', encodings['attention_mask'])
            np.save(f'{name}_labels.npy', dataset['sentiment_encoded'].values)

            # Also save LDA features separately for each split
            split_topic_columns = [col for col in dataset.columns if col.startswith('topic_')]
            if split_topic_columns:
                np.save(f'{name}_lda_features.npy', dataset[split_topic_columns].values)

        # Save label encoder for inference
        with open('sentiment_encoder.pkl', 'wb') as f:
            pickle.dump(label_encoder, f)

        print("Preprocessing complete with enhanced sentiment preservation, LDA feature extraction, and class balancing!")

    except Exception as e:
        print(f"Error during preprocessing: {e}")
        import traceback
        traceback.print_exc()
        raise

# Run the whole preprocessing pipeline when this cell (or an exported script)
# is executed as the main module; main() re-raises any failure after logging it.
if __name__ == "__main__":
    main()
