<a href="https://colab.research.google.com/github/LaZy-Wolf/akki/blob/main/Untitled29.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install vaderSentiment


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [4]:
import pandas as pd
import re
import nltk
nltk.download('punkt_tab')
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import logging
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

try:
    import torch
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False
    print("PyTorch not available. Using CPU for inference.")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('wordnet', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
except:
    logger.warning("Some NLTK downloads failed")

class ImprovedSentimentAnalyzer:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.vader_analyzer = SentimentIntensityAnalyzer()
        self.bert_model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
        self.tokenizer = AutoTokenizer.from_pretrained(self.bert_model_name)
        device = 0 if (TORCH_AVAILABLE and torch.cuda.is_available()) else -1
        self.bert_classifier = pipeline(
            "sentiment-analysis",
            model=self.bert_model_name,
            tokenizer=self.tokenizer,
            device=device
        )
        custom_stop_words = {'nep', 'policy', 'education', 'tweet', 'twitter'}
        self.stop_words.update(custom_stop_words)

    def advanced_clean_tweet(self, text):
        if pd.isna(text):
            return ""
        text = str(text)
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'@\w+', '', text)
        text = re.sub(r'#(\w+)', r'\1', text)
        text = re.sub(r'[😀-🙏]', ' positive_emoji ', text)
        text = re.sub(r'[😞-😢]', ' negative_emoji ', text)
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'\s+', ' ', text)
        text = text.lower().strip()
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens
                 if token not in self.stop_words and len(token) > 2]
        return ' '.join(tokens)

    def get_vader_sentiment(self, text):
        scores = self.vader_analyzer.polarity_scores(text)
        compound = scores['compound']
        if compound >= 0.1:
            return 'positive', abs(compound)
        elif compound <= -0.1:
            return 'negative', abs(compound)
        else:
            return 'neutral', 1 - abs(compound)

    def get_bert_sentiment_batch(self, texts, batch_size=16):
        results = []
        for i in tqdm(range(0, len(texts), batch_size), desc="Processing BERT sentiment"):
            batch = texts[i:i+batch_size]
            batch = [text[:512] for text in batch]
            try:
                batch_results = self.bert_classifier(batch)
                for result in batch_results:
                    label = result['label']
                    if label in ['LABEL_2', 'POSITIVE']:
                        sentiment = 'positive'
                    elif label in ['LABEL_0', 'NEGATIVE']:
                        sentiment = 'negative'
                    else:
                        sentiment = 'neutral'
                    results.append((sentiment, result['score']))
            except Exception as e:
                logger.error(f"Error processing batch: {e}")
                results.extend([('neutral', 0.5)] * len(batch))
        return results

    def ensemble_sentiment(self, vader_sentiment, vader_score, bert_sentiment, bert_score):
        bert_weight = 0.7
        vader_weight = 0.3
        sentiment_scores = {
            'positive': 1,
            'neutral': 0,
            'negative': -1
        }
        vader_numeric = sentiment_scores[vader_sentiment] * vader_score
        bert_numeric = sentiment_scores[bert_sentiment] * bert_score
        ensemble_score = (bert_weight * bert_numeric + vader_weight * vader_numeric)
        if ensemble_score > 0.1:
            return 'positive', abs(ensemble_score)
        elif ensemble_score < -0.1:
            return 'negative', abs(ensemble_score)
        else:
            return 'neutral', 1 - abs(ensemble_score)

def enhanced_visualizations(df):
    plt.style.use('default')
    try:
        sns.set_palette("husl")
    except:
        pass
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    sns.countplot(data=df, x='ensemble_sentiment', ax=axes[0,0],
                  order=['positive', 'neutral', 'negative'])
    axes[0,0].set_title('Sentiment Distribution (Ensemble Model)', fontsize=14)
    axes[0,0].set_xlabel('Sentiment')
    axes[0,0].set_ylabel('Count')
    sns.histplot(data=df, x='ensemble_score', hue='ensemble_sentiment',
                 bins=30, ax=axes[0,1])
    axes[0,1].set_title('Sentiment Score Distribution', fontsize=14)
    if 'Date_of_tweet' in df.columns:
        try:
            df['Date_of_tweet'] = pd.to_datetime(df['Date_of_tweet'])
            daily_sentiment = df.groupby([df['Date_of_tweet'].dt.date, 'ensemble_sentiment']).size().unstack(fill_value=0)
            daily_sentiment.plot(kind='line', ax=axes[1,0], marker='o')
            axes[1,0].set_title('Daily Sentiment Trends', fontsize=14)
            axes[1,0].tick_params(axis='x', rotation=45)
        except Exception as e:
            logger.warning(f"Could not create time series plot: {e}")
            axes[1,0].text(0.5, 0.5, 'Time series data not available',
                          horizontalalignment='center', transform=axes[1,0].transAxes)
    if 'vader_sentiment' in df.columns and 'bert_sentiment' in df.columns:
        comparison_data = pd.DataFrame({
            'VADER': df['vader_sentiment'].value_counts(),
            'BERT': df['bert_sentiment'].value_counts(),
            'Ensemble': df['ensemble_sentiment'].value_counts()
        }).fillna(0)
        comparison_data.plot(kind='bar', ax=axes[1,1])
        axes[1,1].set_title('Model Comparison', fontsize=14)
        axes[1,1].tick_params(axis='x', rotation=45)
    plt.tight_layout()
    plt.savefig('comprehensive_sentiment_analysis.png', dpi=300, bbox_inches='tight')
    plt.close()
    for sentiment in ['positive', 'negative', 'neutral']:
        sentiment_text = ' '.join(df[df['ensemble_sentiment'] == sentiment]['cleaned_text'])
        if sentiment_text:
            wordcloud = WordCloud(
                width=1200,
                height=600,
                background_color='white',
                max_words=100,
                collocations=False,
                colormap='viridis'
            ).generate(sentiment_text)
            plt.figure(figsize=(12, 6))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title(f'Word Cloud - {sentiment.title()} Sentiment', fontsize=16)
            plt.savefig(f'{sentiment}_wordcloud_enhanced.png', dpi=300, bbox_inches='tight')
            plt.close()

def main():
    start_time = datetime.now()
    logger.info("Starting enhanced sentiment analysis...")
    try:
        analyzer = ImprovedSentimentAnalyzer()
        df = pd.read_csv('NEP_2020_english_tweet.csv')
        logger.info(f"Loaded {len(df)} tweets")
        df = df.dropna(subset=['Tweet'])
        df = df[df['Tweet'].str.len() > 10]
        logger.info(f"After cleaning: {len(df)} tweets")
        logger.info("Cleaning tweets...")
        df['cleaned_text'] = df['Tweet'].apply(analyzer.advanced_clean_tweet)
        df = df[df['cleaned_text'].str.len() > 0]
        logger.info("Running VADER sentiment analysis...")
        vader_results = df['cleaned_text'].apply(analyzer.get_vader_sentiment)
        df['vader_sentiment'] = vader_results.apply(lambda x: x[0])
        df['vader_score'] = vader_results.apply(lambda x: x[1])
        logger.info("Running BERT sentiment analysis...")
        bert_results = analyzer.get_bert_sentiment_batch(df['cleaned_text'].tolist())
        df['bert_sentiment'] = [result[0] for result in bert_results]
        df['bert_score'] = [result[1] for result in bert_results]
        logger.info("Creating ensemble predictions...")
        ensemble_results = df.apply(lambda row: analyzer.ensemble_sentiment(
            row['vader_sentiment'], row['vader_score'],
            row['bert_sentiment'], row['bert_score']
        ), axis=1)
        df['ensemble_sentiment'] = ensemble_results.apply(lambda x: x[0])
        df['ensemble_score'] = ensemble_results.apply(lambda x: x[1])
        logger.info("Creating visualizations...")
        enhanced_visualizations(df)
        logger.info("Calculating performance metrics...")
        vader_bert_agreement = (df['vader_sentiment'] == df['bert_sentiment']).mean()
        logger.info(f"VADER-BERT agreement: {vader_bert_agreement:.3f}")
        sentiment_dist = df['ensemble_sentiment'].value_counts(normalize=True)
        logger.info(f"Sentiment distribution: {sentiment_dist.to_dict()}")
        df.to_csv('enhanced_sentiment_results.csv', index=False)
        logger.info("Results saved to enhanced_sentiment_results.csv")
        summary = {
            'total_tweets': len(df),
            'processing_time': str(datetime.now() - start_time),
            'model_agreement': vader_bert_agreement,
            'sentiment_distribution': sentiment_dist.to_dict()
        }
        logger.info("Analysis complete!")
        return summary
    except Exception as e:
        logger.error(f"Error in main execution: {e}")
        raise

if __name__ == "__main__":
    summary = main()
    print(f"Analysis Summary: {summary}")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Processing BERT sentiment:   1%|          | 10/1140 [00:02<02:49,  6.66it/s]You seem to be using the pipelines sequentially on GPU. In order 

Analysis Summary: {'total_tweets': 18238, 'processing_time': '0:03:07.215278', 'model_agreement': np.float64(0.28352889571224915), 'sentiment_distribution': {'positive': 0.5214935848228972, 'neutral': 0.4016887816646562, 'negative': 0.07681763351244654}}
