# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import pipeline
import spacy
import re
from collections import Counter
from typing import List, Dict
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix
from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import os
import pickle
import warnings
warnings.filterwarnings('ignore')

class KosovoSerbiaAnalyzer:
    def __init__(self):
        """Initialize analyzer with components specific to Kosovo-Serbia context"""
        # Initialize sentiment analyzer
        try:
            self.sentiment_analyzer = pipeline("sentiment-analysis")
        except:
            print("Warning: Using basic sentiment analysis as transformer model couldn't be loaded")

        # Load spacy model
        try:
            self.nlp = spacy.load('en_core_web_sm')
        except:
            print("Warning: Install spaCy model with 'python -m spacy download en_core_web_sm'")

        # Define context-specific keywords
        self.keywords = {
            'conflict_terms': [
                'war', 'conflict', 'tension', 'violence', 'attack', 'terrorist',
                'terrorism', 'bomb', 'military', 'invasion', 'occupation',
                'fight', 'battle', 'aggression', 'genocide', 'cleansing'
            ],
            'ethnic_identifiers': [
                'serb', 'serbian', 'albanian', 'kosovo', 'kosova', 'albanians',
                'serbs', 'bosniak', 'yugoslav', 'balkan'
            ],
            'political_terms': [
                'nato', 'independence', 'territory', 'eu', 'recognition',
                'parliament', 'government', 'diplomatic', 'minister', 'president'
            ],
            'sentiment_indicators': {
                'positive': [
                    'peace', 'cooperation', 'dialogue', 'progress', 'development',
                    'support', 'recognition', 'agreement', 'friendship', 'collaboration'
                ],
                'negative': [
                    'genocide', 'cleansing', 'hate', 'terrorism', 'violation',
                    'invasion', 'occupation', 'threat', 'crisis', 'conflict'
                ]
            }
        }

    def clean_tweet(self, tweet: str) -> str:
        """Clean and preprocess tweet text"""
        if not isinstance(tweet, str):
            return ""

        # Convert to lowercase
        text = tweet.lower()

        # Remove URLs
        text = re.sub(r'https?://\S+|www\.\S+', '', text)

        # Remove mentions
        text = re.sub(r'@\w+', '', text)

        # Remove hashtags but keep the text
        text = re.sub(r'#', '', text)

        # Remove emojis and special characters
        text = re.sub(r'[^\w\s,.]', '', text)

        # Remove extra whitespace
        text = ' '.join(text.split())

        return text

    def analyze_sentiment(self, tweet: str) -> Dict:
        """Analyze sentiment with context-specific adjustments"""
        cleaned_text = self.clean_tweet(tweet)

        try:
            # Get base sentiment from transformer
            base_sentiment = self.sentiment_analyzer(cleaned_text)[0]
            sentiment_score = float(base_sentiment['score'])
            sentiment_label = base_sentiment['label']
        except:
            # Fallback to basic sentiment analysis
            positive_words = sum(1 for word in self.keywords['sentiment_indicators']['positive']
                               if word in cleaned_text)
            negative_words = sum(1 for word in self.keywords['sentiment_indicators']['negative']
                               if word in cleaned_text)
            sentiment_score = (positive_words - negative_words) / (positive_words + negative_words + 1)
            sentiment_label = 'POSITIVE' if sentiment_score > 0 else 'NEGATIVE'

        # Context adjustment
        context_score = 0
        for pos_term in self.keywords['sentiment_indicators']['positive']:
            if pos_term in cleaned_text:
                context_score += 0.1
        for neg_term in self.keywords['sentiment_indicators']['negative']:
            if neg_term in cleaned_text:
                context_score -= 0.1

        adjusted_score = min(1.0, max(0.0, sentiment_score + context_score))

        return {
            'sentiment': sentiment_label,
            'base_score': sentiment_score,
            'adjusted_score': adjusted_score,
            'context_modification': context_score
        }

    def analyze_ethnic_references(self, tweet: str) -> Dict:
        """Analyze references to ethnic groups and communities"""
        text = self.clean_tweet(tweet)

        references = {
            'serbian': 0,
            'albanian': 0,
            'other_ethnic': 0,
            'balanced': False
        }

        # Count Serbian references
        serbian_patterns = ['serb', 'serbian', 'serbia']
        for pattern in serbian_patterns:
            references['serbian'] += len(re.findall(r'\b' + pattern + r'\w*\b', text, re.I))

        # Count Albanian references
        albanian_patterns = ['albania', 'albanian', 'kosovo']
        for pattern in albanian_patterns:
            references['albanian'] += len(re.findall(r'\b' + pattern + r'\w*\b', text, re.I))

        # Count other ethnic references
        other_patterns = ['bosniak', 'croatian', 'macedonian']
        for pattern in other_patterns:
            references['other_ethnic'] += len(re.findall(r'\b' + pattern + r'\w*\b', text, re.I))

        # Check if reference is balanced
        references['balanced'] = abs(references['serbian'] - references['albanian']) <= 1

        return references

    def analyze_conflict_indicators(self, tweet: str) -> Dict:
        """Analyze conflict-related content and tone"""
        text = self.clean_tweet(tweet)

        indicators = {
            'conflict_terms': 0,
            'peace_terms': 0,
            'historical_reference': False,
            'current_events': False
        }

        # Count conflict terms
        for term in self.keywords['conflict_terms']:
            indicators['conflict_terms'] += len(re.findall(r'\b' + term + r'\w*\b', text, re.I))

        # Count peace terms
        peace_terms = ['peace', 'dialogue', 'cooperation', 'agreement', 'friendship']
        for term in peace_terms:
            indicators['peace_terms'] += len(re.findall(r'\b' + term + r'\w*\b', text, re.I))

        # Check for historical references
        historical_patterns = ['1999', '90s', 'history', 'historical', 'ottoman', 'yugoslavia']
        indicators['historical_reference'] = any(pattern in text for pattern in historical_patterns)

        # Check for current events references
        current_patterns = ['today', 'now', 'current', 'recent', 'latest']
        indicators['current_events'] = any(pattern in text for pattern in current_patterns)

        return indicators

    def analyze_dataset(self, tweets_df: pd.DataFrame) -> pd.DataFrame:
        """Analyze entire dataset of tweets"""
        results = []

        for _, row in tweets_df.iterrows():
            tweet = row['Tweet']

            # Skip invalid tweets
            if not isinstance(tweet, str):
                continue

            # Perform analyses
            sentiment = self.analyze_sentiment(tweet)
            ethnic_refs = self.analyze_ethnic_references(tweet)
            conflict_indicators = self.analyze_conflict_indicators(tweet)

            # Compile results
            analysis = {
                'original_tweet': tweet,
                'cleaned_tweet': self.clean_tweet(tweet),
                'sentiment_label': sentiment['sentiment'],
                'sentiment_score': sentiment['adjusted_score'],
                'serbian_references': ethnic_refs['serbian'],
                'albanian_references': ethnic_refs['albanian'],
                'other_ethnic_references': ethnic_refs['other_ethnic'],
                'balanced_reference': ethnic_refs['balanced'],
                'conflict_terms': conflict_indicators['conflict_terms'],
                'peace_terms': conflict_indicators['peace_terms'],
                'historical_reference': conflict_indicators['historical_reference'],
                'current_events': conflict_indicators['current_events']
            }

            results.append(analysis)

        return pd.DataFrame(results)

    def cross_validate_sentiment(self, tweets_df: pd.DataFrame, n_splits=5) -> Dict:
        """Score the sentiment analyzer against keyword labels over K folds.

        No model is fitted here. The reference label of a tweet is 1 when its
        lowercased text contains any positive indicator term and 0 otherwise,
        and the prediction is ``adjusted_score > 0.5``. Accuracy is reported
        per held-out fold of a shuffled ``KFold`` split (``random_state=42``).

        Args:
            tweets_df: DataFrame with a ``'Tweet'`` column; values are
                converted with ``str()`` before analysis.
            n_splits: Number of folds passed to ``KFold``.

        Returns:
            A dict with ``mean_accuracy``, ``std_accuracy`` and the list
            ``fold_scores``.
        """
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        positive_terms = self.keywords['sentiment_indicators']['positive']

        # Prepare data
        X = tweets_df['Tweet'].values
        y = np.array(
            [any(term in str(tweet).lower() for term in positive_terms) for tweet in X],
            dtype=bool,
        )

        # A prediction depends only on the tweet, never on the fold, so each
        # tweet is analyzed exactly once and the folds index into the result.
        # This also removes the training-split predictions, which were
        # computed and then discarded.
        predictions = np.array(
            [self.analyze_sentiment(str(tweet))['adjusted_score'] > 0.5 for tweet in X],
            dtype=bool,
        )

        # Accuracy on the held-out part of every fold.
        scores = [
            float(np.mean(predictions[test_idx] == y[test_idx]))
            for _, test_idx in kf.split(X)
        ]

        return {
            'mean_accuracy': np.mean(scores),
            'std_accuracy': np.std(scores),
            'fold_scores': scores
        }

    def analyze_topic_distribution(self, analysis_df: pd.DataFrame) -> Dict:
        """Analyze distribution of main topics in tweets"""
        topics = {
            'political': ['government', 'president', 'parliament', 'minister', 'election', 'policy'],
            'conflict': ['war', 'fight', 'battle', 'attack', 'military', 'weapon'],
            'ethnic': ['serbian', 'albanian', 'ethnic', 'minority', 'community'],
            'economic': ['business', 'economy', 'trade', 'development', 'investment'],
            'cultural': ['tradition', 'heritage', 'religion', 'culture', 'history'],
            'international': ['nato', 'eu', 'un', 'international', 'foreign']
        }

        topic_counts = {topic: 0 for topic in topics.keys()}

        for _, row in analysis_df.iterrows():
            text = row['cleaned_tweet'].lower()
            for topic, keywords in topics.items():
                if any(keyword in text for keyword in keywords):
                    topic_counts[topic] += 1

        # Calculate percentages
        total_tweets = len(analysis_df)
        topic_percentages = {
            topic: (count / total_tweets) * 100
            for topic, count in topic_counts.items()
        }

        return {
            'counts': topic_counts,
            'percentages': topic_percentages
        }

    def analyze_hashtag_patterns(self, tweets_df: pd.DataFrame) -> Dict:
        """Analyze hashtag usage patterns"""
        hashtag_pattern = r'#\w+'
        hashtags = []

        for tweet in tweets_df['Tweet']:
            if isinstance(tweet, str):
                hashtags.extend(re.findall(hashtag_pattern, tweet))

        hashtag_counts = Counter(hashtags)

        return {
            'most_common': hashtag_counts.most_common(10),
            'total_hashtags': len(hashtags),
            'unique_hashtags': len(hashtag_counts),
            'hashtag_frequency': dict(hashtag_counts)
        }

    def analyze_user_interactions(self, tweets_df: pd.DataFrame) -> Dict:
        """Analyze user interaction patterns"""
        mention_pattern = r'@\w+'
        mentions = []

        for tweet in tweets_df['Tweet']:
            if isinstance(tweet, str):
                mentions.extend(re.findall(mention_pattern, tweet))

        mention_counts = Counter(mentions)

        return {
            'most_mentioned': mention_counts.most_common(10),
            'total_mentions': len(mentions),
            'unique_mentions': len(mention_counts),
            'mention_frequency': dict(mention_counts)
        }

    def analyze_temporal_patterns(self, analysis_df: pd.DataFrame) -> Dict:
        """Analyze patterns over time in the dataset"""
        # Create rolling averages for different metrics
        window_size = 10

        temporal_patterns = {
            'sentiment_trend': analysis_df['sentiment_score'].rolling(window=window_size).mean(),
            'conflict_terms_trend': analysis_df['conflict_terms'].rolling(window=window_size).mean(),
            'peace_terms_trend': analysis_df['peace_terms'].rolling(window=window_size).mean(),
        }

        # Calculate correlation between different metrics
        correlations = {
            'sentiment_conflict': analysis_df['sentiment_score'].corr(analysis_df['conflict_terms']),
            'sentiment_peace': analysis_df['sentiment_score'].corr(analysis_df['peace_terms']),
            'conflict_peace': analysis_df['conflict_terms'].corr(analysis_df['peace_terms'])
        }

        return {
            'trends': temporal_patterns,
            'correlations': correlations
        }

    def analyze_language_complexity(self, analysis_df: pd.DataFrame) -> Dict:
        """Analyze complexity and sophistication of language used"""
        complexity_metrics = {
            'avg_tweet_length': [],
            'word_count': [],
            'unique_words': []
        }

        for tweet in analysis_df['cleaned_tweet']:
            words = tweet.split()
            complexity_metrics['avg_tweet_length'].append(len(tweet))
            complexity_metrics['word_count'].append(len(words))
            complexity_metrics['unique_words'].append(len(set(words)))

        return {
            'averages': {
                'avg_length': np.mean(complexity_metrics['avg_tweet_length']),
                'avg_words': np.mean(complexity_metrics['word_count']),
                'avg_unique': np.mean(complexity_metrics['unique_words'])
            },
            'distributions': complexity_metrics
        }

    def visualize_analysis(self, analysis_df: pd.DataFrame, save_path: str = None):
        """Generate a 2x2 overview figure of the analysis.

        Panels: sentiment score histogram, share of ethnic references,
        conflict-vs-peace term scatter (colored by sentiment score) and counts
        of historical / current / unspecified time references.

        Args:
            analysis_df: DataFrame with the columns ``sentiment_score``,
                ``serbian_references``, ``albanian_references``,
                ``other_ethnic_references``, ``conflict_terms``,
                ``peace_terms``, ``historical_reference`` and
                ``current_events``.
            save_path: Optional directory. When given it is created if needed
                and ``analysis_overview.png`` is written into it.
        """
        # Matplotlib 3.6 renamed the bundled seaborn style sheets to
        # 'seaborn-v0_8*' and later removed the old names, so use whichever
        # name this installation provides.
        for style_name in ('seaborn-v0_8', 'seaborn'):
            if style_name in plt.style.available:
                plt.style.use(style_name)
                break
        # Set the grid style before any axes exist so every panel picks it up.
        sns.set_style("whitegrid")

        # Create figure with subplots
        fig = plt.figure(figsize=(20, 15))

        # 1. Sentiment Distribution
        plt.subplot(2, 2, 1)
        sns.histplot(data=analysis_df, x='sentiment_score', bins=20)
        plt.title('Distribution of Sentiment Scores')
        plt.xlabel('Sentiment Score')
        plt.ylabel('Count')

        # 2. Ethnic References
        plt.subplot(2, 2, 2)
        ethnic_data = {
            'Serbian': analysis_df['serbian_references'].sum(),
            'Albanian': analysis_df['albanian_references'].sum(),
            'Other': analysis_df['other_ethnic_references'].sum()
        }
        if sum(ethnic_data.values()) > 0:
            plt.pie(list(ethnic_data.values()), labels=list(ethnic_data.keys()),
                    autopct='%1.1f%%')
        else:
            # A pie chart cannot be drawn from all-zero wedges.
            plt.text(0.5, 0.5, 'No ethnic references found', ha='center', va='center')
            plt.axis('off')
        plt.title('Distribution of Ethnic References')

        # 3. Conflict vs Peace Terms
        plt.subplot(2, 2, 3)
        plt.scatter(analysis_df['conflict_terms'], analysis_df['peace_terms'],
                    alpha=0.5, c=analysis_df['sentiment_score'], cmap='RdYlBu')
        plt.colorbar(label='Sentiment Score')
        plt.title('Conflict vs Peace Terms Usage')
        plt.xlabel('Number of Conflict Terms')
        plt.ylabel('Number of Peace Terms')

        # 4. Time References
        plt.subplot(2, 2, 4)
        historical = analysis_df['historical_reference'].astype(bool)
        current = analysis_df['current_events'].astype(bool)
        time_refs = pd.Series({
            'Historical': int(historical.sum()),
            'Current': int(current.sum()),
            # Rows flagged as neither. Subtracting both sums from the row
            # count would double-count rows carrying both flags and could
            # even go negative.
            'Unspecified': int((~(historical | current)).sum())
        })
        time_refs.plot(kind='bar')
        plt.title('Distribution of Time References')
        plt.xticks(rotation=45)

        plt.tight_layout()

        if save_path:
            # savefig does not create missing directories.
            os.makedirs(save_path, exist_ok=True)
            plt.savefig(os.path.join(save_path, "analysis_overview.png"),
                        dpi=300, bbox_inches='tight')

        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

    def generate_wordclouds(self, analysis_df: pd.DataFrame, save_path: str = None):
        """Generate side-by-side word clouds for positive and negative tweets.

        Tweets with ``sentiment_score > 0.6`` feed the positive cloud and
        tweets with ``sentiment_score < 0.4`` feed the negative one. A panel
        whose category has no usable words shows a short notice instead of
        failing.

        Args:
            analysis_df: DataFrame with ``sentiment_score`` and
                ``cleaned_tweet`` columns.
            save_path: Optional directory. When given it is created if needed
                and ``wordclouds.png`` is written into it.
        """
        # Create figure with subplots
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

        scores = analysis_df['sentiment_score']
        panels = (
            (ax1, scores > 0.6, 'YlGn', 'Positive Content Word Cloud'),
            (ax2, scores < 0.4, 'RdGy', 'Negative Content Word Cloud'),
        )

        for ax, mask, colormap, title in panels:
            # dropna() keeps missing cleaned tweets out of the joined text.
            corpus = ' '.join(analysis_df.loc[mask, 'cleaned_tweet'].dropna().astype(str))

            cloud = None
            if corpus.strip():
                try:
                    cloud = WordCloud(
                        width=800, height=400,
                        background_color='white',
                        colormap=colormap,
                        max_words=100
                    ).generate(corpus)
                except ValueError:
                    # WordCloud raises ValueError when no word is left to plot.
                    cloud = None

            if cloud is not None:
                ax.imshow(cloud)
            else:
                ax.text(0.5, 0.5, 'No tweets in this category',
                        ha='center', va='center', transform=ax.transAxes)
            ax.axis('off')
            ax.set_title(title)

        plt.tight_layout()

        if save_path:
            # savefig does not create missing directories.
            os.makedirs(save_path, exist_ok=True)
            plt.savefig(os.path.join(save_path, "wordclouds.png"),
                        dpi=300, bbox_inches='tight')

        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

    def export_analysis(self, analysis_df: pd.DataFrame, output_dir: str = 'output'):
        """Export analysis results in multiple formats"""
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Save main analysis results
        analysis_df.to_csv(f"{output_dir}/analysis_results.csv", index=False)
        analysis_df.to_excel(f"{output_dir}/analysis_results.xlsx", index=False)

        # Generate and save summary statistics
        summary = {
            'basic_stats': analysis_df.describe().to_dict(),
            'topic_distribution': self.analyze_topic_distribution(analysis_df),
            'language_complexity': self.analyze_language_complexity(analysis_df),
            'temporal_patterns': {
                'correlations': self.analyze_temporal_patterns(analysis_df)['correlations']
            }
        }

        with open(f"{output_dir}/analysis_summary.json", 'w') as f:
            json.dump(summary, f, indent=4)

        # Save the complete analysis object for later use
        with open(f"{output_dir}/analysis_object.pkl", 'wb') as f:
            pickle.dump({
                'analysis_df': analysis_df,
                'summary': summary
            }, f)

        print(f"Analysis results exported to {output_dir}/")

    def create_interactive_dashboard(self, analysis_df: pd.DataFrame):
        """Create an interactive 2x2 dashboard using plotly.

        Panels: sentiment score histogram, ethnic reference pie, conflict-vs-
        peace term scatter (colored by sentiment score) and a bar chart of
        historical / current / unspecified time references.

        Args:
            analysis_df: DataFrame with the columns ``sentiment_score``,
                ``serbian_references``, ``albanian_references``,
                ``other_ethnic_references``, ``conflict_terms``,
                ``peace_terms``, ``historical_reference`` and
                ``current_events``.

        Returns:
            The figure built by ``make_subplots`` with the four traces added.
        """
        # Create subplot figure
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=(
                'Sentiment Distribution',
                'Ethnic References',
                'Conflict vs Peace Terms',
                'Time References'
            ),
            specs=[
                [{"type": "histogram"}, {"type": "pie"}],
                [{"type": "scatter"}, {"type": "bar"}]
            ]
        )

        # 1. Sentiment Distribution
        fig.add_trace(
            go.Histogram(
                x=analysis_df['sentiment_score'],
                nbinsx=20,
                name='Sentiment'
            ),
            row=1, col=1
        )

        # 2. Ethnic References
        ethnic_data = {
            'Serbian': analysis_df['serbian_references'].sum(),
            'Albanian': analysis_df['albanian_references'].sum(),
            'Other': analysis_df['other_ethnic_references'].sum()
        }
        fig.add_trace(
            go.Pie(
                labels=list(ethnic_data.keys()),
                values=list(ethnic_data.values()),
                name='Ethnic References'
            ),
            row=1, col=2
        )

        # 3. Conflict vs Peace Terms
        fig.add_trace(
            go.Scatter(
                x=analysis_df['conflict_terms'],
                y=analysis_df['peace_terms'],
                mode='markers',
                marker=dict(
                    color=analysis_df['sentiment_score'],
                    colorscale='RdYlBu',
                    showscale=True
                ),
                name='Terms Usage'
            ),
            row=2, col=1
        )

        # 4. Time References
        historical = analysis_df['historical_reference'].astype(bool)
        current = analysis_df['current_events'].astype(bool)
        time_refs = pd.Series({
            'Historical': int(historical.sum()),
            'Current': int(current.sum()),
            # Rows flagged as neither. Subtracting both sums from the row
            # count would double-count rows carrying both flags and could
            # even go negative.
            'Unspecified': int((~(historical | current)).sum())
        })
        fig.add_trace(
            go.Bar(
                x=time_refs.index,
                y=time_refs.values,
                name='Time References'
            ),
            row=2, col=2
        )

        # Update layout
        fig.update_layout(
            height=800,
            showlegend=False,
            title_text="Kosovo-Serbia Tweet Analysis Dashboard",
            title_x=0.5
        )

        return fig

def main():
    """Run the full pipeline: load, validate, analyze, visualize and export.

    Reads ``config/tweets.csv``, prints cross-validation accuracy, writes the
    static figures and the HTML dashboard into ``visualizations/`` and exports
    the tabular results through ``KosovoSerbiaAnalyzer.export_analysis``.

    Raises:
        Exception: Any error from a pipeline step is reported on stdout and
            then re-raised unchanged.
    """
    # Set random seed for reproducibility
    np.random.seed(42)

    dataset_path = "config/tweets.csv"
    visualizations_dir = "visualizations"

    try:
        # Load the dataset
        print("Loading dataset...")
        try:
            # Try UTF-8 first so multi-byte characters decode correctly.
            tweets_df = pd.read_csv(dataset_path, encoding='utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8: fall back to the single-byte decoding, which
            # accepts any byte sequence.
            tweets_df = pd.read_csv(dataset_path, encoding='latin-1')  # or 'cp1252'

        # Initialize analyzer
        print("Initializing analyzer...")
        analyzer = KosovoSerbiaAnalyzer()

        # Perform cross-validation
        print("Performing cross-validation...")
        cv_results = analyzer.cross_validate_sentiment(tweets_df)
        print(f"\nCross-validation results:")
        print(f"Mean accuracy: {cv_results['mean_accuracy']:.3f} (±{cv_results['std_accuracy']:.3f})")

        # Analyze tweets
        print("\nAnalyzing tweets...")
        analysis_results = analyzer.analyze_dataset(tweets_df)

        # The figure and dashboard writers below do not create missing
        # directories, so make sure the target exists first.
        os.makedirs(visualizations_dir, exist_ok=True)

        # Generate visualizations
        print("\nGenerating visualizations...")
        analyzer.visualize_analysis(analysis_results, save_path=visualizations_dir)
        analyzer.generate_wordclouds(analysis_results, save_path=visualizations_dir)

        # Create and save interactive dashboard
        print("\nCreating interactive dashboard...")
        dashboard = analyzer.create_interactive_dashboard(analysis_results)
        dashboard.write_html(os.path.join(visualizations_dir, "dashboard.html"))

        # Export results
        print("\nExporting analysis results...")
        analyzer.export_analysis(analysis_results)

        print("\nAnalysis complete! Check the output directory for results.")

    except Exception as e:
        print(f"\nError occurred: {str(e)}")
        raise

# Run the pipeline only when this file is executed as a script; importing the
# module must not trigger the analysis.
if __name__ == "__main__":
    main()

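# Captured notebook output: ModuleNotFoundError: No module named 'matplotlib'
# (install the plotting dependencies, e.g. matplotlib, before running this cell)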