In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from wordcloud import WordCloud
from textblob import TextBlob
import re
from datetime import datetime

In [8]:
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\pegu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [10]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pegu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [6]:
class SentimentAnalysisToolkit:
    def __init__(self, df):
        """
        Initialize the toolkit with a pandas DataFrame containing sentiment data
        Expected columns: text, sentiment, source, datetime, user_id, location, confidence_score
        """
        self.df = df
        self.df["datetime"] = pd.to_datetime(self.df["Date/Time"])

    def preprocess_text(self, text):
        """Text preprocessing function"""
        # Convert to lowercase
        text = text.lower()

        # Remove special characters and numbers
        text = re.sub(r"[^a-zA-Z\s]", "", text)

        # Tokenization
        tokens = word_tokenize(text)

        # Remove stopwords
        stop_words = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_words]

        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

        return " ".join(tokens)

    def preprocess_dataset(self):
        """Apply preprocessing to entire dataset"""
        self.df["processed_text"] = self.df["Comments"].apply(self.preprocess_text)
        return self.df

    def calculate_statistics(self):
        """Calculate key statistics about the sentiment data"""
        stats = {
            "total_records": len(self.df),
            "sentiment_distribution": self.df["Sentiment"].value_counts().to_dict(),
            "avg_confidence": self.df["Confidence Score"].mean(),
            "source_distribution": self.df["Source"].value_counts().to_dict(),
            "locations_count": self.df["Location"].nunique(),
            "unique_users": self.df["User ID"].nunique(),
            "temporal_stats": {
                "earliest_date": self.df["datetime"].min(),
                "latest_date": self.df["datetime"].max(),
                "time_span_days": (
                    self.df["datetime"].max() - self.df["datetime"].min()
                ).days,
            },
        }
        return stats

    def plot_sentiment_trends(self):
        """Create visualizations for sentiment trends"""
        plt.figure(figsize=(15, 10))

        # Subplot 1: Sentiment distribution
        plt.subplot(2, 2, 1)
        sns.countplot(data=self.df, x="sentiment")
        plt.title("Sentiment Distribution")

        # Subplot 2: Average confidence score by source
        plt.subplot(2, 2, 2)
        avg_conf_by_source = self.df.groupby("source")["confidence_score"].mean()
        avg_conf_by_source.plot(kind="bar")
        plt.title("Average Confidence Score by Source")
        plt.xticks(rotation=45)

        # Subplot 3: Sentiment over time
        plt.subplot(2, 2, 3)
        daily_sentiment = (
            self.df.groupby([self.df["datetime"].dt.date, "Sentiment"]).size().unstack()
        )
        daily_sentiment.plot(kind="line")
        plt.title("Sentiment Trends Over Time")
        plt.xticks(rotation=45)

        # Subplot 4: Word cloud
        plt.subplot(2, 2, 4)
        text = " ".join(self.df[self.df["Sentiment"] == "positive"]["processed_text"])
        wordcloud = WordCloud(width=800, height=400, background_color="white").generate(
            text
        )
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.title("Word Cloud - Positive Sentiments")

        plt.tight_layout()
        plt.show()

    def analyze_text_patterns(self):
        """Analyze common patterns and themes in the text"""
        # Calculate most common words by sentiment
        sentiment_words = {}
        for sentiment in self.df["Sentiment"].unique():
            text = " ".join(
                self.df[self.df["Sentiment"] == sentiment]["processed_text"]
            )
            words = word_tokenize(text)
            freq_dist = nltk.FreqDist(words)
            sentiment_words[sentiment] = freq_dist.most_common(10)

        # Calculate average sentiment polarity using TextBlob
        self.df["polarity"] = self.df["Comments"].apply(
            lambda x: TextBlob(x).sentiment.polarity
        )

        return {
            "common_words_by_sentiment": sentiment_words,
            "avg_polarity": self.df["polarity"].mean(),
            "polarity_by_source": self.df.groupby("Source")["polarity"].mean(),
        }

In [11]:
# Example usage
def main():
    # Sample data structure (replace with your actual data)
    # data = {
    #     'text': ['Great product!', 'Poor service', 'Amazing experience'],
    #     'sentiment': ['positive', 'negative', 'positive'],
    #     'source': ['twitter', 'review', 'testimonial'],
    #     'datetime': ['2024-01-01', '2024-01-02', '2024-01-03'],
    #     'user_id': [1, 2, 3],
    #     'location': ['NY', 'CA', 'TX'],
    #     'confidence_score': [0.9, 0.8, 0.95]
    # }
    # df = pd.DataFrame(data)
    df = pd.read_csv("./sentiment-analysis-cleaned-test.csv")

    # Initialize toolkit
    toolkit = SentimentAnalysisToolkit(df)

    # Preprocess data
    processed_df = toolkit.preprocess_dataset()

    # Get statistics
    stats = toolkit.calculate_statistics()
    print("Statistics:", stats)

    # Create visualizations
    toolkit.plot_sentiment_trends()

    # Analyze text patterns
    patterns = toolkit.analyze_text_patterns()
    print("Text Patterns:", patterns)


if __name__ == "__main__":
    main()

LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - 'C:\\Users\\pegu/nltk_data'
    - 'c:\\Python312\\nltk_data'
    - 'c:\\Python312\\share\\nltk_data'
    - 'c:\\Python312\\lib\\nltk_data'
    - 'C:\\Users\\pegu\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [12]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pegu\AppData\Roaming\nltk_data...


True