In [None]:
pip install pandas matplotlib seaborn nltk


In [None]:
import zipfile
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk


# Step 1: Download the VADER lexicon for sentiment analysis
# (nltk skips the download when the lexicon is already up to date locally).
nltk.download('vader_lexicon')

# Step 2: Path to your ZIP file
zip_file_path = r"C:\Users\madhe\Downloads\archive.zip"

# Step 3: Open the ZIP file and read the 'twitter_training.csv' file
# The CSV ships WITHOUT a header row. Reading it with the default header=0
# turns the first tweet into the column names and silently drops that record
# from the analysis, so explicit names are supplied instead.
# NOTE(review): the four-column layout (id, entity, label, text) is assumed
# from the public dataset -- confirm against df.head() on first run.
column_names = ['tweet_id', 'entity', 'label', 'text']

with zipfile.ZipFile(zip_file_path, 'r') as z:
    # List all files in the ZIP to see what's inside
    print(z.namelist())  # This will show ['twitter_training.csv', 'twitter_validation.csv']

    # Load 'twitter_training.csv' from the ZIP, keeping the first row as data
    with z.open('twitter_training.csv') as f:
        df = pd.read_csv(f, header=None, names=column_names)

# Step 4: Check the column names assigned to the dataset
print(df.columns)

# Step 5: Preprocess the text data
# Rows with no tweet text cannot be scored, so drop them up front.
df.dropna(subset=['text'], inplace=True)

# Step 6: Initialize the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Step 7: Define a function to get sentiment polarity scores for each text
def get_sentiment_score(text):
    """Return the VADER compound polarity score for ``text``.

    The compound score ranges from -1 (most negative) to +1 (most positive).
    Uses the module-level ``sia`` analyzer.
    """
    score = sia.polarity_scores(text)
    return score['compound']

# Step 8: Apply the sentiment analysis function to the tweet text column
df['sentiment'] = df['text'].apply(get_sentiment_score)

# Step 9: Histogram (with KDE overlay) of the per-row compound scores
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['sentiment'], bins=30, kde=True, color='purple', ax=hist_ax)
hist_ax.set_title('Sentiment Score Distribution')
hist_ax.set_xlabel('Sentiment Score (Compound)')
hist_ax.set_ylabel('Frequency')
plt.show()

# Step 10: Sentiment over time -- only drawn when the frame has a 'date' column
has_date_column = 'date' in df.columns
if has_date_column:
    # Parse to datetime so the x-axis is a real time axis
    df['date'] = pd.to_datetime(df['date'])
    time_fig, time_ax = plt.subplots(figsize=(10, 6))
    sns.lineplot(data=df, x='date', y='sentiment', color='blue', ax=time_ax)
    time_ax.set_title('Sentiment Over Time')
    time_ax.set_xlabel('Date')
    time_ax.set_ylabel('Sentiment Score (Compound)')
    plt.show()

# Step 11: Mean compound score across all scored rows
average_sentiment = df['sentiment'].mean()
print(f'Average Sentiment Score: {average_sentiment}')

# Step 12: Draw a circle whose fill colour encodes a sentiment score
def plot_gradient_circle(sentiment_score):
    """Show a single filled circle coloured by ``sentiment_score``.

    The score is rescaled from [-1, 1] to [0, 1] and looked up in the
    "RdYlGn" colormap, so negative scores appear red and positive scores
    green. The circle is filled with that one solid colour (no gradient is
    drawn inside the circle itself). The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    # Rescale [-1, 1] -> [0, 1], the domain the colormap is sampled over
    unit_score = (sentiment_score + 1) / 2
    fill_colour = plt.get_cmap("RdYlGn")(unit_score)

    figure, axes = plt.subplots()

    # Positioned in axes coordinates so the circle is centred in the plot area
    axes.add_patch(
        plt.Circle((0.5, 0.5), 0.4, color=fill_colour, transform=axes.transAxes)
    )

    axes.set(xlim=(0, 1), ylim=(0, 1))
    axes.set_aspect(1)  # equal scaling keeps the circle round

    # Only the coloured disc should be visible
    axes.axis('off')

    plt.show()

# Step 13: Render the colour-coded circle for the dataset's mean compound score
plot_gradient_circle(average_sentiment)
