In [1]:
# Install Required Libraries
!pip install pyspark nltk wordcloud dash plotly pandas seaborn



In [2]:
# Import Libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import StringType, FloatType
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
import re
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import io
import base64
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd
from collections import Counter

In [3]:
# Initialize Spark Session
# One application-wide session, named so it is easy to spot in the Spark UI.
spark = (
    SparkSession.builder
    .appName("SocialMediaSentimentAnalysis")
    .getOrCreate()
)

In [4]:
# Download NLTK Resources
# 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize. Recent NLTK
# releases load 'punkt_tab' instead of 'punkt'; both are requested so the
# notebook works with whichever NLTK version pip resolved.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
# 'stopwords': English stop-word list used by clean_text.
nltk.download('stopwords', quiet=True)
# 'wordnet': dictionary backing WordNetLemmatizer.
nltk.download('wordnet', quiet=True)
# 'vader_lexicon': lexicon backing SentimentIntensityAnalyzer.
nltk.download('vader_lexicon', quiet=True)

True

In [5]:
# Initialize NLTK Components
# Module-level singletons shared by clean_text, get_sentiment and the dashboard callbacks.
sia = SentimentIntensityAnalyzer()              # VADER scorer
stop_words = set(stopwords.words('english'))    # set for O(1) membership tests
lemmatizer = WordNetLemmatizer()

In [6]:
# Define UDFs for Text Cleaning and Sentiment Analysis
def clean_text(text):
    """Normalize a raw post into a space-separated string of lemmatized tokens.

    Steps, in order: strip URLs, strip @mentions and #hashtags, strip
    punctuation, strip digits, lowercase and trim, tokenize, drop English
    stop words, lemmatize.

    Args:
        text: The raw post. Anything that is not a ``str`` is treated as missing.

    Returns:
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    # (pattern, flags) pairs, applied one after another; each match is deleted.
    removal_patterns = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),  # links
        (r'@\w+|#\w+', 0),                           # mentions and hashtags
        (r'[^\w\s]', 0),                             # punctuation
        (r'\d+', 0),                                 # digits
    )
    stripped = text
    for pattern, pattern_flags in removal_patterns:
        stripped = re.sub(pattern, '', stripped, flags=pattern_flags)

    kept_words = []
    for token in word_tokenize(stripped.lower().strip()):
        if token in stop_words:
            continue
        kept_words.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_words)

def get_sentiment(text):
    """Classify text as 'Positive', 'Negative' or 'Neutral' using VADER.

    The label is derived from VADER's compound score: above 0.05 is
    'Positive', below -0.05 is 'Negative', anything else is 'Neutral'.

    Args:
        text: The text to score. ``None``, non-string values and blank
            strings are treated as carrying no sentiment.

    Returns:
        One of the strings 'Positive', 'Negative' or 'Neutral'.
    """
    # Null values (e.g. from a Spark column) would make the analyzer raise,
    # and a blank string scores 0.0 anyway, so short-circuit both to 'Neutral'.
    if not isinstance(text, str) or not text.strip():
        return 'Neutral'

    compound_score = sia.polarity_scores(text)['compound']
    if compound_score > 0.05:
        return 'Positive'
    if compound_score < -0.05:
        return 'Negative'
    return 'Neutral'

# Spark UDF wrappers around the plain-Python helpers; both yield string columns.
get_sentiment_udf = udf(get_sentiment, returnType=StringType())
clean_text_udf = udf(clean_text, returnType=StringType())

In [7]:
# Load Data
# NOTE(review): both paths assume Google Drive is already mounted at /content/drive — confirm.
twitter_path = '/content/drive/MyDrive/Colab Notebooks/BDA/Twitter_Data.csv'
facebook_path = '/content/drive/MyDrive/Colab Notebooks/BDA/data_oppview.csv'

# Both files are read the same way: first row is the header, column types are inferred.
twitter_df, facebook_df = (
    spark.read.csv(csv_path, header=True, inferSchema=True)
    for csv_path in (twitter_path, facebook_path)
)

In [8]:
# Apply Text Cleaning and Sentiment Analysis
# The raw text lives in a different column per platform ("clean_text" for
# Twitter, "status_message" for Facebook); both end up with the same two
# derived columns: "cleaned_text" and "sentiment".
# NOTE(review): sentiment is scored on the cleaned text, i.e. after stop words
# and punctuation were removed — confirm this is intended, since VADER may use
# those cues (negations, emphasis).
twitter_df = (
    twitter_df
    .withColumn("cleaned_text", clean_text_udf(col("clean_text")))
    .withColumn("sentiment", get_sentiment_udf(col("cleaned_text")))
)
facebook_df = (
    facebook_df
    .withColumn("cleaned_text", clean_text_udf(col("status_message")))
    .withColumn("sentiment", get_sentiment_udf(col("cleaned_text")))
)

In [9]:
# Convert Sentiment to Numeric Using StringIndexer
indexer = StringIndexer(inputCol="sentiment", outputCol="sentiment_index")

# Fit a single model on the labels of BOTH platforms. StringIndexer assigns
# indices by label frequency, so fitting per dataset could map the same label
# (e.g. 'Positive') to different indices on Twitter and Facebook.
sentiment_labels = twitter_df.select("sentiment").union(facebook_df.select("sentiment"))
indexer_model = indexer.fit(sentiment_labels)

# Dropping a possibly pre-existing output column keeps this cell re-runnable:
# transform() refuses to overwrite an existing "sentiment_index" column, and
# drop() is a no-op when the column is absent.
twitter_df = indexer_model.transform(twitter_df.drop("sentiment_index"))
facebook_df = indexer_model.transform(facebook_df.drop("sentiment_index"))

In [10]:
# Combine Twitter and Facebook Data for Visualization
# Each platform contributes the same two columns plus a constant "Platform" tag.
twitter_rows = (
    twitter_df
    .select("sentiment", "cleaned_text")
    .withColumn("Platform", lit("Twitter"))
)
facebook_rows = (
    facebook_df
    .select("sentiment", "cleaned_text")
    .withColumn("Platform", lit("Facebook"))
)
combined_df = twitter_rows.union(facebook_rows)

# Convert to Pandas DataFrame
# This collects every row to the driver; the dashboard callbacks read from this frame.
combined_pandas = combined_df.toPandas()

In [11]:
# Helper Function for Word Cloud Generation
def generate_word_cloud(text, platform, sentiment):
    """Render a titled word cloud and return it as a base64-encoded PNG data URI.

    Args:
        text: Space-separated words to draw.
        platform: Platform name shown in the figure title.
        sentiment: Sentiment label shown in the figure title.

    Returns:
        A ``'data:image/png;base64,...'`` string, or ``None`` when ``text`` is
        missing, blank, or contains no word that can be drawn.
    """
    if not isinstance(text, str) or not text.strip():
        return None

    # Imported here so the block is self-contained. A standalone Figure is used
    # instead of pyplot: this function is called from web callbacks, and pyplot
    # keeps global figure state that leaks if an error skips plt.close().
    from matplotlib.figure import Figure

    try:
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    except ValueError:
        # Raised by WordCloud when no word survives its own filtering.
        return None

    fig = Figure(figsize=(10, 5))
    ax = fig.subplots()
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'{platform} {sentiment} Word Cloud', fontsize=16)

    # The buffer is closed by the context manager even if savefig raises.
    with io.BytesIO() as buf:
        fig.savefig(buf, format='png')
        image_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')
    return f'data:image/png;base64,{image_base64}'

In [12]:
# Initialize Dash App
app = dash.Dash(__name__)

# Layout of the Dashboard
# Built from named sections, listed in the order they appear on the page.
page_title = html.H1(
    "Social Media Sentiment Analysis Dashboard",
    style={'textAlign': 'center', 'color': '#2c3e50'},
)

# Dropdown to Select Platform (drives every chart callback)
platform_selector = html.Div(
    [
        html.Label("Select Platform:", style={'fontSize': 18}),
        dcc.Dropdown(
            id='platform-dropdown',
            options=[
                {'label': 'Twitter', 'value': 'twitter'},
                {'label': 'Facebook', 'value': 'facebook'},
            ],
            value='twitter',  # Default value
            clearable=False,
            style={'width': '50%', 'margin': 'auto'},
        ),
    ],
    style={'marginBottom': 20},
)

# Sentiment Distribution Pie Chart and Bar Chart, side by side
distribution_charts = html.Div(
    [
        dcc.Graph(id=graph_id, style={'width': '48%', 'display': 'inline-block'})
        for graph_id in ('sentiment-pie-chart', 'sentiment-bar-chart')
    ],
    style={'display': 'flex', 'justifyContent': 'center', 'marginBottom': '20px'},
)

# Sentiment Score Distribution Histogram
score_histogram = html.Div([dcc.Graph(id='sentiment-score-histogram')])

# Word Clouds (the container is filled by a callback)
word_cloud_section = html.Div([
    html.H3("Word Clouds", style={'textAlign': 'center'}),
    html.Div(
        id='word-cloud-container',
        style={'display': 'flex', 'justifyContent': 'space-around'},
    ),
])

# Text Input and Prediction Section
prediction_panel = html.Div(
    [
        html.Label("Enter Text for Sentiment Prediction:", style={'fontSize': 16}),
        dcc.Textarea(
            id='input-text',
            placeholder='Enter a sentence here...',
            style={'width': '100%', 'height': '80px', 'marginBottom': '10px'},
        ),
        html.Button(
            'Predict Sentiment',
            id='predict-button',
            n_clicks=0,
            style={
                'padding': '10px 20px',
                'backgroundColor': '#3498db',
                'color': 'white',
                'border': 'none',
                'cursor': 'pointer',
            },
        ),
        html.Div(id='prediction-output', style={'marginTop': '20px', 'fontSize': 18}),
    ],
    style={'textAlign': 'left', 'width': '50%', 'margin': 'auto'},
)

app.layout = html.Div([
    page_title,
    platform_selector,
    distribution_charts,
    score_histogram,
    word_cloud_section,
    prediction_panel,
])

# One fixed colour per sentiment label. A plain colour sequence is assigned in
# order of appearance (most frequent label first), so green/red/grey would not
# reliably mean Positive/Negative/Neutral and could differ between platforms.
SENTIMENT_COLORS = {
    'Positive': '#2ecc71',
    'Negative': '#e74c3c',
    'Neutral': '#95a5a6',
}


def _platform_name(selected_platform):
    """Map the dropdown value to the label stored in the 'Platform' column."""
    return 'Twitter' if selected_platform == 'twitter' else 'Facebook'


def _sentiment_counts(platform_name):
    """Return a frame with columns ['sentiment', 'count'] for one platform."""
    platform_rows = combined_pandas[combined_pandas['Platform'] == platform_name]
    counts = platform_rows['sentiment'].value_counts().reset_index()
    # Renamed explicitly because the column names produced by
    # value_counts().reset_index() differ between pandas versions.
    counts.columns = ['sentiment', 'count']
    return counts


# Callback for Sentiment Distribution Pie Chart
@app.callback(
    Output('sentiment-pie-chart', 'figure'),
    [Input('platform-dropdown', 'value')]
)
def update_pie_chart(selected_platform):
    """Build the pie chart of sentiment shares for the selected platform.

    Args:
        selected_platform: Dropdown value, 'twitter' or 'facebook'.

    Returns:
        A Plotly figure with one slice per sentiment label.
    """
    platform_name = _platform_name(selected_platform)
    return px.pie(
        _sentiment_counts(platform_name),
        names='sentiment',
        values='count',
        color='sentiment',
        title=f"Sentiment Distribution on {platform_name}",
        color_discrete_map=SENTIMENT_COLORS,
    )

# Callback for Sentiment Distribution Bar Chart
@app.callback(
    Output('sentiment-bar-chart', 'figure'),
    [Input('platform-dropdown', 'value')]
)
def update_bar_chart(selected_platform):
    """Build the bar chart of sentiment counts for the selected platform.

    Args:
        selected_platform: Dropdown value, 'twitter' or 'facebook'.

    Returns:
        A Plotly figure with one bar per sentiment label.
    """
    platform_name = _platform_name(selected_platform)
    return px.bar(
        _sentiment_counts(platform_name),
        x='sentiment',
        y='count',
        color='sentiment',
        title=f"Sentiment Distribution on {platform_name}",
        color_discrete_map=SENTIMENT_COLORS,
    )

# Callback for Sentiment Score Distribution Histogram
@app.callback(
    Output('sentiment-score-histogram', 'figure'),
    [Input('platform-dropdown', 'value')]
)
def update_histogram(selected_platform):
    """Build the histogram of VADER compound scores for the selected platform.

    Args:
        selected_platform: Dropdown value, 'twitter' or 'facebook'.

    Returns:
        A Plotly histogram (30 bins) of the compound score of each post's
        cleaned text.
    """
    platform_name = 'Twitter' if selected_platform == 'twitter' else 'Facebook'
    platform_rows = combined_pandas[combined_pandas['Platform'] == platform_name]

    # assign() returns a new frame. Writing the column onto the filtered slice
    # would be a chained assignment on a view of the shared global frame
    # (SettingWithCopyWarning).
    scored_rows = platform_rows.assign(
        sentiment_score=platform_rows['cleaned_text'].apply(
            lambda post: sia.polarity_scores(post)['compound']
        )
    )
    return px.histogram(
        scored_rows,
        x='sentiment_score',
        nbins=30,
        title=f"Sentiment Score Distribution on {platform_name}",
        color_discrete_sequence=['#3498db'],
    )

# Callback for Word Clouds
@app.callback(
    Output('word-cloud-container', 'children'),
    [Input('platform-dropdown', 'value')]
)
def update_word_clouds(selected_platform):
    """Build one word-cloud tile per sentiment for the selected platform.

    Args:
        selected_platform: Dropdown value, 'twitter' or 'facebook'.

    Returns:
        A list of ``html.Div`` tiles (image plus caption), in the order
        Positive, Negative, Neutral. Sentiments for which no image could be
        generated are left out.
    """
    platform_name = 'Twitter' if selected_platform == 'twitter' else 'Facebook'
    platform_rows = combined_pandas[combined_pandas['Platform'] == platform_name]

    def _cloud_tile(sentiment):
        """Return the tile for one sentiment, or None if there is nothing to draw."""
        matching_posts = platform_rows.loc[platform_rows['sentiment'] == sentiment, 'cleaned_text']
        image_uri = generate_word_cloud(' '.join(matching_posts), platform_name, sentiment)
        if not image_uri:
            return None
        return html.Div([
            html.Img(src=image_uri, style={'width': '300px', 'height': '200px'}),
            html.P(sentiment, style={'textAlign': 'center', 'fontWeight': 'bold'})
        ])

    candidate_tiles = (_cloud_tile(label) for label in ['Positive', 'Negative', 'Neutral'])
    return [tile for tile in candidate_tiles if tile is not None]

# Callback for Sentiment Prediction
@app.callback(
    Output('prediction-output', 'children'),
    [Input('predict-button', 'n_clicks')],
    [dash.dependencies.State('input-text', 'value')]
)
def predict_sentiment_and_clean_text(n_clicks, input_text):
    """Clean the entered text and report its VADER sentiment.

    Args:
        n_clicks: Number of times the predict button has been clicked.
        input_text: Current content of the textarea; ``None`` until the user
            has typed something.

    Returns:
        A ``html.Div`` with the predicted label, the compound score and the
        cleaned text, or a prompt string when there is nothing to predict.
    """
    # The textarea has no initial value, so clicking the button before typing
    # delivers input_text=None; guard against that before calling .strip().
    if not n_clicks or not input_text or not input_text.strip():
        return "Enter text and click 'Predict Sentiment'."

    cleaned_text = clean_text(input_text)
    sentiment = get_sentiment(cleaned_text)
    compound_score = sia.polarity_scores(cleaned_text)['compound']
    return html.Div([
        html.P(f"Predicted Sentiment: {sentiment}", style={'color': 'black', 'fontSize': 18, 'fontWeight': 'bold'}),
        html.P(f"Compound Score: {compound_score:.2f}", style={'color': 'blue', 'fontSize': 16}),
        html.P(f"Cleaned Text: {cleaned_text}", style={'color': 'green', 'fontSize': 14}),
    ])

# Run the Dash App
# Started only when this code is executed as the main module.
if __name__ == '__main__':
    server_options = {'debug': True, 'port': 8050}
    app.run(**server_options)

<IPython.core.display.Javascript object>