In [1]:
import os
import pandas as pd
import glob
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.graph_objs as go
import plotly.express as px
import plotly.io as pio

In [2]:
def load_dataframes(folder_path):
    dfs = []
    for file in glob.glob(folder_path + "/*.csv"):
        df = pd.read_csv(file)
        company_name = os.path.basename(os.path.dirname(file))
        df['company'] = company_name
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

In [3]:
direct_comp_df = load_dataframes("Data/Direct_Comp/*")
mainstream_comp_df = load_dataframes("Data/Mainstream_Comp/*")
potential_comp_df = load_dataframes("Data/Potential_Comp/*")

In [4]:
fampay_df = load_dataframes("Data/FampayData/*")

In [5]:
def sentiment_score(text):
    if not isinstance(text, str):
        return None
    return TextBlob(text).sentiment.polarity # type: ignore

for df in [fampay_df, direct_comp_df, mainstream_comp_df, potential_comp_df]:
    df = df[df['content'].apply(lambda x: isinstance(x, str))]
    df['sentiment'] = df['content'].apply(sentiment_score)  # type: ignore

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = df['content'].apply(sentiment_score)  # type: ignore
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = df['content'].apply(sentiment_score)  # type: ignore


In [6]:
def response_rate(df):
    total_reviews = len(df)
    total_responses = len(df[df['replyContent'].notnull()])
    return total_responses / total_reviews * 100

response_rates = {}
for df, comp_type in [(fampay_df, 'Fampay'), (direct_comp_df, 'Direct_Comp'), (mainstream_comp_df, 'Mainstream_Comp'), (potential_comp_df, 'Potential_Comp')]:
    grouped = df.groupby('company')
    response_rates[comp_type] = grouped.apply(response_rate)

In [7]:
dfs = [fampay_df, direct_comp_df, mainstream_comp_df, potential_comp_df]

for i, df in enumerate(dfs):
    df = df[df['content'].apply(lambda x: isinstance(x, str))]
    df['sentiment'] = df['content'].apply(sentiment_score) # type: ignore
    dfs[i] = df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = df['content'].apply(sentiment_score) # type: ignore
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = df['content'].apply(sentiment_score) # type: ignore


In [8]:
def sentiment_score(text):
    if not isinstance(text, str):
        return None
    return TextBlob(text).sentiment.polarity # type: ignore

fampay_df['sentiment'] = fampay_df['content'].apply(sentiment_score)

In [9]:
direct_comp_df['sentiment'] = direct_comp_df['content'].apply(sentiment_score)
mainstream_comp_df['sentiment'] = mainstream_comp_df['content'].apply(sentiment_score)
potential_comp_df['sentiment'] = potential_comp_df['content'].apply(sentiment_score)

In [10]:
def plot_time_series(df, comp_type):
    if 'sentiment' not in df.columns:
        print(f"Sentiment column not found in DataFrame for {comp_type}")
        return

    df['at'] = pd.to_datetime(df['at'])
    df['day'] = df['at'].dt.date
    daily_sentiment = df.groupby(['company', 'day'])['sentiment'].mean().reset_index()

    fig = px.line(data_frame=daily_sentiment, x='day', y='sentiment', color='company',
                  title=f"Daily Average Sentiment - {comp_type}")

    # Add Fampay line to all company types except for Fampay itself
    if comp_type != 'Fampay':
        fampay_df['at'] = pd.to_datetime(fampay_df['at'])
        fampay_df['day'] = fampay_df['at'].dt.date
        daily_sentiment_fampay = fampay_df.groupby('day')['sentiment'].mean().reset_index()
        fig.add_trace(go.Scatter(x=daily_sentiment_fampay['day'], y=daily_sentiment_fampay['sentiment'],
                                 name='Fampay', mode='lines'))

    fig.update_xaxes(title_text='Day')
    fig.update_yaxes(title_text='Sentiment')

    # Save the plot as an HTML file
    filename = f"{comp_type}_sentiment.html"
    pio.write_html(fig, file=filename, auto_open=False)

    print(f"Plot saved as {filename}")
    fig.show()

for df, comp_type in zip(dfs, ['Fampay', 'Direct_Comp', 'Mainstream_Comp', 'Potential_Comp']):
    plot_time_series(df, comp_type)

Plot saved as Fampay_sentiment.html


Plot saved as Direct_Comp_sentiment.html




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Plot saved as Mainstream_Comp_sentiment.html




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Plot saved as Potential_Comp_sentiment.html


In [11]:
def plot_time_series(df, comp_type, rolling_window=1000):
    if 'sentiment' not in df.columns:
        print(f"Sentiment column not found in DataFrame for {comp_type}")
        return

    df['at'] = pd.to_datetime(df['at'])
    df['day'] = df['at'].dt.date
    df.sort_values(by='day', inplace=True)

    df['rolling_sentiment'] = df.groupby('company')['sentiment'].transform(lambda x: x.rolling(rolling_window).mean())
    daily_sentiment = df.groupby(['company', 'day'])['rolling_sentiment'].last().reset_index()

    fig = px.line(data_frame=daily_sentiment, x='day', y='rolling_sentiment', color='company',
                  title=f"Rolling {rolling_window}-Day Average Sentiment - {comp_type}")

    if comp_type != 'Fampay':
        fampay_df['at'] = pd.to_datetime(fampay_df['at'])
        fampay_df['day'] = fampay_df['at'].dt.date
        fampay_df.sort_values(by='day', inplace=True)
        fampay_df['rolling_sentiment'] = fampay_df['sentiment'].rolling(rolling_window).mean()
        daily_sentiment_fampay = fampay_df.groupby('day')['rolling_sentiment'].last().reset_index()
        fig.add_trace(go.Scatter(x=daily_sentiment_fampay['day'], y=daily_sentiment_fampay['rolling_sentiment'],
                                 name='Fampay', mode='lines'))

    fig.update_layout(width=1200, height=500)
    fig.update_xaxes(title_text='Day')
    fig.update_yaxes(title_text='Sentiment')

    filename = f"{comp_type}_rolling_{rolling_window}_sentiment.html"
    pio.write_html(fig, file=filename, auto_open=False)

    print(f"Plot saved as {filename}")
    fig.show()

for df, comp_type in zip(dfs, ['Fampay', 'Direct_Comp', 'Mainstream_Comp', 'Potential_Comp']):
    plot_time_series(df, comp_type)


Plot saved as Fampay_rolling_1000_sentiment.html


Plot saved as Direct_Comp_rolling_1000_sentiment.html




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a

Plot saved as Mainstream_Comp_rolling_1000_sentiment.html




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a

Plot saved as Potential_Comp_rolling_1000_sentiment.html


In [12]:
all_df = pd.concat(dfs, ignore_index=True)
all_df['at'] = pd.to_datetime(all_df['at'])
all_df['day'] = all_df['at'].dt.date

daily_sentiment = all_df.groupby(['company', 'day'])['sentiment'].mean().reset_index()

# Calculate rolling average
daily_sentiment['rolling_avg'] = daily_sentiment.groupby('company')['sentiment'].transform(lambda x: x.rolling(window=45, min_periods=1).mean())

# Create separate traces for each company
traces = []
for comp_type in daily_sentiment['company'].unique():
    traces.append(go.Scatter(x=daily_sentiment[daily_sentiment['company'] == comp_type]['day'],
                             y=daily_sentiment[daily_sentiment['company'] == comp_type]['rolling_avg'],
                             mode='lines',
                             name=comp_type))

# Add trace for all companies
traces.append(go.Scatter(x=daily_sentiment[daily_sentiment['company'] == 'Fampay']['day'],
                         y=daily_sentiment[daily_sentiment['company'] == 'Fampay']['rolling_avg'],
                         mode='lines',
                         name='Fampay'))

# Create layout
layout = go.Layout(title='Rolling Average Daily Sentiment - All Companies',
                   xaxis=dict(title='Day'), yaxis=dict(title='Sentiment'))

# Create figure
fig = go.Figure(data=traces, layout=layout)

# Show plot
fig.show()
pio.write_html(fig, file='all_companies_rolling_avg.html', auto_open=False)

In [13]:
dfs = [fampay_df, direct_comp_df, mainstream_comp_df, potential_comp_df]
comp_types = ['Fampay', 'Direct_Comp', 'Mainstream_Comp', 'Potential_Comp']

for df, comp_type in zip(dfs, comp_types):
    # Add Fampay data to df if not Fampay
    if comp_type != 'Fampay':
        df = pd.concat([df, fampay_df])
    
    # Sentiment Distribution (Histogram)
    fig = px.histogram(df, x="sentiment", nbins=20, title=f'Sentiment Distribution - {comp_type}')
    pio.write_html(fig, file=f'{comp_type}_sentiment_distribution.html', auto_open=False)

    # Bar Plot of Company vs Sentiment
    average_sentiment = df.groupby('company')['sentiment'].mean().reset_index()
    fig = px.bar(average_sentiment, x="company", y="sentiment", title=f'Average Sentiment Score per Company - {comp_type}')
    pio.write_html(fig, file=f'{comp_type}_average_sentiment.html', auto_open=False)

    # Sentiment Trend (Time series)
    df['at'] = pd.to_datetime(df['at'])
    df['day'] = df['at'].dt.date
    daily_sentiment = df.groupby(['company', 'day'])['sentiment'].mean().reset_index()
    fig = px.line(data_frame=daily_sentiment, x='day', y='sentiment', color='company', title=f'Daily Average Sentiment per Company - {comp_type}')
    pio.write_html(fig, file=f'{comp_type}_daily_sentiment.html', auto_open=False)

    # Box plot of Sentiment Scores
    fig = px.box(df, x="company", y="sentiment", title=f'Sentiment Scores Box Plot - {comp_type}')
    pio.write_html(fig, file=f'{comp_type}_boxplot_sentiment.html', auto_open=False)

    # Pie Chart of Response Rate
    response_rates_df = pd.DataFrame(list(response_rates[comp_type].items()), columns=['Company', 'Response Rate'])
    fig = px.pie(response_rates_df, values='Response Rate', names='Company', title=f'Response Rate per Company - {comp_type}')
    pio.write_html(fig, file=f'{comp_type}_response_rate.html', auto_open=False)

    # Scatter Plot of Sentiment vs Thumbs Up Count
    fig = px.scatter(df, x="sentiment", y="thumbsUpCount", color="company", title=f'Scatter Plot - Sentiment vs Thumbs Up Count - {comp_type}')
    pio.write_html(fig, file=f'{comp_type}_scatterplot.html', auto_open=False)

    # Heatmap of Sentiment Scores per Company per Day
    heatmap_data = df.pivot_table(values='sentiment', index='day', columns='company')
    fig = px.imshow(heatmap_data, title=f'Heatmap - Sentiment Scores per Company per Day - {comp_type}')
    pio.write_html(fig, file=f'{comp_type}_heatmap.html', auto_open=False)
