## Importing libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from collections import Counter
import ast

## Loading and inspecting data

In [None]:

# Read the raw export; the 'User' column becomes the row index.
raw_csv_path = 'sentimentdataset.csv'
social_df = pd.read_csv(raw_csv_path, index_col='User')

# Print the column dtypes and non-null counts.
social_df.info()

In [None]:
# Preview the first five rows of the raw data.
social_df.head(n=5)

## Cleaning data

In [None]:
# Remove the two leftover 'Unnamed' columns. errors='ignore' keeps this cell
# re-runnable: once the columns are gone a second run is a no-op instead of
# raising KeyError.
social_df = social_df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')
social_df.head()

In [None]:
# Canonical country name -> spellings (already title-cased) that should map to it.
# Add more as needed.
_country_aliases = {
    'United States': ('Usa', 'U.S.A', 'United States Of America'),
    'United Kingdom': ('Uk',),
    'United Arab Emirates': ('Uae',),
    'Korea, South': ('South Korea',),
    'Russian Federation': ('Russia',),
}

# Flatten to the alias -> canonical lookup used for the replacement below.
country_replacements = {
    alias: canonical
    for canonical, aliases in _country_aliases.items()
    for alias in aliases
}

# Trim whitespace and title-case first so the aliases above match, then swap
# each known alias for its canonical name (exact, whole-value matches only).
normalized_country = social_df['Country'].str.strip().str.title()
social_df['Country'] = normalized_country.replace(country_replacements)


In [None]:
# Show the 20 most frequent values in 'Country'.
country_frequencies = social_df['Country'].value_counts()
print(country_frequencies.head(20))


In [None]:
# Save the country-normalised data. With index=False the 'User' index is not
# written to the file.
# NOTE(review): confirm that leaving 'User' out of this export is intended.
cleaned_csv_path = 'social_data_cleaned.csv'
social_df.to_csv(cleaned_csv_path, index=False)


In [None]:
# Work on a copy so the country-normalised frame stays as it is.
social_df_cleaned = social_df.copy()

# Trim whitespace and capitalise the first letter of the label columns.
for label_column in ('Sentiment', 'Platform'):
    social_df_cleaned[label_column] = (
        social_df_cleaned[label_column].str.strip().str.capitalize()
    )

# Lower-case the hashtag text, drop the '#' markers and split it on whitespace,
# leaving a list of tags per row.
hashtag_text = social_df_cleaned['Hashtags'].str.lower()
social_df_cleaned['Hashtags'] = hashtag_text.str.replace('#', '').str.split()

# Platform x Sentiment table of post counts; missing combinations become 0.
platform_sentiment_sizes = social_df_cleaned.groupby(['Platform', 'Sentiment']).size()
sentiment_summary = platform_sentiment_sizes.unstack().fillna(0)
sentiment_summary


## Visualise

In [None]:
# Analyse each post once with TextBlob and read both scores from the same
# result, rather than building a separate TextBlob per score.
# str() keeps non-string values (e.g. NaN) from breaking TextBlob.
text_sentiments = [
    TextBlob(str(text)).sentiment for text in social_df_cleaned['Text']
]
social_df_cleaned['Polarity'] = [s.polarity for s in text_sentiments]
social_df_cleaned['Subjectivity'] = [s.subjectivity for s in text_sentiments]

def categorizedSentiment(polarity):
    """Map a polarity score to a sentiment label.

    Returns 'Positive' for scores above 0.1, 'Negative' for scores below -0.1,
    and 'Neutral' for everything else (both thresholds included).
    """
    if polarity > 0.1:
        return 'Positive'
    return 'Negative' if polarity < -0.1 else 'Neutral'

# Label every post from its polarity score.
polarity_scores = social_df_cleaned['Polarity']
social_df_cleaned['Sentiment_Category'] = polarity_scores.apply(categorizedSentiment)

# Write the scored data to disk (the row index is not written).
textblob_path = 'categorizedSentiment.csv'
social_df_cleaned.to_csv(textblob_path, index=False)

# Number of posts per sentiment category.
textblob_summary = social_df_cleaned['Sentiment_Category'].value_counts()

textblob_summary


In [None]:
# Bar chart of the number of posts in each sentiment category.
sns.set_theme(style="whitegrid")
plt.figure(figsize=(6, 4))
# hue mirrors x so the palette is applied per bar; the legend would only repeat
# the x tick labels, so it is turned off.
sns.countplot(
    data=social_df_cleaned,
    x='Sentiment_Category',
    hue='Sentiment_Category',
    palette='viridis',
    legend=False,
)
plt.title('Overall Sentiment Distribution', fontsize=14)
plt.xlabel('Sentiment Category')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig('Categorical_Sentiments_Count.png')
plt.show()


In [None]:
# Donut chart of the share of posts in each sentiment category.
sentiment_counts = social_df_cleaned['Sentiment_Category'].value_counts()
labels, sizes = sentiment_counts.index, sentiment_counts.values
colors = sns.color_palette('viridis', len(labels))

plt.figure(figsize=(5, 4))
# A wedge width below 1 leaves the centre empty, which turns the pie into a donut.
wedges, texts, autotexts = plt.pie(
    sizes,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=140,
    wedgeprops={'width': 0.4, 'edgecolor': 'w'},
    textprops=dict(fontsize=10),
)

plt.title('Overall Sentiment Distribution', fontsize=14)
plt.axis('equal')
plt.tight_layout()
plt.savefig('Sentiment_Distribution_DonutChart.png')
plt.show()


In [None]:
# Pie chart of the share of posts per platform.
platform_counts = social_df_cleaned['Platform'].value_counts()
colors = sns.color_palette('viridis', len(platform_counts))

plt.figure(figsize=(4, 5))
plt.pie(
    platform_counts.values,
    labels=platform_counts.index,
    colors=colors,
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('Platform Usage Distribution')
plt.axis('equal')
plt.tight_layout()
plt.savefig('Platform_Distribution_PieChart.png')
plt.show()


In [None]:
# Grouped bar chart: posts per platform, split by sentiment category.
plt.figure(figsize=(8, 4))
sns.set_theme(style="whitegrid")

ax = sns.countplot(
    data=social_df_cleaned,
    x='Platform',
    hue='Sentiment_Category',
    palette='viridis'
)

# Write the count above every non-empty bar. Bar heights come back as floats,
# so cast to int to label a bar '12' rather than '12.0'.
for p in ax.patches:
    height = p.get_height()
    if height > 0:
        ax.annotate(f'{int(height)}', (p.get_x() + p.get_width() / 2., height),
                    ha='center', va='bottom', fontsize=9)

plt.title('Sentiment Distribution by Platform', fontsize=14)
plt.xlabel('Platform')
plt.ylabel('Count')
plt.legend(title='Sentiment')
plt.tight_layout()
plt.savefig('Sentiment_by_Platform.png')
plt.show()


In [None]:
# Characters that are never part of a tag: quotes, list brackets and '#'.
_HASHTAG_JUNK = str.maketrans('', '', '\'"[]#')


def _clean_hashtags(raw):
    """Return the lower-cased, non-empty tags found in one 'Hashtags' cell.

    `raw` may be a list/tuple of tags or a single string. Commas (and, for a
    string, whitespace) separate tags; quotes, brackets and '#' are removed.
    """
    pieces = raw if isinstance(raw, (list, tuple)) else str(raw).split()
    tags = []
    for piece in pieces:
        for candidate in str(piece).split(','):
            tag = candidate.translate(_HASHTAG_JUNK).strip().lower()
            if tag:  # skip fragments that are empty once cleaned
                tags.append(tag)
    return tags


# Build the one-row-per-hashtag frame on a copy, so the 'Hashtags' column of
# social_df_cleaned is left exactly as it was.
exploded_df = social_df_cleaned.dropna(subset=['Hashtags']).copy()
exploded_df['Hashtags'] = exploded_df['Hashtags'].apply(_clean_hashtags)
# Rows whose tag list is empty explode to NaN and are dropped here.
exploded_df = exploded_df.explode('Hashtags').dropna(subset=['Hashtags'])

platforms = exploded_df['Platform'].unique()

# platform -> [(hashtag, count), ...] for its ten most frequent hashtags
platform_hashtag_data = {}

for platform in platforms:
    platform_hashtags = exploded_df.loc[exploded_df['Platform'] == platform, 'Hashtags']
    platform_hashtag_data[platform] = Counter(platform_hashtags).most_common(10)

# One horizontal bar chart per platform, most frequent hashtag at the top.
for platform, hashtags in platform_hashtag_data.items():
    if not hashtags:
        continue
    labels, values = zip(*hashtags)
    colors = sns.color_palette("viridis", len(labels))

    plt.figure(figsize=(10, 4))
    bars = plt.barh(labels, values, color=colors)

    # Print each count just to the right of the end of its bar.
    for bar in bars:
        width = bar.get_width()
        plt.text(width + 0.5, bar.get_y() + bar.get_height() / 2, f'{int(width)}',
                 va='center', fontsize=9)

    plt.xlabel('Frequency')
    plt.title(f'Top 10 Hashtags on {platform}')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig(f'top_10_hashtags_{platform.lower().replace(" ", "_")}.png')
    plt.show()


In [None]:
# Number of posts for every (Platform, Country) pair.
country_platform_counts = (
    social_df_cleaned
    .groupby(['Platform', 'Country'])
    .size()
    .reset_index(name='Count')
)

# Top 5 countries per platform: sort each platform's rows by descending count,
# then keep the first five rows of every platform group.
top_countries_per_platform = (
    country_platform_counts
    .sort_values(['Platform', 'Count'], ascending=[True, False])
    .groupby('Platform')
    .head(5)
)

# One bar-chart facet per platform, wrapped three to a row.
g = sns.catplot(
    data=top_countries_per_platform,
    x='Count', y='Country',
    hue='Country',
    col='Platform',
    kind='bar',
    col_wrap=3,
    height=4,
    aspect=1.2,
    palette='tab10',
    legend=False,
)

g.set_titles("{col_name}")
g.set_axis_labels("Number of Posts", "Country")
g.figure.suptitle("Top 5 Countries per Platform", fontsize=16, y=1.05)
plt.tight_layout()
# The suptitle is placed above the figure area (y > 1), so save with a tight
# bounding box; otherwise it is cut off in the PNG.
plt.savefig('Top_Countries_by_Platform.png', bbox_inches='tight')
plt.show()


In [None]:
# Ten countries with the most records, as a two-column frame.
top_country_counts = social_df_cleaned['Country'].value_counts().head(10)
country_usage_counts = top_country_counts.reset_index()
# Name the columns explicitly instead of relying on reset_index's defaults.
country_usage_counts.columns = ['Country', 'UsageCount']

# Horizontal bar chart of those counts.
plt.figure(figsize=(10, 6))
sns.barplot(
    data=country_usage_counts,
    x='UsageCount',
    y='Country',
    hue='Country',
    palette='Blues_d',
    legend=False,
)
plt.title('Top 10 Countries by Social Media Usage')
plt.xlabel('Number of Records')
plt.ylabel('Country')
plt.tight_layout()
plt.show()


## Power BI Data

In [None]:

# Reload the scored data written to 'categorizedSentiment.csv'.
power_bi_df = pd.read_csv("categorizedSentiment.csv")
power_bi_df['Country'] = power_bi_df['Country'].str.strip().str.title()

# ast.literal_eval raises on missing (NaN) values, so parse only the rows that
# actually have hashtags.
power_data = power_bi_df.dropna(subset=['Hashtags']).copy()
# The CSV holds each hashtag list as text (e.g. "['a', 'b']"); literal_eval
# turns that text back into a Python list.
power_data['Hashtags'] = power_data['Hashtags'].apply(ast.literal_eval)

# One output row per (post, hashtag) pair.
df_exploded = power_data.explode('Hashtags').reset_index(drop=True)

df_exploded.to_csv("power_bi_data.csv", index=False)

