Using VADER (Valence Aware Dictionary and sEntiment Reasoner), we will identify which sports generate the most positive or negative discussions in the data web-scraped from Bluesky.

In [1]:
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk import download
import os

In [2]:
# Download the VADER lexicon that SentimentIntensityAnalyzer needs.
# The recorded output of this cell shows the package reported as already up-to-date,
# so the call was a no-op on that run.
download('vader_lexicon')

# Create the VADER analyzer and bind it to `sia`; process_csv scores text through this object
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Jair4\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
# Folder locations: CSVs are read from input_folder and the scored copies go to output_folder
input_folder, output_folder = "DataMiningProcessing", "Processed"

# Make sure the destination exists; exist_ok=True keeps this cell safe to re-run
os.makedirs(output_folder, exist_ok=True)

In [4]:
def _label_sentiment(score):
    """Map a compound score to 'Positive', 'Negative' or 'Neutral'.

    The cut-offs are strict: a score of exactly 0.05 or -0.05 is 'Neutral'.
    """
    if score > 0.05:
        return 'Positive'
    if score < -0.05:
        return 'Negative'
    return 'Neutral'


def process_csv(file_path, output_path, analyzer=None, text_column='content'):
    """Score every row of a CSV for sentiment and save the annotated copy.

    Two columns are added to the data read from ``file_path``:

    * ``Sentiment_Score`` - the ``'compound'`` value returned by
      ``analyzer.polarity_scores`` for the row's text, e.g. 0.6696 from
      ``{'neg': 0.0, 'neu': 0.182, 'pos': 0.818, 'compound': 0.6696}``.
    * ``Sentiment_Label`` - 'Positive', 'Negative' or 'Neutral', derived
      from that score.

    The result is written to ``output_path`` without the index column and a
    confirmation line is printed.

    Parameters
    ----------
    file_path : str
        CSV file to read.
    output_path : str
        CSV file to write.
    analyzer : object, optional
        Anything with a ``polarity_scores(text)`` method returning a mapping
        that has a ``'compound'`` key. When omitted, the notebook-level
        ``sia`` analyzer is used.
    text_column : str, optional
        Name of the column holding the text to score (default ``'content'``).

    Raises
    ------
    KeyError
        If ``text_column`` is not a column of the CSV.
    """
    if analyzer is None:
        # Fall back to the analyzer created earlier in the notebook
        analyzer = sia

    pf = pd.read_csv(file_path)

    if text_column not in pf.columns:
        raise KeyError(
            f"Column '{text_column}' not found in {file_path}; "
            f"available columns: {list(pf.columns)}"
        )

    def _compound(text):
        # str() so that non-string cells are still handed to the analyzer as text
        return analyzer.polarity_scores(str(text))['compound']

    pf['Sentiment_Score'] = pf[text_column].apply(_compound)
    pf['Sentiment_Label'] = pf['Sentiment_Score'].apply(_label_sentiment)

    # Save the new csv file with the scores and labels
    pf.to_csv(output_path, index=False)
    print(f"Processed and saved: {output_path}")

In [5]:
# Run process_csv on every CSV file found in input_folder.
# sorted() gives a reproducible processing order; os.listdir returns names in arbitrary order.
for file_name in sorted(os.listdir(input_folder)):
    if not file_name.endswith('.csv'):
        continue  # skip anything that is not a CSV

    input_path = os.path.join(input_folder, file_name)

    # Remove 'bluesky_posts_' so the output file carries only the sport's name;
    # that name is read back from the file name when the results are summarised
    output_file_name = file_name.replace("bluesky_posts_", "")
    output_path = os.path.join(output_folder, output_file_name)

    print(f"Processing file: {file_name}")
    process_csv(input_path, output_path)

Processing file: bluesky_posts_Archery.csv
Processed and saved: Processed\Archery.csv
Processing file: bluesky_posts_Artistic Gymnastics.csv
Processed and saved: Processed\Artistic Gymnastics.csv
Processing file: bluesky_posts_Artistic Swimming.csv
Processed and saved: Processed\Artistic Swimming.csv
Processing file: bluesky_posts_Athletics.csv
Processed and saved: Processed\Athletics.csv
Processing file: bluesky_posts_Badminton.csv
Processed and saved: Processed\Badminton.csv
Processing file: bluesky_posts_Basketball 3x3.csv
Processed and saved: Processed\Basketball 3x3.csv
Processing file: bluesky_posts_Basketball.csv
Processed and saved: Processed\Basketball.csv
Processing file: bluesky_posts_Beach Volleyball.csv
Processed and saved: Processed\Beach Volleyball.csv
Processing file: bluesky_posts_Boxing.csv
Processed and saved: Processed\Boxing.csv
Processing file: bluesky_posts_Breaking.csv
Processed and saved: Processed\Breaking.csv
Processing file: bluesky_posts_Canoe Slalom.csv
Pr

In [None]:
import pandas as pd
import os

# Summarise the scored CSVs: one row per sport with its average compound score
# and the label for that average, saved as an Excel workbook.

# Store the results (one dict per sport)
results = []
processed_folder = "Processed"

# The workbook goes one directory above 'Processed'. A dedicated name is used for
# that directory so the `output_folder` configured for the scored CSVs is not
# overwritten; otherwise re-running the processing loop after this cell would
# write the scored CSVs into the parent directory.
matrix_folder = os.path.abspath(os.path.join(processed_folder, ".."))
output_file = os.path.join(matrix_folder, "sports_sentiment_matrix.xlsx")

# Loop through the processed folder in sorted order so the rows come out reproducibly
for file_name in sorted(os.listdir(processed_folder)):
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(processed_folder, file_name)
    pf = pd.read_csv(file_path)

    # Average of the per-row compound scores written by process_csv
    avg_sentiment_score = pf['Sentiment_Score'].mean()

    # Label the average with the same strict cut-offs used for individual rows
    if avg_sentiment_score > 0.05:
        avg_sentiment_label = "Positive"
    elif avg_sentiment_score < -0.05:
        avg_sentiment_label = "Negative"
    else:
        avg_sentiment_label = "Neutral"

    # The sport name is the file name without its extension
    sport_name = os.path.splitext(file_name)[0]

    results.append({
        'Sport': sport_name,
        'Average Sentiment Score': avg_sentiment_score,
        'Sentiment Label': avg_sentiment_label
    })

# Ensure the destination folder exists (one directory above 'Processed')
os.makedirs(matrix_folder, exist_ok=True)

# Convert the collected records into a DataFrame and save it as an Excel file
sentiment_pf = pd.DataFrame(results)
sentiment_pf.to_excel(output_file, index=False, engine='openpyxl')

print(f"Sentiment analysis results saved to {output_file}")

Sentiment analysis results saved to c:\Users\Jair4\Desktop\Data_Mining_Project\Data-Processing\sports_sentiment_matrix.xlsx
