In [7]:
import os

companies = ['samsung', 'apple', 'nintendo']

# Files whose size is at or below this many bytes are treated as empty.
# NOTE(review): 2 bytes presumably allows for a lone "\n" / "\r\n" — confirm.
EMPTY_FILE_MAX_BYTES = 2

# Walk <base_dir>/<company>/<date_folder>/ and delete every empty CSV file.
# NOTE(review): base_dir is not assigned in this cell; it must already exist in
# the kernel when this cell runs — confirm an earlier cell defines it.
for company in companies:
    company_path = os.path.join(base_dir, company)

    for date_folder in os.listdir(company_path):
        date_path = os.path.join(company_path, date_folder)

        # The company folder may contain stray non-directory entries
        # (OS metadata files, notes, ...); os.listdir() on those would raise
        # NotADirectoryError and abort the whole cleanup.
        if not os.path.isdir(date_path):
            continue

        for csv_file in os.listdir(date_path):
            if not csv_file.endswith(".csv"):
                continue

            file_path = os.path.join(date_path, csv_file)

            # Check if the file is empty
            if os.stat(file_path).st_size <= EMPTY_FILE_MAX_BYTES:
                os.remove(file_path)
                print(f"Removed empty file: {file_path}")

Removed empty file: ../data_scraped/samsung\2025-04-14\1886751850436931953.csv
Removed empty file: ../data_scraped/samsung\2025-04-14\1894284344283599133.csv
Removed empty file: ../data_scraped/samsung\2025-04-14\1897497432537358622.csv


In [9]:
import pandas as pd
import os
from textblob import TextBlob

base_dir = "../data_scraped/"
companies = ['samsung', 'apple', 'nintendo']
data = []

# Build one summary record per scraped tweet file found under
# <base_dir>/<company>/<date_folder>/<tweet_id>.csv.
# Each CSV is read with the first row as the main tweet and all remaining rows
# as its comments; the columns accessed below (text, username, replies,
# reposts, likes, bookmarks, views) must be present in every file.
for company in companies:
    company_path = os.path.join(base_dir, company)

    for date_folder in os.listdir(company_path):
        date_path = os.path.join(company_path, date_folder)

        # Ignore stray non-directory entries in the company folder;
        # os.listdir() on a regular file raises NotADirectoryError.
        if not os.path.isdir(date_path):
            continue

        for csv_file in os.listdir(date_path):
            if not csv_file.endswith(".csv"):
                continue

            file_path = os.path.join(date_path, csv_file)
            try:
                df = pd.read_csv(file_path)
            except pd.errors.EmptyDataError:
                # A file with no header/columns at all carries no tweet;
                # skip it just like a header-only file below, so this cell
                # does not depend on a separate cleanup step having run first.
                continue

            if df.empty:
                continue

            # Main tweet is first row, everything after it is a comment
            main_tweet = df.iloc[0]
            comments = df.iloc[1:]

            # Sentiment analysis: TextBlob polarity of the main tweet and of
            # every comment that has text.
            main_sentiment = TextBlob(str(main_tweet['text'])).sentiment.polarity
            comment_sentiments = comments['text'].dropna().apply(
                lambda x: TextBlob(str(x)).sentiment.polarity
            )

            # No usable comments -> None rather than a NaN mean
            avg_comment_sentiment = comment_sentiments.mean() if not comment_sentiments.empty else None

            data.append({
                "tweet_id": csv_file.replace(".csv", ""),
                "company": company,
                "date": date_folder,
                "username": main_tweet['username'],
                "text": main_tweet['text'],
                "replies": main_tweet['replies'],
                "reposts": main_tweet['reposts'],
                "likes": main_tweet['likes'],
                "bookmarks": main_tweet['bookmarks'],
                "views": main_tweet['views'],
                # Counts all comment rows, including those without text
                "comment_count": len(comments),
                "total_engagement": main_tweet['replies'] + main_tweet['likes'] + main_tweet['bookmarks'] + main_tweet['reposts'],
                "main_sentiment": main_sentiment,
                "avg_comment_sentiment": avg_comment_sentiment
            })

# Create final DataFrame (one row per tweet file)
final_df = pd.DataFrame(data)


In [10]:
# Collapse the per-tweet table into one row per username.
# Named aggregation assigns the output column names directly:
#   total_comments   - number of rows with non-null text for that user
#   avg_sentiment    - mean of main_sentiment
#   total_*          - sums of likes / replies / bookmarks
#   preferred_brand  - first mode of company (None if there is no mode)
# engagement_score is then likes + replies + bookmarks.
user_df = (
    final_df
    .groupby("username")
    .agg(
        total_comments=("text", "count"),
        avg_sentiment=("main_sentiment", "mean"),
        total_likes=("likes", "sum"),
        total_replies=("replies", "sum"),
        total_bookmarks=("bookmarks", "sum"),
        preferred_brand=("company", lambda brands: next(iter(brands.mode()), None)),
    )
    .reset_index()
    .assign(
        engagement_score=lambda users: (
            users["total_likes"] + users["total_replies"] + users["total_bookmarks"]
        )
    )
)


In [11]:
from sklearn.preprocessing import StandardScaler

# Columns of user_df that are standardized for the downstream model.
features = [
    "total_comments",
    "avg_sentiment",
    "total_likes",
    "total_replies",
    "total_bookmarks",
    "engagement_score",
]

# Select the feature columns, then fit a StandardScaler on them and apply it
# to the same data.
X = user_df[features]
X_scaled = StandardScaler().fit(X).transform(X)
