In [None]:
import json
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
)
import torch

# Load pre-trained overall sentiment analysis model and tokenizer
overall_model_name = (
    "nlptown/bert-base-multilingual-uncased-sentiment"  # Overall sentiment model
)
overall_tokenizer = AutoTokenizer.from_pretrained(overall_model_name)
overall_model = AutoModelForSequenceClassification.from_pretrained(overall_model_name)

# Load pre-trained ABSA model and tokenizer
absa_model_name = "absa/classifier-rest-0.2"  # Aspect-based sentiment analysis model
absa_tokenizer = AutoTokenizer.from_pretrained(absa_model_name)
absa_model = AutoModelForTokenClassification.from_pretrained(absa_model_name)


# Function to preprocess reviews for overall sentiment analysis
def preprocess_for_overall_sentiment(review_text, max_length=512):
    tokenized_input = overall_tokenizer(
        review_text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    return tokenized_input


# Function to preprocess reviews for aspect-based sentiment analysis
def preprocess_for_absa(review_text, max_length=512):
    tokenized_input = absa_tokenizer(
        review_text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    return tokenized_input


# Predict overall sentiment for a review
def predict_overall_sentiment(tokenized_input):
    with torch.no_grad():
        outputs = overall_model(**tokenized_input)
    logits = outputs.logits
    predicted_class = torch.argmax(
        logits, dim=1
    ).item()  # Class (e.g., 0=negative, 1=neutral, 2=positive)
    return predicted_class


# Predict aspects and their sentiment from the review using ABSA
def predict_aspects_and_sentiments(tokenized_input):
    with torch.no_grad():
        outputs = absa_model(**tokenized_input)
    logits = outputs.logits
    predicted_aspects = torch.argmax(
        logits, dim=2
    )  # Get the aspect sentiment classification
    return predicted_aspects


# Load review data from JSON file
def load_reviews_from_json(json_file):
    with open(json_file, "r", encode="utf-8") as f:
        data = json.load(f)
    return data["reviews"]


# Process reviews for both overall and aspect-based sentiment
def process_reviews(json_file):
    reviews_data = load_reviews_from_json(json_file)
    results = []

    for review in reviews_data:
        review_text = review.get("content")  # Adjust based on your JSON structure

        # Overall sentiment analysis
        tokenized_review_overall = preprocess_for_overall_sentiment(review_text)
        overall_sentiment = predict_overall_sentiment(tokenized_review_overall)

        # Aspect-based sentiment analysis
        tokenized_review_absa = preprocess_for_absa(review_text)
        aspects_sentiments = predict_aspects_and_sentiments(tokenized_review_absa)

        # Store the review text, overall sentiment, and aspect sentiments
        results.append(
            {
                "reviewId": review.get("reviewId"),
                "reviewText": review_text,
                "overallSentiment": overall_sentiment,  # Overall sentiment: 0=negative, 1=neutral, 2=positive
                "aspectsSentiments": aspects_sentiments,  # Aspect-sentiment pairs
            }
        )

    return results


# Convert the results to a DataFrame and save to CSV
def save_to_csv(results, output_csv):
    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)
    print(f"Results saved to {output_csv}")


# Main function to load, process, and save results
def main(json_file, output_csv):
    results = process_reviews(json_file)
    save_to_csv(results, output_csv)


# Example usage
json_file = "Habitica:GamifyYourTasks.json"  # Path to your JSON file containing reviews
output_csv = "processed_reviews_with_overall_and_aspects.csv"  # Output CSV file path

main(json_file, output_csv)

In [6]:
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import regex as re
import emoji


def clean_text(text):
    """Basic cleaning of text while preserving the multilingual characters"""
    if not isinstance(text, str):
        return ""

    # Remove HTML tags
    text = re.sub(r"<.*?>", "", text)
    text = emoji.demojize(text)
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.UNICODE)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text


# Load the pre-trained model and tokenizer
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Create a sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Load reviews data from a JSON file

In [13]:
def perform_sentimental_analysis(data, app_name):
    # Preprocess reviews and perform sentiment analysis
    results = []
    for review in data["reviews"]:
        review_id = review.get("reviewId")
        content = review.get("content")

        if content:  # Only analyze reviews that have content
            content = clean_text(content)
            sentiment = sentiment_analyzer(content)[0]  # Perform sentiment analysis
            sentiment_score = sentiment[
                "label"
            ]  # The model gives a label like "1 star", "2 stars", etc.

            # Store the result as a dictionary
            results.append(
                {
                    "reviewId": review_id,
                    "content": content,
                    "sentimentScore": sentiment_score,
                }
            )

    # Convert the results to a DataFrame
    df = pd.DataFrame(results)

    # Save the DataFrame to a CSV file
    df.to_csv(f"data/processed_data/play_store/{app_name}.csv", index=False)

    print(
        f"Sentiment analysis results for {app_name} saved to 'reviews_sentiment.csv'."
    )

In [15]:
from pathlib import Path

directory = Path("data/raw_data/playstore")
for file_path in directory.iterdir():
    if file_path.is_file():
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        perform_sentimental_analysis(data, file_path.stem)
        break

Sentiment analysis results for HabitShare-HabitTracker saved to 'reviews_sentiment.csv'.


In [4]:
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
import regex as re
import emoji
from pathlib import Path
from tqdm import tqdm
import multiprocessing
from functools import partial
import numpy as np


class ReviewDataset(Dataset):
    def __init__(self, reviews, tokenizer, max_length=512):
        self.reviews = reviews
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        return self.reviews[idx]


def clean_text(text):
    """Basic cleaning of text while preserving the multilingual characters"""
    if not isinstance(text, str):
        return ""

    # Remove HTML tags
    text = re.sub(r"<.*?>", "", text)
    text = emoji.demojize(text)

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.UNICODE)

    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text


def clean_reviews_batch(reviews):
    """Clean a batch of reviews in parallel"""
    with multiprocessing.Pool() as pool:
        cleaned_reviews = pool.map(clean_text, reviews)
    return cleaned_reviews


def collate_batch(batch, tokenizer, max_length=512):
    """Collate batch of reviews into tensor format"""
    return tokenizer(
        batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
    )


def process_batch(batch, model, device):
    """Process a batch of reviews and return sentiment scores"""
    with torch.no_grad():
        outputs = model(**{k: v.to(device) for k, v in batch.items()})
        predictions = torch.softmax(outputs.logits, dim=1)
        return predictions.cpu().numpy()


def perform_sentiment_analysis(data, app_name, model, tokenizer, device, batch_size=32):
    """Perform sentiment analysis on reviews in batches"""
    # Extract review content and IDs
    reviews = []
    review_ids = []
    for review in data["reviews"]:
        if review.get("content"):
            reviews.append(review["content"])
            review_ids.append(review["reviewId"])

    if not reviews:
        print(f"No reviews found for {app_name}")
        return

    # Clean reviews in parallel
    print(f"Cleaning reviews for {app_name}...")
    cleaned_reviews = clean_reviews_batch(reviews)

    # Create dataset and dataloader
    dataset = ReviewDataset(cleaned_reviews, tokenizer)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=partial(collate_batch, tokenizer=tokenizer),
        num_workers=4,
    )

    # Process batches
    all_predictions = []
    print(f"Processing reviews for {app_name}...")
    for batch in tqdm(dataloader):
        predictions = process_batch(batch, model, device)
        all_predictions.extend(predictions)

    # Convert predictions to sentiment scores (1-5)
    sentiment_scores = np.argmax(all_predictions, axis=1) + 1

    # Create DataFrame with results
    results_df = pd.DataFrame(
        {
            "reviewId": review_ids,
            "content": cleaned_reviews,
            "sentimentScore": sentiment_scores,
        }
    )

    # Save results
    output_path = Path(f"data/processed_data/play_store/{app_name}.csv")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    results_df.to_csv(output_path, index=False)

    print(f"Sentiment analysis results for {app_name} saved to {output_path}")


def main():
    # Set up device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load model and tokenizer
    print("Loading model and tokenizer...")
    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model = model.to(device)
    model.eval()

    # Process files
    directory = Path("data/raw_data/playstore")
    for file_path in directory.iterdir():
        if file_path.is_file():
            print(f"\nProcessing {file_path.stem}...")
            if Path(f'data/processed_data/play_store/{file_path.stem}.csv').exists():
                continue
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                perform_sentiment_analysis(
                    data, file_path.stem, model, tokenizer, device
                )
            except Exception as e:
                print(f"Error processing {file_path.stem}: {str(e)}")

if __name__ == "__main__":
    main()

Using device: cpu
Loading model and tokenizer...

Processing HabitShare-HabitTracker...

Processing Dreamfora:AIGoalSetting...

Processing Sectograph.Day&Timeplanner...

Processing NCalendar-Simpleplanner...

Processing Productive-Habittracker...

Processing HabitNowDailyRoutinePlanner...

Processing TickTick:ToDoList&Calendar...

Processing Engross:FocusTimer&To-Do...

Processing SaveMyTime-TimeTracker...

Processing Habitica:GamifyYourTasks...

Processing TheFor:HabitTracker...

Processing Prosper-DailyPlannerTodo...

Processing WaterDo:ToDoList&Schedule...

Processing MinimalistPomodoroTimer...

Processing Gratitude:Self-CareJournal...

Processing Forest:FocusforProductivity...

Processing Me+DailyRoutinePlanner...

Processing MoodTracker:Self-CareHabits...

Processing FabulousDailyRoutinePlanner...

Processing Habitify:DailyHabitTracker...

Processing ColorNoteNotepadNotesTodo...
Cleaning reviews for ColorNoteNotepadNotesTodo...
Processing reviews for ColorNoteNotepadNotesTodo...


 89%|████████▉ | 13152/14796 [7:55:42<36:28,  1.33s/it]       

In [3]:
import pandas as pd
from pathlib import Path
import os

# import shutil


def organize_reviews_by_sentiment(processed_data_dir="data/processed_data/play_store"):
    """
    Organize reviews into separate files based on sentiment scores for each app
    """
    processed_dir = Path(processed_data_dir)
    destination_dir = Path("organized_data")
    # Iterate through each app's CSV file
    for csv_file in processed_dir.glob("*.csv"):
        app_name = csv_file.stem
        print(f"\nProcessing {app_name}...")

        # Create app directory
        app_dir = destination_dir / app_name
        app_dir.mkdir(parents=True, exist_ok=True)

        # Read the CSV file
        try:
            df = pd.read_csv(csv_file)

            # Create summary statistics
            summary_stats = {
                "total_reviews": len(df),
                "sentiment_distribution": df["sentimentScore"].value_counts().to_dict(),
                "average_sentiment": df["sentimentScore"].mean(),
            }

            # Split reviews by sentiment score and save to separate files
            for sentiment in range(1, 6):
                sentiment_df = df[df["sentimentScore"] == sentiment]

                if not sentiment_df.empty:
                    # Save to sentiment-specific file
                    output_file = app_dir / f"{sentiment}_star_reviews.csv"
                    sentiment_df.to_csv(output_file, index=False)
                    print(
                        f"Saved {len(sentiment_df)} {sentiment}-star reviews to {output_file}"
                    )

            # Save summary statistics
            summary_file = app_dir / "sentiment_summary.txt"
            with open(summary_file, "w") as f:
                f.write(f"Sentiment Analysis Summary for {app_name}\n")
                f.write("=" * 50 + "\n\n")
                f.write(f"Total Reviews: {summary_stats['total_reviews']}\n\n")
                f.write("Sentiment Distribution:\n")
                for stars, count in sorted(
                    summary_stats["sentiment_distribution"].items()
                ):
                    percentage = (count / summary_stats["total_reviews"]) * 100
                    f.write(f"{stars} stars: {count} reviews ({percentage:.1f}%)\n")
                f.write(
                    f"\nAverage Sentiment Score: {summary_stats['average_sentiment']:.2f}"
                )

            
        except Exception as e:
            print(f"Error processing {app_name}: {str(e)}")
            continue


def main():
    print("Starting to organize reviews by sentiment...")
    organize_reviews_by_sentiment()
    print("\nProcess completed!")


if __name__ == "__main__":
    main()

Starting to organize reviews by sentiment...

Processing Sectograph.Day&Timeplanner...
Saved 466 1-star reviews to organized_data/Sectograph.Day&Timeplanner/1_star_reviews.csv
Saved 346 2-star reviews to organized_data/Sectograph.Day&Timeplanner/2_star_reviews.csv
Saved 817 3-star reviews to organized_data/Sectograph.Day&Timeplanner/3_star_reviews.csv
Saved 2371 4-star reviews to organized_data/Sectograph.Day&Timeplanner/4_star_reviews.csv
Saved 5026 5-star reviews to organized_data/Sectograph.Day&Timeplanner/5_star_reviews.csv

Processing TheFor:HabitTracker...
Saved 23 1-star reviews to organized_data/TheFor:HabitTracker/1_star_reviews.csv
Saved 21 2-star reviews to organized_data/TheFor:HabitTracker/2_star_reviews.csv
Saved 51 3-star reviews to organized_data/TheFor:HabitTracker/3_star_reviews.csv
Saved 101 4-star reviews to organized_data/TheFor:HabitTracker/4_star_reviews.csv
Saved 238 5-star reviews to organized_data/TheFor:HabitTracker/5_star_reviews.csv

Processing WaterDo:ToDo

In [None]:
import os
import time
import shutil
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
import pandas as pd
from pathlib import Path
from datetime import datetime


class SentimentFileHandler(FileSystemEventHandler):
    def __init__(self, source_dir, output_base_dir):
        self.source_dir = source_dir
        self.output_base_dir = output_base_dir

        # # Create output directories if they don't exist
        # self.directories = {
        #     'positive': os.path.join(output_base_dir, 'positive_sentiment'),
        #     'negative': os.path.join(output_base_dir, 'negative_sentiment'),
        #     'neutral': os.path.join(output_base_dir, 'neutral_sentiment'),
        #     'processed': os.path.join(output_base_dir, 'processed_files'),
        #     'logs': os.path.join(output_base_dir, 'logs')
        # }

        # for dir_path in self.directories.values():
        #     os.makedirs(dir_path, exist_ok=True)

    def on_created(self, event):
        if event.is_directory or not event.src_path.endswith(".csv"):
            return

        # Wait a short moment to ensure file is completely written
        time.sleep(1)

        try:
            # Process the newly created file
            self.organize_reviews_by_sentiment(event.src_path)
        except Exception as e:
            self.log_error(f"Error processing {event.src_path}: {str(e)}")

    def organize_reviews_by_sentiment(self, file_path):
        """
        Organize reviews into separate files based on sentiment scores for each app
        """
        csv_file = Path(file_path)
        destination_dir = Path(self.output_base_dir)
        # Iterate through each app's CSV file
        if csv_file.is_file:
            app_name = csv_file.stem
            print(f"\nProcessing {app_name}...")

            # Create app directory
            app_dir = destination_dir / app_name
            app_dir.mkdir(parents=True, exist_ok=True)

            # Read the CSV file
            try:
                df = pd.read_csv(csv_file)

                # Create summary statistics
                summary_stats = {
                    "total_reviews": len(df),
                    "sentiment_distribution": df["sentimentScore"]
                    .value_counts()
                    .to_dict(),
                    "average_sentiment": df["sentimentScore"].mean(),
                }

                # Split reviews by sentiment score and save to separate files
                for sentiment in range(1, 6):
                    sentiment_df = df[df["sentimentScore"] == sentiment]

                    if not sentiment_df.empty:
                        # Save to sentiment-specific file
                        output_file = app_dir / f"{sentiment}_star_reviews.csv"
                        sentiment_df.to_csv(output_file, index=False)
                        print(
                            f"Saved {len(sentiment_df)} {sentiment}-star reviews to {output_file}"
                        )

                # Save summary statistics
                summary_file = app_dir / "sentiment_summary.txt"
                with open(summary_file, "w") as f:
                    f.write(f"Sentiment Analysis Summary for {app_name}\n")
                    f.write("=" * 50 + "\n\n")
                    f.write(f"Total Reviews: {summary_stats['total_reviews']}\n\n")
                    f.write("Sentiment Distribution:\n")
                    for stars, count in sorted(
                        summary_stats["sentiment_distribution"].items()
                    ):
                        percentage = (count / summary_stats["total_reviews"]) * 100
                        f.write(f"{stars} stars: {count} reviews ({percentage:.1f}%)\n")
                    f.write(
                        f"\nAverage Sentiment Score: {summary_stats['average_sentiment']:.2f}"
                    )
                    # Log successful processing
                self.log_success(f"Processed and organized {filename}")

            except Exception as e:
                print(f"Error processing {app_name}: {str(e)}")

    # def process_sentiment_file(self, file_path):
    #     try:
    #         # Read the sentiment analysis results
    #         df = pd.read_csv(file_path)

    #         # Get the base filename
    #         filename = os.path.basename(file_path)

    #         # Determine overall sentiment (you may need to adjust this logic)
    #         avg_sentiment = df["sentiment"].mean() if "sentiment" in df.columns else 0

    #         # Choose destination based on average sentiment
    #         if avg_sentiment > 0.1:
    #             dest_dir = self.directories["positive"]
    #         elif avg_sentiment < -0.1:
    #             dest_dir = self.directories["negative"]
    #         else:
    #             dest_dir = self.directories["neutral"]

    #         # Move file to appropriate directory
    #         dest_path = os.path.join(dest_dir, filename)
    #         shutil.move(file_path, dest_path)

    #         # Create a copy in processed files directory
    #         processed_path = os.path.join(self.directories["processed"], filename)
    #         shutil.copy2(dest_path, processed_path)

    #         # Log successful processing
    #         self.log_success(f"Processed and organized {filename}")

    #     except Exception as e:
    #         raise Exception(f"Failed to process {file_path}: {str(e)}")

    def log_success(self, message):
        self._log_message("SUCCESS", message)

    def log_error(self, message):
        self._log_message("ERROR", message)

    def _log_message(self, level, message):
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_file = os.path.join(self.directories["logs"], "file_organization.log")
        with open(log_file, "a") as f:
            f.write(f"[{timestamp}] {level}: {message}\n")


def start_file_monitoring(source_dir, output_base_dir):
    """
    Start monitoring the source directory for new sentiment analysis files.

    Args:
        source_dir (str): Directory to monitor for new files
        output_base_dir (str): Base directory for organized output
    """
    event_handler = SentimentFileHandler(source_dir, output_base_dir)
    observer = Observer()
    observer.schedule(event_handler, source_dir, recursive=False)
    observer.start()

    print(f"Started monitoring {source_dir}")
    print(f"Organized files will be saved in {output_base_dir}")

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
        print("\nStopped monitoring.")

    observer.join()


if __name__ == "__main__":
    # Replace these with your actual directories
    SOURCE_DIR = "data/processed_data/play_store"  # Directory where sentiment analysis saves results
    OUTPUT_BASE_DIR = "organized_results"  # Base directory for organized files

    start_file_monitoring(SOURCE_DIR, OUTPUT_BASE_DIR)
