In [2]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Exploration - Japanese Language Learning Apps Sentiment Analysis\n",
    "\n",
    "This notebook explores the sentiment analysis data from 6 Japanese language learning applications."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import json\n",
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "# Set up plotting style\n",
    "# NOTE(review): the 'seaborn-v0_8' style-sheet name presumably requires\n",
    "# matplotlib >= 3.6 (older releases call it 'seaborn') -- confirm against the\n",
    "# environment this notebook is run in.\n",
    "plt.style.use('seaborn-v0_8')\n",
    "# Make seaborn's 'husl' palette the default colour cycle for the plots below.\n",
    "sns.set_palette(\"husl\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load every per-app sentiment JSON file found in the raw data directory.\n",
    "# `apps_data` maps an app name (the file stem with the 'hasil_sentimen_' prefix\n",
    "# and any '_agregat' marker removed) to the parsed JSON content of that file.\n",
    "data_dir = Path('../data/raw')\n",
    "apps_data = {}\n",
    "\n",
    "# glob() yields files in arbitrary, filesystem-dependent order; sorting keeps the\n",
    "# app order (and everything derived from it) reproducible across machines.\n",
    "for file_name in sorted(data_dir.glob('hasil_sentimen_*.json')):\n",
    "    app_name = file_name.stem.replace('hasil_sentimen_', '').replace('_agregat', '')\n",
    "    with open(file_name, 'r', encoding='utf-8') as f:\n",
    "        apps_data[app_name] = json.load(f)\n",
    "\n",
    "# Fail here with a clear message instead of an opaque KeyError in a later cell\n",
    "# when the directory is missing or contains no matching files.\n",
    "if not apps_data:\n",
    "    raise FileNotFoundError(\n",
    "        f\"No 'hasil_sentimen_*.json' files found in {data_dir.resolve()}\"\n",
    "    )\n",
    "\n",
    "print(\"Loaded data for apps:\", list(apps_data.keys()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Data Overview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create summary DataFrame\n",
    "# Flattens the nested {app: {feature: {'positive': n, 'negative': m}}} mapping\n",
    "# in `apps_data` into one row per (app, feature) pair.\n",
    "summary_data = []\n",
    "\n",
    "for app_name, app_data in apps_data.items():\n",
    "    for feature, sentiment in app_data.items():\n",
    "        positive = sentiment['positive']\n",
    "        negative = sentiment['negative']\n",
    "        total = positive + negative\n",
    "        # A feature without any reviews gets 0.0 (a float, like every other row)\n",
    "        # instead of triggering a division by zero.\n",
    "        percentage = positive / total * 100 if total > 0 else 0.0\n",
    "\n",
    "        summary_data.append({\n",
    "            'App': app_name.title(),\n",
    "            'Feature': feature,\n",
    "            'Positive': positive,\n",
    "            'Negative': negative,\n",
    "            'Total': total,\n",
    "            'Positive_Percentage': percentage\n",
    "        })\n",
    "\n",
    "df = pd.DataFrame(summary_data)\n",
    "\n",
    "# A bare last expression renders the rich HTML table rather than the plain-text\n",
    "# output that print() produces.\n",
    "df.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bar chart of review volume per app, largest first.\n",
    "# `app_totals` stays a notebook-level name: the statistical summary prints it.\n",
    "app_totals = df.groupby('App')['Total'].sum().sort_values(ascending=False)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "app_totals.plot.bar(ax=ax)\n",
    "ax.set_title('Total Reviews per Application')\n",
    "ax.set_xlabel('Application')\n",
    "ax.set_ylabel('Number of Reviews')\n",
    "ax.tick_params(axis='x', rotation=45)\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Heatmap of positive reviews by feature\n",
    "# Rows are apps, columns are features, cell values are positive-review counts.\n",
    "pivot_df = df.pivot(index='App', columns='Feature', values='Positive')\n",
    "\n",
    "plt.figure(figsize=(10, 8))\n",
    "# An app that lacks one of the features leaves NaN in the pivot, which turns the\n",
    "# data into floats; the integer format 'd' then raises ValueError while the\n",
    "# cells are annotated. '.0f' prints the same whole numbers for int and float\n",
    "# data, and seaborn leaves the NaN cells blank.\n",
    "sns.heatmap(pivot_df, annot=True, cmap='YlOrRd', fmt='.0f')\n",
    "plt.title('Positive Reviews Heatmap by App and Feature')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One panel per feature: positive-review counts for every app, highest first.\n",
    "features = ['kanji', 'kotoba', 'bunpou']\n",
    "fig, axes = plt.subplots(1, len(features), figsize=(18, 6))\n",
    "\n",
    "for ax, feature in zip(axes, features):\n",
    "    ranked = df[df['Feature'] == feature].sort_values('Positive', ascending=False)\n",
    "    ax.bar(ranked['App'], ranked['Positive'])\n",
    "    ax.set(\n",
    "        title=f'{feature.title()} Feature Reviews',\n",
    "        xlabel='Application',\n",
    "        ylabel='Positive Reviews',\n",
    "    )\n",
    "    ax.tick_params(axis='x', rotation=45)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Statistical Summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Headline numbers for the whole dataset, then per-app and per-feature totals.\n",
    "n_apps = df['App'].nunique()\n",
    "n_features = df['Feature'].nunique()\n",
    "total_reviews = df['Total'].sum()\n",
    "total_positive = df['Positive'].sum()\n",
    "total_negative = df['Negative'].sum()\n",
    "\n",
    "print(\"Dataset Summary:\")\n",
    "print(f\"Total Applications: {n_apps}\")\n",
    "print(f\"Total Features: {n_features}\")\n",
    "print(f\"Total Reviews: {total_reviews}\")\n",
    "print(f\"Total Positive: {total_positive}\")\n",
    "print(f\"Total Negative: {total_negative}\")\n",
    "print(f\"Overall Positive Rate: {total_positive / total_reviews * 100:.2f}%\")\n",
    "\n",
    "# `app_totals` was computed in the 'Total reviews per app' plotting cell.\n",
    "print(\"\\nTop Apps by Total Reviews:\")\n",
    "print(app_totals)\n",
    "\n",
    "print(\"\\nFeature Popularity:\")\n",
    "feature_totals = df.groupby('Feature')['Total'].sum().sort_values(ascending=False)\n",
    "print(feature_totals)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

NameError: name 'null' is not defined