In [1]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model Development - Sentiment Analysis for Japanese Learning Apps\n",
    "\n",
    "This notebook demonstrates the development and training of sentiment analysis models."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import string\n",
    "from pathlib import Path\n",
    "\n",
    "import joblib\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import classification_report, confusion_matrix\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.pipeline import Pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Create Sample Training Data\n",
    "\n",
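    "Since we don't have access to raw review text, we'll create a small hand-labelled sample dataset for demonstration. With only 14 reviews, the metrics reported below are illustrative only."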
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hand-written demo reviews, grouped by the sentiment label they carry\n",
    "positive_texts = [\n",
    "    \"Great app for learning kanji characters\",\n",
    "    \"Love the vocabulary practice feature\",\n",
    "    \"Grammar exercises are very helpful\",\n",
    "    \"This app is amazing for Japanese study\",\n",
    "    \"Perfect for JLPT preparation\",\n",
    "    \"User interface is clean and intuitive\",\n",
    "    \"Excellent kanji recognition system\",\n",
    "    \"Good vocabulary builder\",\n",
    "]\n",
    "negative_texts = [\n",
    "    \"App crashes frequently\",\n",
    "    \"Too many ads interrupt learning\",\n",
    "    \"Interface is confusing\",\n",
    "    \"Not enough grammar explanations\",\n",
    "    \"Poor audio quality\",\n",
    "    \"Subscription too expensive\",\n",
    "]\n",
    "\n",
    "# (text, label) pairs: every positive example first, then every negative one\n",
    "sample_reviews = (\n",
    "    [(text, \"positive\") for text in positive_texts]\n",
    "    + [(text, \"negative\") for text in negative_texts]\n",
    ")\n",
    "\n",
    "# Tabular form used by the rest of the notebook\n",
    "df_sample = pd.DataFrame.from_records(sample_reviews, columns=['review', 'sentiment'])\n",
    "print(f\"Sample dataset shape: {df_sample.shape}\")\n",
    "print(df_sample.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Text Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Deletion table for ASCII punctuation, built once instead of on every call\n",
    "_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)\n",
    "\n",
    "\n",
    "def preprocess_text(text):\n",
    "    \"\"\"Normalise one review for the TF-IDF models.\n",
    "\n",
    "    The text is lower-cased, ASCII punctuation (``string.punctuation``) is\n",
    "    deleted, and runs of whitespace are collapsed to single spaces.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    text : str or None\n",
    "        Raw review. Missing values (``None``, ``NaN``, ``pd.NA``) are treated\n",
    "        as an empty review; any other non-string value is converted with\n",
    "        ``str()``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        The cleaned text, or ``''`` for missing input.\n",
    "    \"\"\"\n",
    "    if not isinstance(text, str):\n",
    "        # Without this guard ``.lower()`` raises AttributeError on None/NaN,\n",
    "        # which aborts the whole ``Series.apply`` call.\n",
    "        if pd.api.types.is_scalar(text) and pd.isna(text):\n",
    "            return ''\n",
    "        text = str(text)\n",
    "\n",
    "    # Lower-case, then delete punctuation characters outright (no space is\n",
    "    # inserted, so \"don't\" becomes \"dont\")\n",
    "    text = text.lower().translate(_PUNCTUATION_TABLE)\n",
    "\n",
    "    # Collapse any run of whitespace and trim both ends\n",
    "    return ' '.join(text.split())\n",
    "\n",
    "# Store the cleaned text in its own column so the raw review stays available\n",
    "df_sample['processed_review'] = df_sample['review'].map(preprocess_text)\n",
    "preview_columns = ['review', 'processed_review', 'sentiment']\n",
    "print(df_sample.loc[:, preview_columns].head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Model Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features are the cleaned review texts; targets are the sentiment labels\n",
    "X = df_sample['processed_review']\n",
    "y = df_sample['sentiment']\n",
    "\n",
    "# Hold out 30% of the rows, stratified on the label so class proportions\n",
    "# are preserved; the fixed seed makes the split repeatable\n",
    "split_options = {'test_size': 0.3, 'random_state': 42, 'stratify': y}\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)\n",
    "\n",
    "print(f\"Training set size: {len(X_train)}\")\n",
    "print(f\"Test set size: {len(X_test)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The stock English stop-word list also removes negation words. Those carry\n",
    "# the sentiment here (\"Not enough grammar explanations\" would lose its\n",
    "# \"not\"), so they are kept as features while other function words are dropped.\n",
    "NEGATION_WORDS = {'no', 'nor', 'not', 'never', 'cannot', 'too'}\n",
    "SENTIMENT_STOP_WORDS = sorted(ENGLISH_STOP_WORDS - NEGATION_WORDS)\n",
    "\n",
    "\n",
    "def build_pipeline(classifier):\n",
    "    \"\"\"Return a TF-IDF + ``classifier`` pipeline.\n",
    "\n",
    "    Every candidate model shares the same vectorizer settings, so they are\n",
    "    defined once here instead of being repeated per model.\n",
    "    \"\"\"\n",
    "    return Pipeline([\n",
    "        ('tfidf', TfidfVectorizer(max_features=1000, stop_words=SENTIMENT_STOP_WORDS)),\n",
    "        ('classifier', classifier)\n",
    "    ])\n",
    "\n",
    "\n",
    "# Candidate models, keyed by display name\n",
    "models = {\n",
    "    'Logistic Regression': build_pipeline(LogisticRegression(random_state=42)),\n",
    "    'Naive Bayes': build_pipeline(MultinomialNB())\n",
    "}\n",
    "\n",
    "# Fit every candidate on the training split and score it on the held-out split\n",
    "results = {}\n",
    "for name, model in models.items():\n",
    "    print(f\"\\nTraining {name}...\")\n",
    "    model.fit(X_train, y_train)\n",
    "\n",
    "    # Predictions on the held-out reviews\n",
    "    y_pred = model.predict(X_test)\n",
    "\n",
    "    # Keep the fitted pipeline, its predictions and its accuracy for later cells\n",
    "    results[name] = {\n",
    "        'model': model,\n",
    "        'predictions': y_pred,\n",
    "        'accuracy': model.score(X_test, y_test)\n",
    "    }\n",
    "\n",
    "    print(f\"Accuracy: {results[name]['accuracy']:.3f}\")\n",
    "    print(\"Classification Report:\")\n",
    "    # zero_division=0 reports 0.0 without a warning when a class is never\n",
    "    # predicted, which can easily happen on a test split this small\n",
    "    print(classification_report(y_test, y_pred, zero_division=0))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Model Selection and Saving"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pick the candidate with the highest held-out accuracy\n",
    "# (on a tie, max() keeps the first one listed)\n",
    "best_model_name = max(results, key=lambda k: results[k]['accuracy'])\n",
    "best_model = results[best_model_name]['model']\n",
    "\n",
    "print(f\"Best model: {best_model_name} (Accuracy: {results[best_model_name]['accuracy']:.3f})\")\n",
    "\n",
    "# Save the best model (the whole pipeline: vectorizer + classifier)\n",
    "model_path = '../models/sentiment_model.pkl'\n",
    "# joblib.dump does not create missing directories, so make sure the target\n",
    "# folder exists; otherwise a fresh checkout fails with FileNotFoundError\n",
    "Path(model_path).parent.mkdir(parents=True, exist_ok=True)\n",
    "joblib.dump(best_model, model_path)\n",
    "print(f\"Model saved to {model_path}\")\n",
    "\n",
    "# Also save the fitted vectorizer separately for easier access\n",
    "vectorizer = best_model.named_steps['tfidf']\n",
    "vectorizer_path = '../models/vectorizer.pkl'\n",
    "Path(vectorizer_path).parent.mkdir(parents=True, exist_ok=True)\n",
    "joblib.dump(vectorizer, vectorizer_path)\n",
    "print(f\"Vectorizer saved to {vectorizer_path}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Feature Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect which terms drive the Logistic Regression decision\n",
    "if 'Logistic Regression' in results:\n",
    "    lr_model = results['Logistic Regression']['model']\n",
    "    steps = lr_model.named_steps\n",
    "    vectorizer, classifier = steps['tfidf'], steps['classifier']\n",
    "\n",
    "    # One coefficient per vocabulary term (first row of coef_)\n",
    "    feature_names = vectorizer.get_feature_names_out()\n",
    "    coefficients = classifier.coef_[0]\n",
    "\n",
    "    # Rank terms by absolute weight, strongest first\n",
    "    feature_df = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})\n",
    "    feature_df = feature_df.sort_values('coefficient', key=abs, ascending=False)\n",
    "\n",
    "    print(\"Top 10 Most Important Features:\")\n",
    "    print(feature_df.head(10))\n",
    "\n",
    "    # Horizontal bars: red for negative coefficients, green otherwise\n",
    "    top_features = feature_df.head(20)\n",
    "    colors = np.where(top_features['coefficient'] < 0, 'red', 'green').tolist()\n",
    "    positions = range(len(top_features))\n",
    "\n",
    "    fig, ax = plt.subplots(figsize=(12, 6))\n",
    "    ax.barh(positions, top_features['coefficient'], color=colors)\n",
    "    ax.set_yticks(positions)\n",
    "    ax.set_yticklabels(top_features['feature'])\n",
    "    ax.set_xlabel('Coefficient Value')\n",
    "    ax.set_title('Top 20 Feature Importance (Logistic Regression)')\n",
    "    ax.axvline(x=0, color='black', linestyle='--', alpha=0.7)\n",
    "    fig.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Test Model Predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Unseen example reviews for a quick sanity check of the selected model\n",
    "test_reviews = [\n",
    "    \"This kanji learning app is fantastic!\",\n",
    "    \"The vocabulary exercises are really helpful\",\n",
    "    \"Grammar section needs improvement\",\n",
    "    \"App is slow and buggy\",\n",
    "    \"Perfect for JLPT study\"\n",
    "]\n",
    "\n",
    "print(\"Test Predictions:\")\n",
    "\n",
    "# Clean and score all examples in one batch, then report them one by one\n",
    "processed_reviews = [preprocess_text(review) for review in test_reviews]\n",
    "predictions = best_model.predict(processed_reviews)\n",
    "probabilities = best_model.predict_proba(processed_reviews)\n",
    "\n",
    "for review, prediction, probability in zip(test_reviews, predictions, probabilities):\n",
    "    print(f\"Review: '{review}'\")\n",
    "    print(f\"Prediction: {prediction}\")\n",
    "    # Confidence is the probability of the most likely class\n",
    "    print(f\"Confidence: {max(probability):.3f}\")\n",
    "    print(\"-\" * 50)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

NameError: name 'null' is not defined