In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Análise Exploratória dos Dados - Influenciadores do Instagram\n",
    "\n",
    "Este notebook contém uma análise detalhada do dataset de influenciadores do Instagram."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "%matplotlib inline\n",
    "# Matplotlib deprecated its bundled 'seaborn' style sheet in 3.6 and later removed\n",
    "# the name, so plt.style.use('seaborn') fails on current releases. seaborn's own\n",
    "# theme call gives a comparable look without depending on that style name.\n",
    "sns.set_theme()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Carregamento e Visão Geral dos Dados"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Read the influencer dataset from the CSV file (path is relative to this notebook)\n",
    "df = pd.read_csv('../data/influencers.csv')\n",
    "\n",
    "# Print the (rows, columns) shape, then preview the first five rows\n",
    "print(\"Dimensões do dataset:\", df.shape)\n",
    "print(\"\\nPrimeiras linhas:\")\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Análise das Features Numéricas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def convert_to_numeric(value):\n",
    "    \"\"\"Turn an abbreviated count such as '1.5k', '3m' or '2b' into a number.\n",
    "\n",
    "    ints and floats are returned unchanged. Missing values and text that\n",
    "    cannot be parsed yield NaN. Suffix matching ignores case and\n",
    "    surrounding whitespace.\n",
    "    \"\"\"\n",
    "    if isinstance(value, (int, float)):\n",
    "        return value\n",
    "    if pd.isna(value):\n",
    "        return np.nan\n",
    "\n",
    "    text = str(value).lower().strip()\n",
    "    # The last character selects the multiplier; None means no known suffix.\n",
    "    scale = {'k': 1000, 'm': 1000000, 'b': 1000000000}.get(text[-1:])\n",
    "    # With a recognised suffix only the part before it is parsed as the number.\n",
    "    digits = text if scale is None else text[:-1]\n",
    "    try:\n",
    "        parsed = float(digits)\n",
    "    except ValueError:\n",
    "        return np.nan\n",
    "    return parsed if scale is None else parsed * scale\n",
    "\n",
    "# Run every count column through convert_to_numeric\n",
    "numeric_cols = ['posts', 'followers', 'avg_likes', 'total_likes']\n",
    "for col in numeric_cols:\n",
    "    df[col] = df[col].apply(convert_to_numeric)\n",
    "\n",
    "# Convert the engagement rate to float by dropping a trailing '%'.\n",
    "# Casting to str first keeps this cell re-runnable: once the column is numeric the\n",
    "# .str accessor alone would raise. Entries that cannot be parsed become NaN instead\n",
    "# of aborting the whole conversion, matching what convert_to_numeric does.\n",
    "df['60_day_eng_rate'] = pd.to_numeric(\n",
    "    df['60_day_eng_rate'].astype(str).str.strip().str.rstrip('%'),\n",
    "    errors='coerce',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Análise de Correlações"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Pairwise correlation between the selected variables\n",
    "correlation_vars = ['influence_score', 'posts', 'followers', 'avg_likes', '60_day_eng_rate']\n",
    "correlation = df[correlation_vars].corr()\n",
    "\n",
    "# Annotated heatmap on an explicit Axes; the diverging palette is centred on zero\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, ax=ax)\n",
    "ax.set_title('Matriz de Correlação')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Distribuição das Variáveis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# One histogram per variable, laid out on a 2x2 grid\n",
    "fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))\n",
    "axes = axes.ravel()\n",
    "\n",
    "# Pair each flattened Axes with the column it should display\n",
    "for ax, col in zip(axes, ['followers', 'posts', 'avg_likes', '60_day_eng_rate']):\n",
    "    sns.histplot(data=df, x=col, ax=ax)\n",
    "    ax.set_title(f'Distribuição de {col}')\n",
    "\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Análise por País"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Bar chart of the ten countries with the most influencers\n",
    "top_countries = df['country'].value_counts().head(10)\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "top_countries.plot.bar(ax=ax, rot=45)\n",
    "ax.set_title('Top 10 Países com Mais Influenciadores')\n",
    "ax.set_xlabel('País')\n",
    "ax.set_ylabel('Número de Influenciadores')\n",
    "fig.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Mean engagement rate per country, highest first\n",
    "engagement_by_country = (\n",
    "    df.groupby('country')['60_day_eng_rate']\n",
    "    .mean()\n",
    "    .sort_values(ascending=False)\n",
    ")\n",
    "print(\"\\nMédia de engajamento por país (top 10):\")\n",
    "print(engagement_by_country.head(10))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Análise de Outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Boxplots of the main features on one shared Axes, to spot outliers\n",
    "fig, ax = plt.subplots(figsize=(15, 5))\n",
    "df[['influence_score', 'posts', 'followers', 'avg_likes']].boxplot(ax=ax)\n",
    "ax.set_title('Boxplot das Features Principais')\n",
    "plt.xticks(rotation=45)\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 }
}