# DATA EXPLORATION

We will explore the data to detect `fake users`.

# 1. Loading packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last
# one; the cells below that list several expressions rely on this.
InteractiveShell.ast_node_interactivity = "all"

# 2. Data analysis

In [None]:
# Read the labelled event log that a future model will be trained on.
df = pd.read_csv(filepath_or_buffer='../data/fake_users.csv')

In [None]:
# Peek at the first two rows to see which columns are available.
df.head(n=2)

In [None]:
# Dimensions of the raw table as (number of rows, number of columns).
df.shape

In [None]:
# Number of distinct values in each key column.  The expressions are kept as
# separate top-level statements so that each result is displayed on its own.
df["UserId"].nunique()
df["Event"].nunique()
df["Category"].nunique()

In [None]:
# Distinct users carrying each label: first no-Fake (0), then Fake (1).
df.loc[df["Fake"] == 0, "UserId"].nunique()
df.loc[df["Fake"] == 1, "UserId"].nunique()

<div class="alert alert-block alert-info">
    <b>NOTE:</b>
    <p>
        We can observe that the dataset has in total:
        <ul>
            <li>5300 unique UserIds</li>
            <li>5 unique event types</li>
            <li>6 unique category types</li>
        </ul>
    All users are tagged as either <b>Fake</b> or <b>no-Fake</b>. If some users were tagged with both labels, we would have to remove them from the training set, since we would not have enough information to keep them.
    </p>
</div>

## 2.1. Fake vs No-Fake

In [None]:
# Split the event log by label so both groups can be profiled side by side.
df_Fake = df[df["Fake"].eq(1)]
df_no_Fake = df[df["Fake"].eq(0)]

In [None]:
# Frequency of each event type: no-Fake users first, Fake users second.
df_no_Fake["Event"].value_counts()
df_Fake["Event"].value_counts()

<div class="alert alert-block alert-info">
    <b>NOTE:</b>
    <p>
        We can observe that <b>Fake users</b> tend to perform some event types, such as <i>send_email</i> and <i>click_ad</i>, more often than no-Fake users. This information can guide the feature engineering used to train a classification model.
    </p>
</div>

In [None]:
# Frequency of each category: no-Fake users first, Fake users second.
df_no_Fake["Category"].value_counts()
df_Fake["Category"].value_counts()

In [None]:
# One row per user holding the number of distinct categories that user touched,
# computed separately for each label.
df_grouped_no_fake = (
    df_no_Fake.groupby("UserId")["Category"]
    .nunique()
    .to_frame("count_category")
)
df_grouped_fake = (
    df_Fake.groupby("UserId")["Category"]
    .nunique()
    .to_frame("count_category")
)

In [None]:
# Compare how many distinct categories Fake and no-Fake users interact with.
#
# Both histograms share one set of bin edges centred on the integer counts
# (a user cannot touch more categories than exist in the dataset), so bars at
# the same x position describe the same value.  They are drawn semi-transparent
# so the second histogram does not hide the first.
category_bins = np.arange(-0.5, df["Category"].nunique() + 1.5)
df_grouped_no_fake.count_category.plot.hist(bins=category_bins, alpha=0.5, label="No-Fake")
df_grouped_fake.count_category.plot.hist(bins=category_bins, alpha=0.5, label="Fake")
plt.title("Category Histogram")
plt.xlabel("Number of unique categories per user")
# Labels are attached to each histogram above, so the legend no longer depends
# on the order in which the two plots were drawn.
plt.legend()

<div class="alert alert-block alert-info">
    <b>NOTE:</b>
    <p>
        We can observe that <b>Fake users</b> tend to interact with more categories than <b>no-Fake users</b>. In general, Fake users interact with at least <b>5 categories</b>.
    </p>
</div>

We will check whether we can separate both types of users using only 2 features: the count of events and the number of unique categories the user interacts with.

In [None]:
# Per (user, label) pair: total number of events and number of distinct
# categories.  Keys of the mapping become the output column names.
per_user_aggregations = {
    "count_event": ("Event", "count"),
    "count_category": ("Category", "nunique"),
}
df_grouped = df.groupby(["UserId", "Fake"]).agg(**per_user_aggregations)

In [None]:
# Move both grouping keys out of the index and back into ordinary columns.
df_grouped = df_grouped.reset_index(level=["UserId", "Fake"])

In [None]:
# Scatter of the two candidate features, coloured by label.
# Both axes are integer counts, so several users can land on exactly the same
# point; semi-transparent markers keep such overlaps visible.
plt.scatter(df_grouped["count_event"], df_grouped["count_category"], c=df_grouped["Fake"], alpha=0.5)
plt.xlabel("Number of events per user")
plt.ylabel("Number of unique categories per user")
plt.title("Events vs. categories per user")
# The colour encodes the Fake label; the colorbar makes that mapping explicit.
plt.colorbar(label="Fake")

In [None]:
# Label distribution on each side of a single event-count cut-off:
# first users at or below the threshold, then users above it.
event_threshold = 21
df_grouped.loc[df_grouped["count_event"] <= event_threshold, "Fake"].value_counts()
df_grouped.loc[df_grouped["count_event"] > event_threshold, "Fake"].value_counts()

<div class="alert alert-block alert-info">
    <b>NOTE:</b>
    <p>
        We can observe that <b>Fake users</b> and <b>No-Fake users</b> can be separated using just 2 features. Let's build a model using these features!
    </p>
</div>