# Exploration of Quora dataset

In [None]:
import sys
sys.path.append("..")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("dark_background") # comment out if using light Jupyter theme

# Explicit column types: the question ID and text are read as strings,
# the label as an integer.
dtypes = {"qid": str, "question_text": str, "target": int}
# Load both splits from the data directory one level above this notebook.
train = pd.read_csv("../data/train.csv", dtype=dtypes)
test = pd.read_csv("../data/test.csv", dtype=dtypes)

## 1. A first glance

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Report the number of rows (questions) in each split.
train_count, test_count = len(train), len(test)
print("There are {} questions in train and {} in test".format(train_count, test_count))

In [None]:
# Collect the distinct label values to confirm the target is binary.
target_values = set(train["target"].unique())
print("Target value is binary (values: {})".format(target_values))

In [None]:
# With a 0/1 label, the sum is the number of toxic questions and the mean
# is their share of the training set.
toxic_count = train["target"].sum()
toxic_share = train["target"].mean()
print("Number of toxic questions in training data is {} (proportion: {}).".format(toxic_count, toxic_share))

## 2. A closer look at the questions

### 2.1 Question length (characters)

In [None]:
# Number of characters in each question, stored as a new column.
question_chars = train["question_text"].str.len()
train["text_length"] = question_chars
# Summary statistics of the character counts.
train["text_length"].describe()

Most questions are relatively short, i.e., fewer than 100 characters. There are some exceptions, however, with a maximum of more than a thousand characters. Let's see how many characters we should consider.

In [None]:
# For each candidate cut-off, report how many questions are longer than it,
# both as a count and as a percentage of the training set.
total_questions = len(train)
for cutoff in (100, 150, 200, 250, 300, 350, 500):
    exceeding = (train["text_length"] > cutoff).sum()
    percentage = np.round(exceeding / total_questions * 100, 2)
    print("There are {} questions ({}%) with more than {} characters."
          .format(exceeding, percentage, cutoff))

The number of questions with more than 250 characters is already small, and the number with more than 300 is negligible. We can truncate questions at 300 characters or even simply drop the longer ones. Is there a difference in length between toxic and sincere questions?

In [None]:
def split_on_target(data):
    """Split ``data`` by its ``target`` column.

    Returns a ``(sincere, toxic)`` tuple: the rows where ``target`` equals 0
    and the rows where ``target`` equals 1, in that order.
    """
    labels = data["target"]
    is_sincere = labels == 0
    is_toxic = labels == 1
    return data[is_sincere], data[is_toxic]

# Separate the training data into sincere (target == 0) and toxic (target == 1) rows.
sincere, toxic = split_on_target(train)

In [None]:
def plot_density_plots(sincere_data, toxic_data, column, xlim=(0, 300), bin_size=5):
    """Plot side-by-side distributions of ``column`` for sincere and toxic questions.

    Args:
        sincere_data: Data for the sincere questions, indexable by ``column``.
        toxic_data: Data for the toxic questions, indexable by ``column``.
        column: Name of the numeric column to plot.
        xlim: ``(low, high)`` range used for both the histogram bins and the
            x-axis, or ``None`` to let seaborn choose the bins and leave the
            axis limits automatic.
        bin_size: Width of each histogram bin; only used when ``xlim`` is given.
    """
    if xlim is None:
        # No range requested: fall back to seaborn's automatic binning.
        bins = None
    else:
        # np.arange excludes its stop value, so extend the stop by one bin
        # width; otherwise the last bin ending at xlim[1] is never drawn.
        bins = np.arange(xlim[0], xlim[1] + bin_size, bin_size)

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    panels = ((sincere_data, "Sincere questions"), (toxic_data, "Toxic questions"))
    for ax, (subset, title) in zip(axes, panels):
        # NOTE(review): distplot is deprecated in recent seaborn releases;
        # kept here so the notebook still runs on the version it was written for.
        sns.distplot(subset[column], ax=ax, bins=bins)
        ax.set_title(title)
        if xlim is not None:
            ax.set_xlim(xlim[0], xlim[1])
    fig.suptitle("Comparison of {} between sincere and toxic questions".format(column))
    plt.show()

# Compare the character-count distributions of sincere and toxic questions.
plot_density_plots(sincere, toxic, "text_length")

Toxic questions seem somewhat more likely to be longer, although the medians seem to be more or less the same. The numbers confirm this:

In [None]:
# Summary statistics of question length, one labelled column per class.
# Without `keys`, both columns would be named "text_length" and could not
# be told apart in the output.
pd.concat(
    [sincere["text_length"].describe(), toxic["text_length"].describe()],
    axis=1,
    keys=["sincere", "toxic"],
)

### 2.2 Question length (words)
A similar analysis can be done based on the number of _words_ per question, rather than the number of characters. To do this properly, we should probably first remove symbols and punctuation, but let's take a quick look.

In [None]:
# Count words by splitting on runs of whitespace. Splitting on a single " "
# would count an empty "word" for every repeated, leading or trailing space
# and would not split on tabs or newlines.
train["words"] = train["question_text"].str.split().str.len()
# Re-split so that the sincere/toxic frames include the new "words" column.
sincere, toxic = split_on_target(train)

In [None]:
# Compare the word-count distributions, using an x-range of 0-60 words and bins 2 words wide.
plot_density_plots(sincere, toxic, "words", xlim=(0, 60), bin_size=2)

The same conclusion seems to hold for the number of words. It is therefore useful to include the question length as a feature in our models. It also seems that there are not many questions with more than 50 or 60 words:

In [None]:
# Report how many questions exceed each candidate word-count cut-off.
word_counts = train["words"]
for word_limit in (50, 55, 60):
    over_limit = (word_counts > word_limit).sum()
    print("{} questions with more than {} words.".format(over_limit, word_limit))