In [None]:
import json
import datetime
import os
import time
import sys

import pandas as pd
import numpy as np
from sklearn import preprocessing

import matplotlib.pyplot as plt

sys.path.append('..')
from preprocessing import load, process, preprocess_and_save
from aggregation import ohe_explicit

# Loading the data

The file *preprocessing.py* contains functions to load the JSON data, convert it to Pandas DataFrames, and perform basic preprocessing (e.g., deleting columns that are constant).

In [None]:
# Read a capped sample of each split (100,000 training rows and 10,000 test
# rows) and pass both frames straight to process().
# Disabled alternative, kept for reference:
# preprocess_and_save("../data", nrows_train=100000, nrows_test=10000)
train, test = process(
    load("../data/train.csv", nrows=100000),
    load("../data/test.csv", nrows=10000),
)

In [None]:
# Call preprocess_and_save on the ../data directory with the same row caps
# used when loading above (100,000 training rows, 10,000 test rows).
# NOTE(review): presumably this writes the processed frames to disk -- confirm
# in preprocessing.py.
preprocess_and_save("../data", nrows_train=100000, nrows_test=10000)

# Categoricals with many values

There are a number of categorical features with many different values. This is an issue, specifically in the cases where those categorical features are not ordinal, and therefore label encoding them does not make sense. The only choice we are left with is one-hot encoding (OHE). However, naively performing this step would add hundreds of columns for each original categorical feature, potentially yielding thousands of extremely sparse features in the end. What I would like to explore is whether there exist **specific values** with predictive value significantly higher than average. For example, the particular country might, in general, be a weak predictor. However, there might be 5 specific countries with a huge revenue deviation from the average (and enough samples to consider this discrepancy statistically significant).

In [None]:
# Pull out the country column and report how many distinct values it holds.
countries = train['country']
distinct_country_count = len(countries.unique())
print("There are {} different countries in our dataset".format(distinct_country_count))

In [None]:
# For every country: the mean of the target and the number of rows behind it.
aggregations = {'target': ['mean', 'count']}

countries = (
    train[["country", "target"]]
    .groupby("country", as_index=False)
    .agg(aggregations)
)
# Replace the column labels produced by agg() with three flat names.
countries.columns = ["country", "targetMean", "occurenceCount"]

# Only the `keep` countries with the most rows are plotted, so that each bar
# is backed by a reasonable number of samples.
keep = 10
global_average = train["target"].mean()

usual_countries = (
    countries
    .sort_values("occurenceCount", ascending=False)
    .head(keep)
    # Deviation = country mean minus the mean over the whole training sample.
    .assign(deviation=lambda top: top["targetMean"] - global_average)
)
usual_countries.plot.bar(x="country", y="deviation")

# USA seems quite different

What we find here is that the USA is very different from any other country with a significant sample count. What can we do with this information? Instead of using OHE to encode every country in our dataset, we can probably get away with a single boolean column: **is this record coming from the USA?**

In [None]:
# Pull out the city column and report how many distinct values it holds.
cities = train['city']
distinct_city_count = len(cities.unique())
print("There are {} different cities in our dataset".format(distinct_city_count))

In [None]:
# For every city: the mean of the target and the number of rows behind it.
aggregations = {'target': ['mean', 'count']}

cities = (
    train[["city", "target"]]
    .groupby("city", as_index=False)
    .agg(aggregations)
)
# Replace the column labels produced by agg() with three flat names.
cities.columns = ["city", "targetMean", "occurenceCount"]

# Only the `keep` cities with the most rows are plotted, so that each bar is
# backed by a reasonable number of samples.
keep = 10
global_average = train["target"].mean()

usual_cities = (
    cities
    .sort_values("occurenceCount", ascending=False)
    .head(keep)
    # Deviation = city mean minus the mean over the whole training sample.
    .assign(deviation=lambda top: top["targetMean"] - global_average)
)
usual_cities.plot.bar(x="city", y="deviation")

# What about the cities?

Here we can see some pretty strong deviations. However, we need to note that the top ones come from US cities, so part of the variance these cities explain is already included in the information that they belong to the USA. Still, their deviation is considerably higher than that of the USA alone (1.0 vs 0.3), so including those columns might be beneficial. The deviation we see is actually very distorted because of the USA outlier. Perhaps it would make more sense to focus only on the deviation from the average of non-USA cities.

In [None]:
# One boolean mask decides the split: rows whose country is exactly
# "United States" go to train_us, every other row to train_not_us.
is_us = train['country'] == "United States"
train_us = train[is_us]
train_not_us = train[~is_us]

# Mean target inside each half.
us_avg = train_us["target"].mean()
outside_us_avg = train_not_us['target'].mean()

report = "Average in US: {}\nAverage outside US: {}"
print(report.format(us_avg, outside_us_avg))

**Let's then repeat our analysis, but this time separately for each of the two subsets of the data**

In [None]:
# Kept at notebook level because other cells use the same name.
aggregations = {'target':['mean', 'count']}


def city_deviation(df, title="Deviation per city", keep=10):
    """Bar-plot how far the most frequent cities deviate from the mean of *df*.

    For each city in *df* the mean of ``target`` and the number of rows are
    computed. The ``keep`` cities with the most rows are retained, and for
    each of them the difference between the city mean and the mean of
    ``target`` over all of *df* is drawn as a bar.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``"city"`` and ``"target"``.
    title : str
        Title of the generated plot.
    keep : int
        Number of cities (ranked by row count, descending) to show.
        Defaults to 10, the value that used to be hard-coded.

    Returns
    -------
    None
        The plot is produced as a side effect.
    """
    # Defined locally so the function does not silently depend on a
    # notebook-level variable that other cells reassign.
    city_aggregations = {"target": ["mean", "count"]}

    cities = df[["city", "target"]].groupby("city", as_index=False).agg(city_aggregations)
    # Replace the column labels produced by agg() with three flat names.
    cities.columns = ["city", "targetMean", "occurenceCount"]

    # Keep only the cities with the most rows so that each bar is backed by a
    # reasonable number of samples. Work on an explicit copy because a column
    # is added to the result below.
    usual_cities = cities.sort_values("occurenceCount", ascending=False).head(keep).copy()

    # The reference is the mean of the frame passed in (e.g. US-only rows),
    # not the mean of the whole training set.
    subset_average = df["target"].mean()
    usual_cities["deviation"] = usual_cities["targetMean"] - subset_average

    ax = usual_cities.plot.bar(x="city", y="deviation", title=title, rot=45, legend=False)
    ax.set_xlabel("City")
    ax.set_ylabel("Deviation")


city_deviation(train_us, title="Deviation from the mean - US")
city_deviation(train_not_us, title="Deviation from the mean - Rest of the world")

## Much better!

Now we can clearly see what information should be included besides the country (or, to be exact, whether or not the country is the US). For example, it makes no sense to include Los Angeles or Mountain View even though they deviate from the global average, because all of this deviation is explained by the fact that they are in the US! Instead, we should include Chicago, New York, Austin, Seattle and maybe Palo Alto. And, as we can see, the deviations are much smaller outside the US, with the exception of Toronto, which MUST be included.

### Food for thought
It makes sense that deviations outside the US are smaller, because the target itself is considerably lower there. Perhaps we should look at relative deviations instead?

In [None]:
# Apply ohe_explicit (imported from aggregation.py) to the training frame and
# inspect the start of the result.
# NOTE(review): presumably ohe_explicit returns a DataFrame with the explicit
# one-hot columns discussed above -- confirm against aggregation.py.
check = ohe_explicit(train)
check.head()