### 1st level. Costa Rican Household Poverty Level Prediction

- [자료1](https://www.kaggle.com/willkoehrsen/a-complete-introduction-and-walkthrough)

#### Data set

- training set (9557, 143 = 142 + target)
- testing set (23856, 142)

<br />

- id: 식별자
- idhogar: household 식별자
- parentesco1: indicates if this person is the head of the household
- Target: 1(extreme poverty), 2(moderate poverty), 3(vulnerable households), 4(non vulnerable households)

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Global plotting/display configuration for the rest of the notebook.
plt.style.use("fivethirtyeight")  # matplotlib style sheet used by every figure
plt.rcParams["font.size"] = 18  # default font size for figure text
plt.rcParams["patch.edgecolor"] = 'k'  # black edges on patches such as bars
pd.options.display.max_columns = 150  # show up to 150 columns when displaying a DataFrame

In [None]:
# Load the competition's training and testing CSV files.
train_path = "../input/costa-rican-household-poverty-prediction/train.csv"
test_path = "../input/costa-rican-household-poverty-prediction/test.csv"
train, test = (pd.read_csv(path) for path in (train_path, test_path))

# Preview the first rows of the training frame.
train.head()

In [None]:
# Print the column/dtype/memory summary of the training frame.
train.info()

In [None]:
# Print the column/dtype/memory summary of the testing frame.
test.info()

#### Integer Columns

In [None]:
# For every int64 column count its distinct values, then tally how many
# columns share each distinct-value count and draw that tally as bars.
unique_per_int_column = train.select_dtypes(np.int64).nunique()
columns_per_cardinality = unique_per_int_column.value_counts().sort_index()
columns_per_cardinality.plot.bar(color="blue", figsize=(8, 6), edgecolor='k', linewidth=2)

plt.title("Count of Unique Values in Integer Columns")
plt.xlabel("Number of Unique Values")
plt.ylabel("Count")
plt.show()

#### Float Columns

In [None]:
from collections import OrderedDict

plt.figure(figsize=(20, 16))
plt.style.use("fivethirtyeight")

# Line colour and legend label for each Target value (1-4).
colors = OrderedDict({1: "red", 2: "orange", 3: "blue", 4: "green"})
poverty_mapping = OrderedDict({1: "extreme", 2: "moderate", 3: "vulnerable", 4: "non vulnerable"})

float_columns = list(train.select_dtypes("float"))
# Two plots per row. Keep the original four rows as a minimum, but grow the
# grid when there are more than eight float columns so plt.subplot never
# receives an index outside the grid.
n_rows = max(4, (len(float_columns) + 1) // 2)

# One subplot per float column, with one density curve per poverty level.
for i, col in enumerate(float_columns):
    ax = plt.subplot(n_rows, 2, i + 1)
    for poverty_level, color in colors.items():
        # Missing values are dropped before estimating the density.
        sns.kdeplot(train.loc[train.Target == poverty_level, col].dropna(),
                    ax=ax, color=color, label=poverty_mapping[poverty_level])

    plt.xlabel(f"{col}")
    plt.ylabel("Density")
    plt.title(f"{col.capitalize()} Distribution")
    plt.legend(loc="best")
plt.subplots_adjust(top=2)
plt.show()

필사의 결과와 달라짐.<br />
density 값이 달라지기만 하고 분포를 유지했다면 그냥 쓰려고 했을 텐데 생각보다 분포의 모양도 많이 달라졌다.<br />
분포의 모양이 달라진 것도 그렇고 최빈값이 달라진 것도 그렇고. 생각보다 좀 그래.

#### Object Columns

In [None]:
# Preview the first rows of the object-dtype columns.
train.select_dtypes("object").head()

- dependency: Dependency rate (number of members of the household younger than 19 or older than 64) / (number of members of the household between 19 and 64)
- edjefe: years of education of male head of household, based on the interaction of escolari (years of education), head of household and gender (1: yes, 0: no)
- edjefa: years of education of female head of household, based on the interaction of escolari (years of education), head of household and gender (1: yes, 0: no)

In [None]:
# Replace the literal "yes"/"no" entries with 1/0 and cast the three mixed
# columns to float, in both the training and the testing frame.
mapping = {"yes": 1, "no": 0}
for df in [train, test]:
    for column in ("dependency", "edjefa", "edjefe"):
        df[column] = df[column].replace(mapping).astype(np.float64)

In [None]:
# Summary statistics of the three columns that were just converted to float.
train[["dependency", "edjefa", "edjefe"]].describe()

In [None]:
# Density of each converted column, one curve per poverty level,
# stacked vertically in a 3x1 grid.
converted_columns = ["dependency", "edjefa", "edjefe"]
plt.figure(figsize=(16, 12))

for i, col in enumerate(converted_columns):
    ax = plt.subplot(3, 1, i + 1)
    for poverty_level, color in colors.items():
        # Values of this column for the current poverty level, without NaNs.
        level_values = train.loc[train.Target == poverty_level, col].dropna()
        sns.kdeplot(level_values, ax=ax, color=color,
                    label=poverty_mapping[poverty_level])

    plt.title(f"{col.capitalize()} Distribution")
    plt.xlabel(f"{col}")
    plt.ylabel("Density")
    plt.legend(loc="best")

plt.subplots_adjust(top=2)
plt.show()

역시 그림이 다르다.<br />
dependency에 대한 데이터가 덜 풍부해졌거나 소음이 제거되었거나 둘 중 하나

In [None]:
# Give the testing frame an all-NaN Target column so both frames share the
# same columns before they are stacked.
test["Target"] = np.nan
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same
# row-wise stack with a fresh 0..n-1 index.
data = pd.concat([train, test], ignore_index=True)

### Exploring Label Distribution

In [None]:
# Row masks: household heads, and rows that carry a label.
is_head = data["parentesco1"] == 1
has_label = data["Target"].notnull()

# All head-of-household rows (independent copy).
heads = data.loc[is_head].copy()
# Labelled heads only, reduced to the label and the household id.
train_labels = data.loc[has_label & is_head, ["Target", "idhogar"]]
# Number of labelled heads per Target value, ordered by Target.
label_counts = train_labels["Target"].value_counts().sort_index()

In [None]:
# Display the per-Target counts computed in the previous cell.
label_counts

In [None]:
# Bar chart of the label counts, one colour per poverty level.
# Bars sit at positions 0..3, hence the shift of the 1..4 keys by one.
tick_positions = [level - 1 for level in poverty_mapping]
tick_labels = list(poverty_mapping.values())

label_counts.plot.bar(figsize=(8, 6), color=colors.values(), edgecolor='k', linewidth=2)

plt.title("Poverty Level Breakdown")
plt.xlabel("Poverty Level")
plt.ylabel("Count")
plt.xticks(tick_positions, tick_labels, rotation=60)
plt.show()

#### Identify Errors

In [None]:
# Per household: True when every member carries the same Target.
targets_by_household = train.groupby("idhogar")["Target"]
all_equal = targets_by_household.apply(lambda labels: labels.nunique() == 1)
# Households whose members disagree on the Target.
not_equal = all_equal[~all_equal]

mismatch_count = len(not_equal)
print("There are {} households where the family members do not all have the same target."
      .format(mismatch_count))

In [None]:
# Show household id, head flag and label for the first inconsistent household.
train[train.idhogar == not_equal.index[0]][["idhogar", "parentesco1", "Target"]]

#### Families without Heads of Household

In [None]:
# Sum of the head flag per household; a sum of 0 means nobody is marked as head.
households_leader = train.groupby("idhogar")["parentesco1"].sum()
headless_ids = households_leader[households_leader == 0].index

# Every training row that belongs to a household without a head.
households_no_head = train[train["idhogar"].isin(headless_ids)]

headless_count = households_no_head.idhogar.nunique()
print("There are {} households without a head.".format(headless_count))

In [None]:
# For each headless household, check whether all members share one Target.
headless_targets = households_no_head.groupby("idhogar")["Target"]
households_no_head_equal = headless_targets.apply(lambda labels: labels.nunique() == 1)

inconsistent_count = (households_no_head_equal == False).sum()
print("{} Households with no head have different labels."
      .format(inconsistent_count))

#### Correct Errors

In [None]:
# Give every member of an inconsistent household the Target of its head.
for household in not_equal.index:
    in_household = train["idhogar"] == household
    head_targets = train.loc[in_household & (train["parentesco1"] == 1.0), "Target"]
    if head_targets.empty:
        # No head row to take the label from; leave this household untouched.
        continue
    # Take the value explicitly: int() on a one-element Series is deprecated
    # in recent pandas releases.
    true_target = int(head_targets.iloc[0])
    train.loc[in_household, "Target"] = true_target

In [None]:
# Re-run the consistency check after the correction.
household_targets = train.groupby("idhogar")["Target"]
all_equal = household_targets.apply(lambda labels: labels.nunique() == 1)
not_equal = all_equal[~all_equal]

remaining = len(not_equal)
print("There are {} households where the family members do not all have the same target."
      .format(remaining))

### Missing Variables

In [None]:
# Count of missing values per column, and the same count as a fraction of all rows.
missing = data.isnull().sum().to_frame("total")
missing["percent"] = missing["total"] / len(data)

# Ten columns with the highest missing share; the Target row is removed from the display.
most_missing = missing.sort_values("percent", ascending=False).head(10)
most_missing.drop("Target")

v18q1: Number of tablets<br />
household에 tablet 수는 왜? 정말 왜? tablet에 내가 모르는 뜻이라도 있는 건가?

#### Function of Plot Value Counts

In [None]:
def plot_value_counts(df, col, heads_only=False):
    """Draw a bar chart of the value counts of one column.

    Args:
        df: DataFrame holding the column to plot.
        col: Name of the column whose values are counted.
        heads_only: When true, restrict `df` to rows where `parentesco1 == 1`
            before counting.
    """
    if heads_only:
        # The original filtered on the misspelled column "percentesco1",
        # which raised KeyError; the flag used across the notebook is "parentesco1".
        df = df.loc[df["parentesco1"] == 1].copy()

    plt.figure(figsize=(8, 6))
    # Sort by value (not by frequency) so the bars follow the natural order of `col`.
    df[col].value_counts().sort_index().plot.bar(color="blue", edgecolor='k', linewidth=2)

    plt.xlabel(f"{col}")
    plt.ylabel("Count")
    plt.title(f"{col} Value Counts")
    plt.show()

In [None]:
# Value counts of v18q1 among household heads.
plot_value_counts(heads, "v18q1")

In [None]:
# Number of missing v18q1 values among heads, split by the value of v18q.
heads.groupby("v18q")["v18q1"].apply(lambda x: x.isnull().sum())

In [None]:
# Replace missing v18q1 values with 0.
data["v18q1"] = data["v18q1"].fillna(0)

v2a1: Monthly rent payment

In [None]:
# Columns whose name starts with "tipo".
own_variables = [column for column in data if column.startswith("tipo")]

# Among rows with a missing v2a1, sum each of those columns and draw the sums as bars.
rent_missing = data["v2a1"].isnull()
ownership_totals = data.loc[rent_missing, own_variables].sum()
ownership_totals.plot.bar(figsize=(10, 8), color="green", edgecolor='k', linewidth=2)

ownership_labels = ["Owns and Paid Off", "Owns and Paying", "Rented",
                    "Precarious", "Other"]
plt.xticks([0, 1, 2, 3, 4], ownership_labels, rotation=60)
plt.title("Home Ownership Status for Households Missing Rent Payments", size=18)
plt.show()

tipovivi1, =1 own and fully paid house<br />
tipovivi2, "=2 own, paying in installments"<br />
tipovivi3, =1 rented<br />
tipovivi4, =1 precarious<br />
tipovivi5, "=1 other(assigned, borrowed)"

In [None]:
# Rows flagged tipovivi1 == 1 get a v2a1 of 0.
fully_owned = data["tipovivi1"] == 1
data.loc[fully_owned, "v2a1"] = 0

# Flag the rows where v2a1 is still missing, then show how many there are.
data["v2a1-missing"] = data["v2a1"].isnull()
data["v2a1-missing"].value_counts()

In [None]:
# Age statistics for rows where rez_esc is present.
data.loc[data["rez_esc"].notnull()]["age"].describe()

In [None]:
# Age statistics for rows where rez_esc is missing.
data.loc[data["rez_esc"].isnull()]["age"].describe()

In [None]:
# Missing rez_esc is set to 0 for anyone older than 19 or younger than 7.
age_outside_7_to_19 = (data["age"] > 19) | (data["age"] < 7)
rez_esc_is_null = data["rez_esc"].isnull()
data.loc[age_outside_7_to_19 & rez_esc_is_null, "rez_esc"] = 0

# Flag whatever is still missing after that fill.
data["rez_esc-missing"] = data["rez_esc"].isnull()

In [None]:
# Cap rez_esc at 5: any larger value is replaced with 5.
data.loc[data["rez_esc"] > 5, "rez_esc"] = 5

### Plot Two Categorical Variables

In [None]:
def plot_categoricals(x, y, data, annotate=True):
    """Draw a bubble chart of how often each value of `x` occurs within each value of `y`.

    Every (x, y) combination present in `data` becomes one marker whose size
    `s` is 100 * sqrt(raw count). When `annotate` is true, each marker is
    labelled with the percentage of its `y` group that takes that `x` value.

    Args:
        x: Name of the column drawn on the horizontal axis.
        y: Name of the column drawn on the vertical axis and used for grouping.
        data: DataFrame containing both columns.
        annotate: Whether to write the per-group percentage next to each marker.
    """
    # Name the counted Series explicitly. The default name of a grouped
    # value_counts() differs between pandas versions, so renaming the column
    # afterwards (as the original did) silently does nothing on newer pandas.
    counts = (data.groupby(y)[x].value_counts(normalize=False)
              .rename("raw_count").reset_index())
    # Share within each y group, derived from the raw counts so both columns
    # are guaranteed to be row-aligned.
    group_totals = counts.groupby(y)["raw_count"].transform("sum")
    counts["normalized_count"] = counts["raw_count"] / group_totals
    counts["percent"] = 100 * counts["normalized_count"]

    # Legend marker sizes, spread between the smallest and largest sqrt(count).
    sqr_min = int(np.sqrt(counts["raw_count"].min()))
    sqr_max = int(np.sqrt(counts["raw_count"].max()))
    # range() rejects a zero step, which the plain integer division yields
    # whenever the spread is below 5.
    step = max(1, (sqr_max - sqr_min) // 5)
    # Fall back to a single legend entry when min and max coincide.
    msizes = list(range(sqr_min, sqr_max, step)) or [sqr_max]

    plt.figure(figsize=(14, 10))

    plt.scatter(counts[x], counts[y], edgecolor='k', color="lightgreen",
                s=100 * np.sqrt(counts["raw_count"]), marker='o', alpha=0.6, linewidth=1.5)

    # Empty scatters serve only as legend handles; the label is the squared
    # size rounded to the nearest hundred.
    markers = []
    for size in msizes:
        markers.append(plt.scatter([], [], s=100 * size,
                                   label=f"{int(round(np.square(size) / 100) * 100)}",
                                   color="lightgreen", alpha=0.6, edgecolor='k', linewidth=1.5))

    if annotate:
        x_offset = 1 / counts[x].nunique()
        y_offset = 0.15 / counts[y].nunique()
        for _, row in counts.iterrows():
            # The text is passed positionally: the `s=` keyword used by the
            # original is no longer accepted by current matplotlib.
            plt.annotate(f"{round(row['percent'], 1)}%",
                         xy=(row[x] - x_offset, row[y] - y_offset),
                         color="navy")

    plt.annotate("* Size represents raw count while % is for a given y value.",
                 xy=(0, 1), xycoords="figure points", size=10)

    # Pad both axes so the outermost markers are not clipped.
    plt.xticks(counts[x].unique())
    plt.xlim((counts[x].min() - (6 / counts[x].nunique()),
              counts[x].max() + (6 / counts[x].nunique())))
    plt.xlabel(f"{x}")

    plt.yticks(counts[y].unique())
    plt.ylim((counts[y].min() - (4 / counts[y].nunique()),
              counts[y].max() + (4 / counts[y].nunique())))
    plt.ylabel(f"{y}")

    plt.grid(None)
    plt.legend(handles=markers, title="Counts", labelspacing=3, handletextpad=2,
               fontsize=16, loc=(1.10, 0.19))
    plt.title(f"{y} vs {x}")
    plt.show()

In [None]:
# rez_esc against Target, with percentage annotations.
plot_categoricals("rez_esc", "Target", data)

In [None]:
# escolari against Target, without percentage annotations.
plot_categoricals("escolari", "Target", data, annotate=False)

In [None]:
# Target value counts for rows flagged as missing rez_esc.
plot_value_counts(data[(data["rez_esc-missing"] == 1)], "Target")

In [None]:
# Target value counts for rows flagged as missing v2a1.
plot_value_counts(data[(data["v2a1-missing"] == 1)], "Target")

living point: 때론 결측값도 결측값 자체로 중요한 정보가 되기도 한다.

## Feature Engineering

### Column Definitions

#### Define Variable Categories

In [None]:
id_