## Import packages

In [3]:
from __future__ import division

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from imblearn.over_sampling import ADASYN, SMOTE

ImportError: No module named numpy

## Load datasets: train and test

In [None]:
# Read the training data and the test data from their CSV files.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../../../../data/train/train.csv",
        "../../../../../data/test/test.csv",
    )
)

In [None]:
# Report the (rows, columns) shape of each dataset, train first.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Preview the first five rows of the training data.
train.head(5)

In [None]:
# Preview the first five rows of the test data.
test.head(5)

In [None]:
def _read_feature_names(path):
    """Return the comma-separated feature names stored in the file at ``path``.

    Whitespace around each name (including a trailing newline at the end of
    the file) is stripped so every entry can be used directly as a DataFrame
    column label.
    """
    # The context manager closes the file even when reading raises.
    with open(path, "r") as handle:
        return [name.strip() for name in handle.read().split(",")]


continuous_columns = _read_feature_names("../../../resources/continuousFeatures")
categorical_columns = _read_feature_names("../../../resources/categoricalFeatures")
yes_no_columns = _read_feature_names("../../../resources/yesNoFeaturesNames")

print("Number of numerical columns: {0}".format(len(continuous_columns)))
print("Number of categorical columns: {0}".format(len(categorical_columns)))
print("Number of yes/no columns: {0}".format(len(yes_no_columns)))

In [None]:
# Count how many training rows fall into each Target class.
category, category_count = np.unique(train["Target"], return_counts=True)
# zip() is lazy on Python 3, so it is materialised before formatting;
# otherwise only the zip object's repr would be printed.
print("(Category, Count): {0}".format(list(zip(category, category_count))))

## Fill Yes/No values

In [None]:
# yes=1 and no=0
def fill_yes_no(value):
    """Convert a yes/no cell to a float.

    ``"yes"`` becomes ``1.0`` and ``"no"`` becomes ``0.0``; any other value
    is handed to ``float`` and the result is returned.
    """
    for word, number in (("yes", 1.0), ("no", 0.0)):
        if value == word:
            return number
    return float(value)

In [None]:
def _encode_yes_no(frame, columns):
    """Return a copy of ``frame`` with ``columns`` passed through ``fill_yes_no``.

    The converted columns are placed after the untouched ones and the index
    labels are cast to ``str``, which is the layout the earlier
    add / drop / rename sequence produced.
    """
    encoded = frame.drop(columns, axis=1)
    for column in columns:
        # Series.map converts one column element by element, avoiding the
        # cost of materialising every row as a Series with apply(axis=1).
        encoded[column] = frame[column].map(fill_yes_no)
    return encoded.rename(index=str)


train = _encode_yes_no(train, yes_no_columns)
test = _encode_yes_no(test, yes_no_columns)

## Fill NaN values

In [None]:
# Per-column flag: does the training column contain at least one null?
# NOTE(review): null_columns is reassigned in the next cell before this
# value is used anywhere.
null_columns = train.isnull().any()

In [None]:
# Names of the training columns that hold at least one missing value
# (expected to be continuous variables -- TODO confirm).
has_missing = train.isnull().any()
null_columns = train.columns[has_missing].tolist()
null_columns

In [None]:
# Show Target next to a hard-coded list of columns for the first rows.
# NOTE(review): presumably these are the columns listed in null_columns -- confirm.
train[["Target","v2a1", 'v18q1', 'rez_esc', 'meaneduc', 'SQBmeaned']].head()

In [None]:
# Mean of each training column that contains missing values; these means
# are used below to fill the gaps in both train and test.
label_mean = train[null_columns].mean()

In [None]:
# Display the computed column means.
label_mean

In [None]:
def _fill_missing_with_means(frame, columns, means):
    """Return a copy of ``frame`` with NaNs in ``columns`` replaced by ``means``.

    ``means`` is looked up by column name. The filled columns are placed
    after the untouched ones and the index labels are cast to ``str``, which
    is the layout the earlier add / drop / rename sequence produced.
    """
    filled = frame.drop(columns, axis=1)
    for column in columns:
        # Column-wise fillna replaces the row-by-row apply/np.isnan lambda.
        filled[column] = frame[column].fillna(means[column])
    return filled.rename(index=str)


# The training-set means fill both frames, so no statistic is computed
# from the test data.
train = _fill_missing_with_means(train, null_columns, label_mean)
test = _fill_missing_with_means(test, null_columns, label_mean)

## Standard scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous features. The scaler is fitted on the training
# data only and then applied unchanged to both train and test.
estimator = StandardScaler(copy=True, with_mean=True, with_std=True)
model = estimator.fit(train[continuous_columns])
X_scaled, x_test_scaled = (
    model.transform(frame[continuous_columns]) for frame in (train, test)
)

## Train - Validation split

In [None]:
# Hold out 20% of the rows for validation, stratified on Target so both
# parts keep the same class proportions.
target = train["Target"]
X_train, X_validation, y_train, y_validation = train_test_split(
    X_scaled,
    target,
    test_size=0.2,
    stratify=target,
)

In [None]:
# Shapes of the features and labels for the training and validation parts.
for features, labels in ((X_train, y_train), (X_validation, y_validation)):
    print(features.shape, labels.shape)

In [None]:
# Class distribution of the training part, then of the validation part.
for labels in (y_train, y_validation):
    category, category_count = np.unique(labels, return_counts=True)
    # zip() is lazy on Python 3, so it is materialised before formatting;
    # otherwise only the zip object's repr would be printed.
    print("(Category, Count): {0}".format(list(zip(category, category_count))))

## Logistic Regression Classifier

In [None]:
def logistic_regression_prediction(reg_param, X_train, y_train, X_test):
    """Fit an L2-penalised logistic regression and predict on two datasets.

    Parameters
    ----------
    reg_param : value passed to ``LogisticRegression`` as ``C``.
    X_train, y_train : features and labels the classifier is fitted on.
    X_test : features of the second dataset to predict.

    Returns
    -------
    A pair ``(predictions for X_train, predictions for X_test)``.
    """
    classifier = LogisticRegression(penalty="l2", C=reg_param).fit(X_train, y_train)
    train_predictions = classifier.predict(X_train)
    test_predictions = classifier.predict(X_test)
    return train_predictions, test_predictions

In [None]:
def all_score(y_true, y_prediction):
    """Return ``(accuracy, precision, recall, f1)`` for the predictions.

    Precision, recall and F1 are computed with ``average="macro"``.
    """
    macro_metrics = tuple(
        metric(y_true, y_prediction, average="macro")
        for metric in (precision_score, recall_score, f1_score)
    )
    return (accuracy_score(y_true, y_prediction),) + macro_metrics

In [None]:
def print_score(accuracy, precision, recall, f1):
    """Print the four metrics, one per line, each preceded by its name."""
    labelled_values = (
        ('Accuracy:', accuracy),
        ('Precision:', precision),
        ('Recall:', recall),
        ('F1:', f1),
    )
    for label, value in labelled_values:
        print(label, value)

## Classification: Logistic Regression Classifier

In [None]:
# Baseline on the original (unbalanced) split; reg_param is forwarded to
# LogisticRegression as C.
prediction_train, prediction_validation = logistic_regression_prediction(
    reg_param=1e7,
    X_train=X_train,
    y_train=y_train,
    X_test=X_validation,
)

In [None]:
# Score the baseline predictions on the validation part and print them.
validation_scores = all_score(y_validation, prediction_validation)
accuracy, precision, recall, f1 = validation_scores
print_score(*validation_scores)

In [None]:
# Confusion matrix of the baseline model: true labels first, predictions second.
confusion_mc = confusion_matrix(y_validation, prediction_validation)
# print is called as a function: the bare ``print x`` statement is a
# SyntaxError on Python 3, while this form runs on Python 2 and 3 alike.
print(confusion_mc)

In [None]:
# Wrap the confusion matrix in a DataFrame whose rows and columns are
# labelled 0..3, then draw it as an annotated heatmap.
class_ids = list(range(4))
df_cm = pd.DataFrame(confusion_mc, index=class_ids, columns=class_ids)

plt.figure(figsize=(6, 6))
sns.heatmap(df_cm, annot=True)

In [None]:
# Which classes the baseline predicted, followed by how often each occurred.
unique_elements, counts_elements = np.unique(
    prediction_validation, return_counts=True
)
for values in (unique_elements, counts_elements):
    print(values)

In [None]:
from sklearn.metrics import classification_report

target_names = ['class 1', 'class 2', 'class 3', 'class 4']
# classification_report takes (y_true, y_pred). Passing the predictions
# first swaps precision with recall and reports the support of the
# predictions instead of the true labels.
print(classification_report(y_validation, prediction_validation, target_names=target_names))

## Balanced datasets - ADASYN

In [None]:
# Oversample ONLY the training part of the existing split. Resampling the
# whole dataset before splitting lets synthetic points generated from
# validation rows end up in the training set, which inflates the
# validation scores.
ada = ADASYN()
X_ada, y_ada = ada.fit_sample(X_train, y_train)

print(X_ada.shape, y_ada.shape)

# Train on the balanced data; validate on the untouched hold-out rows.
X_ada_train, y_ada_train = X_ada, y_ada
X_ada_validation, y_ada_validation = X_validation, y_validation

prediction_ada_train, prediction_ada_validation = logistic_regression_prediction(1e7, X_ada_train, y_ada_train, X_ada_validation)

In [None]:
# Score the ADASYN-trained model on its validation set and print the result.
ada_validation_scores = all_score(y_ada_validation, prediction_ada_validation)
accuracy, precision, recall, f1 = ada_validation_scores
print_score(*ada_validation_scores)

In [None]:
# Confusion matrix of the ADASYN-trained model on its validation set.
confusion_mc = confusion_matrix(y_ada_validation, prediction_ada_validation)

# Label rows and columns 0..3 and draw the matrix as an annotated heatmap.
class_ids = list(range(4))
df_cm = pd.DataFrame(confusion_mc, index=class_ids, columns=class_ids)

plt.figure(figsize=(6, 6))
sns.heatmap(df_cm, annot=True)

## Balanced datasets - SMOTE

In [None]:
# Oversample ONLY the training part of the existing split. Resampling the
# whole dataset before splitting lets synthetic points generated from
# validation rows end up in the training set, which inflates the
# validation scores.
smo = SMOTE(kind="svm")
X_resampled, y_resampled = smo.fit_sample(X_train, y_train)

# Train on the balanced data; validate on the untouched hold-out rows.
X_smo_train, y_smo_train = X_resampled, y_resampled
X_smo_validation, y_smo_validation = X_validation, y_validation

prediction_smo_train, prediction_smo_validation = logistic_regression_prediction(1e7, X_smo_train, y_smo_train, X_smo_validation)

In [None]:
# Score the SMOTE-trained model on its validation set and print the result.
smo_validation_scores = all_score(y_smo_validation, prediction_smo_validation)
accuracy, precision, recall, f1 = smo_validation_scores
print_score(*smo_validation_scores)

In [None]:
# Confusion matrix of the SMOTE-trained model on its validation set.
confusion_mc = confusion_matrix(y_smo_validation, prediction_smo_validation)

# Label rows and columns 0..3 and draw the matrix as an annotated heatmap.
class_ids = list(range(4))
df_cm = pd.DataFrame(confusion_mc, index=class_ids, columns=class_ids)

plt.figure(figsize=(6, 6))
sns.heatmap(df_cm, annot=True)

In [None]:
from sklearn.metrics import classification_report

target_names = ['class 1', 'class 2', 'class 3', 'class 4']
# classification_report takes (y_true, y_pred). Passing the predictions
# first swaps precision with recall and reports the support of the
# predictions instead of the true labels.
print(classification_report(y_smo_validation, prediction_smo_validation, target_names=target_names))

## Hyper-parameter tuning

In [None]:
# Candidate values for the parameter handed to
# logistic_regression_prediction as reg_param.
reg_params = [1e-7, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]

# One list per metric and per dataset; entry i belongs to reg_params[i].
accuracy_scores = {"train": [], "validation": []}
precision_scores = {"train": [], "validation": []}
recall_scores = {"train": [], "validation": []}
f1_scores = {"train": [], "validation": []}

for reg_param in reg_params:
    # Progress line per candidate. The old label ("Number of neighbor") did
    # not describe this sweep, and the trailing-comma print it relied on
    # only suppresses the newline on Python 2.
    print("Regularization parameter: {0}".format(reg_param))
    prediction_smo_train, prediction_smo_validation = logistic_regression_prediction(
        reg_param, X_smo_train, y_smo_train, X_smo_validation
    )
    evaluations = (
        ("train", y_smo_train, prediction_smo_train),
        ("validation", y_smo_validation, prediction_smo_validation),
    )
    # Same bookkeeping for both datasets, train first.
    for split, y_true, y_prediction in evaluations:
        accuracy, precision, recall, f1 = all_score(y_true, y_prediction)
        accuracy_scores[split].append(accuracy)
        precision_scores[split].append(precision)
        recall_scores[split].append(recall)
        f1_scores[split].append(f1)

In [None]:
# Reciprocal of every candidate in reg_params; used as the x-axis of the
# score plots.
regs = [1.0 / reg for reg in reg_params]
regs

In [None]:
# 2x2 grid: one panel per metric, train (red) against validation (green),
# with a logarithmic x-axis.
fig, ax = plt.subplots(nrows=2, ncols=2)

panels = (
    (ax[0][0], "Accuracy", accuracy_scores),
    (ax[0][1], "F1", f1_scores),
    (ax[1][0], "precision", precision_scores),
    (ax[1][1], "recall", recall_scores),
)
for axis, title, scores in panels:
    axis.semilogx(regs, scores["train"], c="r", label="train")
    axis.semilogx(regs, scores["validation"], c="g", label="validation")
    axis.set_title(title)
    axis.legend()

In [None]:
# Refit a logistic regression with C=1000 on the SMOTE training data and
# predict the SMOTE validation set.
final_estimator = LogisticRegression(C=1000)
model = final_estimator.fit(X_smo_train, y_smo_train)
prediction_smo_validation = model.predict(X_smo_validation)

# Recompute the confusion matrix for THIS model. Without this line the
# heatmap below would show the matrix left over from an earlier cell.
confusion_mc = confusion_matrix(y_smo_validation, prediction_smo_validation)

# Label rows and columns 0..3 and draw the matrix as an annotated heatmap.
class_ids = list(range(4))
df_cm = pd.DataFrame(confusion_mc, index=class_ids, columns=class_ids)

plt.figure(figsize=(6, 6))
sns.heatmap(df_cm, annot=True)

In [None]:
from sklearn.metrics import classification_report

target_names = ['class 1', 'class 2', 'class 3', 'class 4']
# classification_report takes (y_true, y_pred). Passing the predictions
# first swaps precision with recall and reports the support of the
# predictions instead of the true labels.
print(classification_report(y_smo_validation, prediction_smo_validation, target_names=target_names))

In [None]:
# Predict Target for the scaled test features with the most recently fitted model.
prediction_kaggle = model.predict(x_test_scaled)

In [None]:
# Which classes were predicted for the test data, followed by how often
# each one occurred.
unique_elements, counts_elements = np.unique(
    prediction_kaggle, return_counts=True
)
for values in (unique_elements, counts_elements):
    print(values)

In [None]:
# Keep the Id column of the test data; it is paired with the predictions
# in the submission frame built below.
id_kaggle = test["Id"]

In [None]:
# Assemble the submission table: one row per test Id with its predicted Target.
d = dict(Id=id_kaggle, Target=prediction_kaggle)
data = pd.DataFrame(d)

In [None]:
# Preview the first five rows of the submission table.
data.head(5)

In [None]:
# Write the submission table to CSV without the index column.
# NOTE(review): the destination is a machine-specific absolute path; the
# call fails on any machine where that directory does not exist.
data.to_csv("/home/ubuntu/Documents/costa_rican_household_poverty/submission/sklearn/smote/logisticRegression.csv", index=False)