#### Installing Kaggle

In [3]:
#!pip install kaggle

##### Downloading Datasets using Kaggle API

In [4]:
# One-time setup, left commented out so that re-running the notebook does not
# repeat it: download the competition archives, unpack them, delete the
# archives, and make the extracted CSVs readable.
#!kaggle competitions download -c ds4-predictive-modeling-challenge
#!unzip train_f*.zip
#!unzip test*.zip
#!unzip train_l*.zip
#!rm *.zip
#!chmod +r test_f*.csv
#!chmod +r train*.csv
#!chmod +r train_l*.csv
# List the CSV files present in the working directory.
!ls *.csv

sample_submission.csv train_features.csv
test_features.csv     train_labels.csv


#### Loading CSVs into a Data Frame

In [5]:
import pandas as pd

In [6]:
# Read the three competition CSVs from the working directory, in this order:
# training features, test features, training labels.
train_features, test_features, train_labels = (
    pd.read_csv(csv_name)
    for csv_name in ('train_features.csv', 'test_features.csv', 'train_labels.csv')
)

#### DF we will work with

We will ignore `test_features` for now, since it is not part of the process of creating a working model.

In [89]:
# Name of the target column inside train_labels.csv.
label = 'status_group'

# Training inputs and their labels, read straight from disk.
features, train_labels = map(pd.read_csv, ('train_features.csv', 'train_labels.csv'))

#### Validation and holdout sets

The question is whether we should pick our validation and holdout sets randomly or selectively:
https://www.fast.ai/2017/11/13/validation-sets/
https://www.slideshare.net/OwenZhang2/tips-for-data-science-competitions/8

For this problem, we will go with a random split.

In [90]:
from sklearn.model_selection import train_test_split as tts

In [91]:
# First split: 60% of the rows go to training, the remaining 40% are kept for
# validation + holdout. Stratify on the target column so both parts keep the
# same class proportions (the previous `stratify=labels` referenced a name
# that is never defined and raised NameError on a fresh kernel).
x_train, x_val, y_train, y_val = tts(features,
                                     train_labels[label],
                                     train_size=0.60,
                                     test_size=0.40,
                                     stratify=train_labels[label],
                                     random_state=42
                                    )

In [92]:
# Second split: carve the 40% remainder into validation (60% of it, i.e. 24%
# of all rows) and holdout (40% of it, i.e. 16% of all rows), stratified on
# the target. NOTE: x_val / y_val are both input and output here, so running
# this cell a second time splits the already-reduced validation set again.
x_val, x_hold, y_val, y_hold = tts(
    x_val, y_val,
    train_size=0.60, test_size=0.40,
    stratify=y_val, random_state=42,
)

## The 3 datasets we will be working with

In [96]:
import numpy as np

In [93]:
# The three splits in pipeline order: train, validation, holdout.
dfs = [x_train, x_val, x_hold]

# Row count of each split. This has to be the last expression of the cell for
# the notebook to display it; placed before the assignment it was computed
# and silently discarded.
x_train.shape[0], x_val.shape[0], x_hold.shape[0]

In [99]:
def wrangle(df, impute='mean'):
    """Return a cleaned copy of a raw feature frame; the input is not modified.

    Steps, in order:
      1. Treat the near-zero latitude ``-2e-08`` and literal zeros in
         ``construction_year``, ``longitude`` and ``latitude`` as missing and
         fill them with a statistic of the remaining values of that column.
      2. Parse ``date_recorded`` as datetimes and add its year as
         ``year_recorded``.
      3. Drop ``quantity_group``.
      4. Fill missing values in the remaining non-numeric, non-datetime
         columns with the string ``'MISSING'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``latitude``, ``longitude``,
        ``construction_year``, ``date_recorded`` and ``quantity_group``.
    impute : {'mean', 'median'}, default 'mean'
        Statistic used to fill the zero/missing numeric values. The default
        reproduces the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    ValueError
        If ``impute`` is not ``'mean'`` or ``'median'``.

    Notes
    -----
    The fill statistic is computed from the frame that is passed in, so each
    split is imputed with its own statistic.
    """
    if impute not in ('mean', 'median'):
        raise ValueError(
            "impute must be 'mean' or 'median', got {!r}".format(impute)
        )

    df = df.copy()

    # Special case of near 0 value instead of 0
    df['latitude'] = df['latitude'].replace(-2e-08, 0)

    # 0s in these columns are meant as NaN.
    for col in ['construction_year', 'longitude', 'latitude']:
        observed = df[col].replace(0, np.nan)
        fill_value = observed.mean() if impute == 'mean' else observed.median()
        df[col] = observed.fillna(fill_value)

    # Convert date_recorded to datetime. `infer_datetime_format` is omitted:
    # it is deprecated since pandas 2.0, where format inference is the default.
    df['date_recorded'] = pd.to_datetime(df['date_recorded'])

    # Extract year from date_recorded
    df['year_recorded'] = df['date_recorded'].dt.year

    # quantity & quantity_group are duplicates, so drop one
    df = df.drop(columns='quantity_group')

    # For categoricals with missing values, fill with the category 'MISSING'.
    # Datetime columns are left out: a string placeholder is not a valid
    # datetime value.
    categoricals = df.select_dtypes(exclude=['number', 'datetime']).columns
    for col in categoricals:
        df[col] = df[col].fillna('MISSING')

    return df

## Stage 1, training df

#### Wrangling

In [113]:
# Number of splits (taken from the front of `dfs`) to wrangle at this stage;
# stage 1 covers the training split only.
stage = 1
wrangled = [wrangle(frame) for frame in dfs[:stage]]

# From here on x_train is the wrangled training frame.
x_train = wrangled[0]

#### Splitting categorical and continuous features

In [118]:
# Work on the training columns without the id column.
x_train_c = x_train.drop(columns=['id'])

# Every numeric column is kept as a feature.
numeric_features = list(x_train_c.select_dtypes(include='number').columns)

# Number of distinct values per non-numeric column.
cardinality = x_train_c.select_dtypes(exclude='number').nunique()

# Non-numeric columns are kept only when they have at most 50 distinct values.
categorical_features = [
    name for name, n_unique in cardinality.items() if n_unique <= 50
]

# Final feature list: numeric columns first, then the kept categoricals.
features = [*numeric_features, *categorical_features]

# Training frame restricted to those features, ready for pre-processing.
x_train = x_train.loc[:, features]

#### Categorical Encoding

In [119]:
import category_encoders as ce

In [120]:
# One Hot encode categoricals of train
encoder = ce.OneHotEncoder(use_cat_names=True)
x_train_e = encoder.fit_transform(x_train)

#### Scaling and Standardizing

In [121]:
from sklearn.preprocessing import RobustScaler

In [122]:
# Scaler: fit_transform on train, transform on val & test
scaler = RobustScaler()
x_train_s = scaler.fit_transform(x_train_e)

#### Creating a Logistic Regression model

In [123]:
from sklearn.linear_model import LogisticRegression

In [124]:
# Fit a logistic regression on the scaled training matrix; fit() returns the
# estimator itself, so the fitted model can be bound in one statement.
model = LogisticRegression(
    solver='lbfgs', multi_class='auto', n_jobs=-1
).fit(x_train_s, y_train)

# Display the fitted estimator and its parameters.
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto', n_jobs=-1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

#### The quick win

In [145]:
def pre_processing(df):
    """Turn a raw feature frame into the matrix that ``model`` was trained on.

    The frame is cleaned with ``wrangle`` and then passed through the
    artefacts that were fitted on the training split: the notebook-level
    ``features`` column list, the one-hot ``encoder`` and the ``scaler``.

    Fitting a fresh encoder and scaler on the incoming frame (as this
    function used to do) derives the dummy columns and the scaling statistics
    from that frame, so the result need not line up with the columns the
    model was fitted on.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw features in the same layout as ``train_features.csv``.

    Returns
    -------
    The output of ``scaler.transform``: the encoded and scaled feature matrix,
    with the same columns as the training matrix.
    """
    df = wrangle(df)

    # Keep exactly the columns selected on the training split, in the same
    # order; this also leaves out `id`, which is not part of `features`.
    df = df[features]

    # transform only -- never refit on data that is being predicted.
    return scaler.transform(encoder.transform(df))

In [147]:
# Predict a status group for every row of the competition test set.
y_pred = model.predict(pre_processing(test_features))

# Start from the sample submission so the file keeps its expected layout,
# replace its status_group column with the predictions, and write it out.
submission = pd.read_csv('sample_submission.csv').assign(status_group=y_pred)
submission.to_csv('submission-02.csv', index=False)