In [2]:
%%writefile prep.py
import argparse
import os
from azureml.core import Run
import pandas as pd 
from pandas import read_csv
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import joblib 
	
# Handle to the run this script is executing in; used below to fetch the
# step's input dataset.
run = Run.get_context()

# Command-line contract: three output directories. All of them are mandatory
# because the script writes into each one unconditionally; failing at parse
# time gives a clear usage error instead of a late TypeError on None.
parser = argparse.ArgumentParser("prep")

parser.add_argument("--train", type=str, required=True,
                    help="output directory for the training split (train.txt)")
parser.add_argument("--test", type=str, required=True,
                    help="output directory for the test split (test.txt)")
parser.add_argument("--scaler", type=str, required=True,
                    help="output directory for the fitted scaler (scaler.joblib)")

args = parser.parse_args()

# Materialise the input dataset registered under the name "raw_data" as a
# pandas DataFrame.
titanic_ds = run.input_datasets["raw_data"].to_pandas_dataframe()

# Remove the PassengerId, Name, Ticket and Cabin columns; the remaining
# columns are what the prepare_* steps and the feature slice operate on.
df = titanic_ds.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

# Default seed for the random Age imputation, so repeated runs over the same
# data produce the same prepared dataset.
RANDOM_SEED = 42


def prepare_age(df, seed=RANDOM_SEED):
    """Impute missing ``Age`` values and replace the column with ``Age_Group``.

    Missing ages are filled with random integers drawn uniformly from
    ``[int(mean - std), int(mean + std))`` of the observed ages. The filled
    column is cast to ``int``, cut into 5 quantile buckets (``Age_Group``,
    integer codes starting at 0) and the original ``Age`` column is dropped.

    Args:
        df: DataFrame with an ``Age`` column. It is modified in place.
        seed: Seed for the private random generator used for the imputation.
            Defaults to ``RANDOM_SEED``; pass ``None`` for non-deterministic
            draws.

    Returns:
        The same DataFrame object, with ``Age`` replaced by ``Age_Group``.

    Raises:
        ValueError: If ``Age`` has missing values but no observed value to
            derive the imputation range from.
    """
    missing = df["Age"].isnull()
    n_missing = int(missing.sum())

    if n_missing:
        mean = df["Age"].mean()
        std = df["Age"].std()
        if pd.isna(mean):
            raise ValueError(
                "cannot impute Age: the column has no non-null values")
        if pd.isna(std):
            # A single observed age has no sample standard deviation.
            std = 0.0

        # randint needs integer bounds; truncate explicitly rather than
        # relying on an implicit float -> int conversion.
        low, high = int(mean - std), int(mean + std)

        # A private generator keeps the draw reproducible without touching
        # the global numpy random state.
        rng = np.random.RandomState(seed)
        if high > low:
            fill = rng.randint(low, high, size=n_missing)
        else:
            # Degenerate (empty) range: every missing age gets the same value.
            fill = np.full(n_missing, low)

        ages = df["Age"].copy()
        ages[missing] = fill
        df["Age"] = ages

    df["Age"] = df["Age"].astype(int)

    # Quantize age into 5 classes
    df['Age_Group'] = pd.qcut(df['Age'], 5, labels=False)
    df.drop(['Age'], axis=1, inplace=True)
    return df

def prepare_fare(df):
    """Replace ``Fare`` with a 5-level quantile bucket column ``Fare_Group``.

    Missing fares are treated as 0 before bucketing. ``Fare_Group`` holds the
    integer bucket code (starting at 0) and the original ``Fare`` column is
    dropped.

    Args:
        df: DataFrame with a ``Fare`` column. It is modified in place.

    Returns:
        The same DataFrame object, with ``Fare`` replaced by ``Fare_Group``.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df['Fare']: the chained in-place form operates on a temporary under
    # pandas copy-on-write and would leave the NaNs in ``df``.
    df['Fare'] = df['Fare'].fillna(0)
    df['Fare_Group'] = pd.qcut(df['Fare'], 5, labels=False)
    df.drop(['Fare'], axis=1, inplace=True)
    return df

def prepare_genders(df):
    """Encode the ``Sex`` column as integer codes.

    ``"male"`` -> 0, ``"female"`` -> 1, ``"unknown"`` -> 2. Missing values and
    any other value are also encoded as 2.

    Args:
        df: DataFrame with a ``Sex`` column. It is modified in place.

    Returns:
        The same DataFrame object, with ``Sex`` as an integer column.
    """
    genders = {"male": 0, "female": 1, "unknown": 2}
    # map() yields NaN for anything not in the table; fold those into the
    # "unknown" code. The result is assigned back rather than filled with a
    # chained fillna(inplace=True), which does not update ``df`` under pandas
    # copy-on-write and would make the int cast below fail on NaN.
    codes = df['Sex'].map(genders)
    df['Sex'] = codes.fillna(genders["unknown"]).astype(int)
    return df

def prepare_embarked(df):
    """Encode the ``Embarked`` port column as integer codes.

    ``"S"`` -> 0, ``"C"`` -> 1, ``"Q"`` -> 2, ``"U"`` (unknown) -> 3. Empty
    strings, missing values and unrecognised codes are all encoded as 3, the
    same way ``prepare_genders`` buckets unknown values.

    Args:
        df: DataFrame with an ``Embarked`` column. It is modified in place.

    Returns:
        The same DataFrame object, with ``Embarked`` as an integer column.
    """
    ports = {"S": 0, "C": 1, "Q": 2, "U": 3}
    # map() turns '', NaN and any unrecognised code into NaN, so one fillna on
    # the mapped result covers every "unknown" case. Assigning the result back
    # avoids chained replace/fillna(inplace=True), which does not update
    # ``df`` under pandas copy-on-write.
    codes = df['Embarked'].map(ports)
    df['Embarked'] = codes.fillna(ports["U"]).astype(int)
    return df
    


# Apply the feature-engineering steps in order: age, fare, sex, embarked.
dataframe = df
for step in (prepare_age, prepare_fare, prepare_genders, prepare_embarked):
    dataframe = step(dataframe)

# Work on the raw values: column 0 is the target, columns 1..7 the features.
array = dataframe.to_numpy()

Y = array[:, 0]
X = array[:, 1:8]

# Fit a [0, 1] min-max scaler on the full feature matrix.
# NOTE(review): rescaledX is not used below -- the split is taken from the
# unscaled X, so only the fitted scaler object carries the scaling forward.
# Confirm this is intended.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)

# Hold out a third of the rows for testing, with a fixed shuffle seed.
test_size = 0.33
seed = 7

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Re-attach the target as the last column of each split.
train = np.column_stack([X_train, Y_train])
test = np.column_stack([X_test, Y_test])
 
# Create all three output directories the same way: makedirs also creates
# missing parents and, with exist_ok=True, tolerates a directory that already
# exists (no separate isdir check needed).
os.makedirs(args.train, exist_ok=True)
os.makedirs(args.test, exist_ok=True)
os.makedirs(args.scaler, exist_ok=True)

# Write each split as a text file, one row per line, values formatted as "%f".
np.savetxt(os.path.join(args.train, "train.txt"), train, fmt="%f")
np.savetxt(os.path.join(args.test, "test.txt"), test, fmt="%f")

# Persist the fitted scaler object next to the data.
joblib.dump(scaler, os.path.join(args.scaler, "scaler.joblib"))

Writing prep.py
