In [None]:
import numpy as np 
import pandas as pd 
import os
from sklearn.utils import resample

# List every file available under the Kaggle input directory.
# The unused middle element yielded by os.walk is bound to `_subdirs` instead of
# `_`: assigning `_` in a notebook shadows IPython's "last output" variable and
# stops IPython from updating it for the rest of the session.
for dirname, _subdirs, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Shell escape: print the first lines of gender_submission.csv
# (presumably the expected submission format -- confirm against the competition page).
!head /kaggle/input/titanic/gender_submission.csv

## Load in data sets

In [None]:
# Read the Titanic training and test CSV files into DataFrames.
train = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/train.csv")
test = pd.read_csv(filepath_or_buffer="/kaggle/input/titanic/test.csv")

# Rebalance the training set

In [None]:
# Row count per target class before any resampling.
train.Survived.value_counts()

In [None]:
# Separate the training rows by target class: Survived == 0 and Survived == 1.
train_c0 = train.loc[train["Survived"].eq(0)]
train_c1 = train.loc[train["Survived"].eq(1)]

In [None]:
# Draw class-1 rows with replacement until there are as many of them as class-0 rows.
# The fixed random_state keeps the draw reproducible.
train_c1_upsample = resample(
    train_c1,
    n_samples=train_c0.shape[0],
    replace=True,
    random_state=42,
)


In [None]:
# Peek at the first five resampled class-1 rows.
train_c1_upsample.head(n=5)

In [None]:
# Stack the resampled class-1 rows on top of the untouched class-0 rows.
train_upsampled = pd.concat(objs=[train_c1_upsample, train_c0], axis=0)
train_upsampled.shape

In [None]:
# Row count per target class after upsampling.
train_upsampled["Survived"].value_counts()

# K.I.S.S  approach

In [None]:
# Keeping it simple: a pipeline that imputes, scales, and then
# feeds the result to a LogisticRegression model.
#from sklearn.linear_model import LogisticRegression 
#from sklearn.impute import SimpleImputer
#from sklearn.pipeline import Pipeline
#from sklearn.preprocessing import StandardScaler

#model = LogisticRegression()
#imputer = SimpleImputer(strategy="median")
#scalar = StandardScaler()
#pipe = Pipeline([
#    ("SimpleImputer", imputer),
#    ("StandardScalar", scalar),
#    ("LogisticRegression",model)
#])

#X = train_upsampled[train_upsampled.describe(include=[np.number]).columns.drop("Survived")]
#y = train_upsampled["Survived"]

#pipe.fit(X, y)

#y_pred = pipe.predict(test[X.columns])
#test["Survived"] = y_pred
#sub = test[["PassengerId","Survived"]]
#sub["Survived"].value_counts()
#sub.to_csv("/kaggle/working/submission.csv", index=False)

# Result from scoring: 0.64832

Pipeline steps used for this score:

- `SimpleImputer`
- `StandardScaler`
- `LogisticRegression`

So not great, but I'm only using a few features and some of them are missing quite a bit of information, so let's make it a bit better this time.

In [None]:
# Reuse the upsampled DataFrame built in the K.I.S.S. section above.
# If `train_upsampled` is not defined, re-run the earlier cells first.
train_upsampled.head(n=5)

In [None]:
# Feature engineering starts from an independent (deep) copy of the upsampled frame.
train = train_upsampled.copy()

def feature_engineer(df=None):
    """Encode categorical columns, add a name-length feature, keep numeric columns.

    The input frame is never modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Sex``, ``Embarked`` and ``Name``.

    Returns
    -------
    pandas.DataFrame
        Only the numeric columns of the transformed frame, in their original
        order, with ``NameLength`` appended.

    Raises
    ------
    ValueError
        If ``df`` is None.
    KeyError
        If one of the required columns is missing.
    """
    if df is None:
        raise ValueError("feature_engineer() needs a pandas.DataFrame, got None")

    # .assign builds a new frame, so the caller's DataFrame stays untouched.
    engineered = df.assign(
        # gender -> 0 / 1; any other value becomes NaN
        Sex=df["Sex"].map({"male": 0, "female": 1}),
        # port of embarkation -> 1 / 2 / 3; any other value becomes NaN
        Embarked=df["Embarked"].map({"S": 1, "C": 2, "Q": 3}),
        # number of characters in the name; a missing name yields NaN, not an error
        NameLength=df["Name"].str.len(),
    )
    # Keep only the numeric columns.
    return engineered.select_dtypes(include=[np.number])


# Replace `train` with its numeric, feature-engineered version.
train = feature_engineer(df=train)

In [None]:
# Number of numeric columns, minus one for the `Survived` target,
# after and before feature engineering respectively.
print(train.describe(include=np.number).shape[1] - 1)
print(train_upsampled.describe(include=np.number).shape[1] - 1)


In [None]:
# First five rows of the feature-engineered training frame.
train.head(n=5)

In [None]:
def RMN(df=None, print_col=False, threshold=0.195):
    """Drop, or only report, columns whose share of null values exceeds ``threshold``.

    The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect. If omitted, a hint is printed and None is returned.
    print_col : bool, default False
        If True, perform a dry run: print each flagged column with its null
        ratio and return a status message instead of a DataFrame.
    threshold : float, default 0.195
        A column is flagged when ``null_count / row_count`` is strictly
        greater than this value.

    Returns
    -------
    pandas.DataFrame, str or None
        A new frame without the flagged columns (``print_col=False``), the
        dry-run status message (``print_col=True``), or None when no frame
        was supplied.
    """
    if df is None:
        print("please add a pandas.DataFrame object to input")
        return None

    # Null ratio of every column, computed in one vectorised pass.
    null_ratio = df.isnull().sum() / df.shape[0]
    flagged = null_ratio[null_ratio > threshold]

    if print_col:
        for col, ratio in flagged.items():
            # str(col) keeps the report working for non-string column labels.
            print("column: " + str(col) + " had a nan ratio of: " + str(ratio) + "\n")
        return "dry run done rerun with print_col = False to return df with dropped columns\n"

    # DataFrame.drop returns a new frame, so no defensive copy is needed.
    return df.drop(list(flagged.index), axis=1)
    
        
        
    

In [None]:
# Drop columns whose null ratio is above 0.194. Per the original note, this
# threshold was picked relative to Age's null ratio because Age is imputed
# later in the pipeline rather than handled here.
train = RMN(train, threshold=0.194)

In [None]:
# First five rows after the null-ratio filter.
train.head(n=5)

In [None]:
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# `train` is the upsampled frame: class-1 rows were drawn with replacement, so
# the same passenger can occur several times. A plain row-wise split scatters
# copies of one passenger across both sides, which leaks training rows into the
# hold-out set and inflates its scores. Splitting by PassengerId keeps every
# copy of a passenger on one side only.
# Note: with a group split, test_size is the fraction of passengers (groups),
# not of rows.
_group_split = GroupShuffleSplit(n_splits=1, test_size=.25, random_state=42)
_fit_rows, _holdout_rows = next(
    _group_split.split(train, groups=train["PassengerId"])
)

_features = train.drop("Survived", axis=1)
_target = train["Survived"]
x_train, x_test = _features.iloc[_fit_rows], _features.iloc[_holdout_rows]
y_train, y_test = _target.iloc[_fit_rows], _target.iloc[_holdout_rows]


# Random Forest Model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier as random_forest
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, roc_auc_score

In [None]:
# Preprocessing steps: median imputation of missing values, then standardisation.
imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()

# Random forest classifier; random_state is fixed for reproducibility.
model = random_forest(
    n_estimators=1500,
    max_depth=10,
    max_features=4,
    bootstrap=True,
    random_state=42,
)

In [None]:
# Chain the steps: impute -> scale -> classify.
pipe = Pipeline(
    steps=[
        ("SimpleImputer", imputer),
        ("StandardScaler", scaler),
        ("random forest", model),
    ]
)


In [None]:
# Fit the whole pipeline on the training split.
pipe.fit(X=x_train, y=y_train)

In [None]:
# Column 1 of predict_proba is the probability of the second class in
# pipe.classes_ (presumably Survived == 1 -- confirm).
y_pred_prob = pipe.predict_proba(x_test)[:, 1]
# Hard class labels for the hold-out split.
y_pred = pipe.predict(x_test)


In [None]:
# Hold-out metrics: ROC AUC uses the predicted probabilities,
# the other three use the hard labels.
roc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
cmatrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Report the hold-out metrics. sep="" makes print() join the pieces exactly
# as string concatenation would.
print("Model accuracy: ", acc, "\nModel F1 Score: ", f1, "\n", sep="")
print("confusion matrix\n\n")
print(cmatrix)
print("\nROC_AUC score: ", roc, sep="")

# Predicting a submission score of about 87% accuracy
### Let's get the actual test set ready

In [None]:
# Apply the same feature engineering to a deep copy of the raw test frame.
X_test = feature_engineer(test.copy(deep=True))
X_test.head(n=5)


In [None]:
# Predict on the competition test set using exactly the feature columns the
# pipeline was fitted on (the columns of the hold-out feature frame x_test).
# NOTE(review): this rebinds `y_pred`, which previously held the hold-out predictions.
y_pred = pipe.predict(X_test[x_test.columns])
# Attach the predictions to X_test in place.
X_test["Survived"] = y_pred
# The submission keeps only the passenger id and the predicted label.
sub = X_test[["PassengerId","Survived"]]
# Distribution of predicted classes, as a sanity check.
sub["Survived"].value_counts()

In [None]:
# First five rows of the submission frame.
sub.head(n=5)

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv(path_or_buf="/kaggle/working/submission-rf.csv", index=False)