In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Titanic test and train splits from the relative input directory.
data_test = pd.read_csv("../input/titanic/test.csv")
data_train = pd.read_csv("../input/titanic/train.csv")
# Preview the first rows of the training split (shown as the cell output).
data_train.head()
# Author's note: data_train.shape was (891, 12).
# data_test.head() previews the test split in the same way.

We observe that there are a lot of features, so we will make a list of them and we will use them next:

In [None]:
# Collect every column name of the training frame into a plain Python list.
features = list(data_train.columns)
print(features)

Next we identify the non-numeric features, i.e. every column whose dtype is `object` rather than int or float:

In [None]:
def findObj(features, data=None):
    """Return the names in ``features`` whose column dtype is ``object``.

    Parameters
    ----------
    features : iterable of str
        Column names to inspect.
    data : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``data_train``
        frame is used, which keeps existing ``findObj(features)`` calls
        working unchanged.

    Returns
    -------
    list of str
        The object-typed column names, in the order given by ``features``.
    """
    if data is None:
        # Backward-compatible default: fall back to the global training frame.
        data = data_train
    return [feat for feat in features if data[feat].dtype == "object"]
findObj(features)

The object-typed columns returned by findObj may contain NaN values. So next we check which columns contain null values:

In [None]:
def where_null(data_train, features):
    """List the columns among ``features`` that hold at least one null value.

    Parameters
    ----------
    data_train : pandas.DataFrame
        Frame whose columns are inspected.
    features : iterable of str
        Column names to check.

    Returns
    -------
    list of str
        Names of the columns containing one or more nulls, in the order
        given by ``features``.
    """
    return [name for name in features if data_train[name].isnull().any()]
problem_feat = where_null(data_train, features)
problem_feat

Now let us see how many null values there are in each of these columns:

In [None]:
# Print "<column> <number of nulls>" for every column that has missing values.
null_counts = data_train[problem_feat].isnull().sum()
for column_name, n_missing in null_counts.items():
    print(f"{column_name} {n_missing}")

We see that there are a lot of null values here. Now we compare these counts to the number of actual (non-null) values each column has:

In [None]:
# Print "<column> <number of non-null values>" for every column with missing data.
# Series.count() counts the non-null entries directly, so the result stays
# correct for any number of rows instead of relying on a hard-coded 891.
for feat in problem_feat:
    print(feat + " " + str(data_train[feat].count()))

We notice that 'Embarked' and 'Age' have some useful information; compared to 'Cabin', these columns are more useful. But we should see how useful they are for predicting the outcome (we can use mutual information or analyse the data based on percentiles). For example:

In [None]:
# Number of passengers per sex.
# countplot draws one bar per category whose height is the number of rows in
# that category; a barplot with y = data_train.index would instead show the
# mean row index of each group, which is not a count.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x=data_train["Sex"], ax=ax)
ax.set(title="Based on gender", xlabel="Sex", ylabel="Number");

We observe that there are slightly more men than women so it is balanced.

In [None]:
# Number of passengers per ticket class.
# countplot reports how many rows fall in each class; a barplot with
# y = data_train.index would show the mean row index per class instead.
fig, ax = plt.subplots(figsize=(10, 5), num="Based on Class")
sns.countplot(x=data_train["Pclass"], ax=ax)
ax.set(title="Based on class", xlabel="Pclass", ylabel="Number");

Classes are balanced. So now we will use mutual information to decide which features are most important for our model. When working with this data, it is important to separate the output column ('Survived') from the rest of the data and to factorize the object-type columns, because mutual information behaves differently for discrete and continuous features.

In [None]:
# Work on a copy so that data_train itself is left untouched.
X = data_train.copy()
# pop() removes the "Survived" column from X and returns it as the target y.
y = X.pop("Survived")

We notice that "Age" has null values in it. So we replace the NaN values with the mean of the column:

In [None]:
# Replace missing ages with the mean age.
# The filled column is assigned back to X explicitly: calling
# fillna(..., inplace=True) on the X['Age'] selection is a chained in-place
# operation, which does not update X when pandas Copy-on-Write is active.
X['Age'] = X['Age'].fillna(X['Age'].mean())

In [None]:
def transformFare(X):
    """Convert the ``Fare`` column of ``X`` to integers, in place.

    Fractional parts are truncated toward zero. The column keeps its current
    position and the frame keeps its index, so the function works for frames
    of any column layout and any row index (not only a default RangeIndex
    with ``Fare`` in position 8).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a numeric ``Fare`` column; modified in place.

    Raises
    ------
    ValueError
        If ``Fare`` contains NaN, which cannot be represented as an integer.
    """
    X["Fare"] = X["Fare"].astype("int64")
    
def transformAge(X):
    """Convert the ``Age`` column of ``X`` to integers, in place.

    Fractional parts are truncated toward zero. The column keeps its current
    position and the frame keeps its index, so the function works for frames
    of any column layout and any row index (not only a default RangeIndex
    with ``Age`` in position 4).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a numeric ``Age`` column; modified in place.

    Raises
    ------
    ValueError
        If ``Age`` contains NaN, which cannot be represented as an integer.
    """
    X["Age"] = X["Age"].astype("int64")
    
transformFare(X)
transformAge(X)
X.head()

So after preprocessing the data, we are ready to work with our dataset. We begin by implementing a mutual information function:

In [None]:
# Replace every object-typed column with integer codes (one code per distinct value).
object_columns = list(X.select_dtypes("object").columns)
for column_name in object_columns:
    codes, _uniques = X[column_name].factorize()
    X[column_name] = codes
# Boolean mask over the columns of X: True where the column dtype is int.
discrete_features = X.dtypes == int
discrete_features

In [None]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y, discrete_features, random_state=0):
    """Compute mutual-information scores between each column of ``X`` and ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; must be fully numeric.
    y : array-like
        Target values.
    discrete_features : bool, 'auto' or array-like
        Which columns of ``X`` to treat as discrete; forwarded to
        ``mutual_info_regression`` (it was previously accepted but ignored).
    random_state : int or None, default 0
        Seed forwarded to the estimator so repeated runs give the same scores.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by column name, sorted descending.
    """
    # NOTE(review): for a categorical target such as a 0/1 label,
    # mutual_info_classif may be the more appropriate estimator - confirm.
    mi_scores = mutual_info_regression(
        X, y, discrete_features=discrete_features, random_state=random_state
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

mi_scores = make_mi_scores(X, y, discrete_features)
mi_scores

We observe that Sex and Pclass influence the probability of survival the most. So we decide that we can use a linear model, Logistic Regression. But first, let's split the data into train and test sets:

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation; random_state = 0 fixes the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [None]:
from sklearn.linear_model import LogisticRegression
def logis(X_train, y_train, X_test, y_test,
          c_values=(0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000)):
    """Fit one LogisticRegression per C value and print train/test accuracy.

    Parameters
    ----------
    X_train, y_train : array-like
        Data the models are fitted on.
    X_test, y_test : array-like
        Held-out data the models are scored on.
    c_values : iterable of float, optional
        Values of the ``C`` parameter to try. Defaults to the original grid
        from 1e-4 to 1e5 in powers of ten.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    for c in c_values:
        # A very high max_iter is kept from the original code.
        logmodel = LogisticRegression(C = c, max_iter = 100000).fit(X_train, y_train)
        print("The score for training {} is :{:.3f}".format(c, logmodel.score(X_train, y_train)))
        print("The score for testing {} is :{:.3f}\n".format(c, logmodel.score(X_test, y_test)))
    
logis(X_train, y_train, X_test, y_test)

We observe that the best values of the regularization parameter C are the ones above 0.1. It is not a model that performs poorly, but it can surely be improved. We could have used other methods for testing our model, like Leave-One-Out or shuffling the data, but it does not matter much, since we have another data frame for testing the algorithm. We shall see after testing it on the final dataset.

Another model we can use is an SVM. It can be built in different ways by adjusting the parameters.

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent.
# random_state is fixed so the fitted model, and therefore the reported
# score, is the same on every run instead of changing with each shuffle.
svm = SGDClassifier(loss = 'modified_huber', random_state = 0)

svm.fit(X_train, y_train)
# Mean accuracy on the held-out split (shown as the cell output).
svm.score(X_test, y_test)
