In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('titanic2_train.csv')
test = pd.read_csv('titanic2_test.csv')
concat = pd.concat([train,test])

In [19]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Family
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,49999.5,0.42774,2.10691,38.355472,0.39769,0.45456,43.92933,0.38147
std,28867.657797,0.494753,0.837727,18.009589,0.862566,0.950076,69.54218,0.48575
min,0.0,0.0,1.0,0.08,0.0,0.0,0.68,0.0
25%,24999.75,0.0,1.0,25.0,0.0,0.0,10.04,0.0
50%,49999.5,0.0,2.0,38.355472,0.0,0.0,24.49,0.0
75%,74999.25,1.0,3.0,53.0,1.0,1.0,33.56,1.0
max,99999.0,1.0,3.0,87.0,8.0,9.0,744.66,1.0


In [22]:
def titanic(df) :
    
    # Normalise Age
    from sklearn.preprocessing import MinMaxScaler
    mms = MinMaxScaler()
    df[['Age', 'Fare']] = mms.fit_transform(df[['Age', 'Fare']])
    
    # Create family categories
    df['Family'] = df['SibSp'] + df['Parch']
    df['Family'].values[df['Family'].values > 0] = 1
    
    # Fill blanks in cabin and embarked
    df['Cabin'] = df['Cabin'].fillna('X')
    df['Embarked'] = df['Embarked'].fillna('X')
    
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    df[['Age', 'Fare']] = imp.fit_transform(df[['Age', 'Fare']])

    #Create decks
    df['Deck'] = df['Cabin'].str[:1]
    
    # Select columns
    amend = ['Pclass','Sex','Embarked','Age','Family','Deck']
    df_amend = df[amend]
    
    # Get dummies
    dummy = pd.get_dummies(df_amend, drop_first=True)
    
    return dummy

In [23]:
X = titanic(train)
y = train['Survived']

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21)

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [58]:
lin_reg = LinearRegression()
log_reg = LogisticRegression(max_iter=200)
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()

models = [lin_reg, log_reg, tree, forest]
accuracy = []

for model in models :
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    accuracy.append(model.score(X_test, y_test))
    
results_zip = zip(models, accuracy)
results = dict(results_zip)
results

{LinearRegression(): 0.32552686719737933,
 LogisticRegression(max_iter=200): 0.7671,
 DecisionTreeClassifier(): 0.7536666666666667,
 RandomForestClassifier(): 0.7568666666666667}

In [60]:
log_reg = LogisticRegression(max_iter=200, penalty='none')
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
log_reg.score(X_test, y_test)

0.7695

In [59]:
log_reg = LogisticRegression(max_iter=200)

params = {'penalty' : ['l1', 'l2', 'elasticnet', 'none']}

log_reg_cv = GridSearchCV(log_reg, params, cv=5)
log_reg_cv.fit(X, y)

print(log_reg_cv.best_score_)
print(log_reg_cv.best_params_)

Traceback (most recent call last):
  File "C:\Users\kurtw\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\kurtw\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\kurtw\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\kurtw\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\kurtw\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.

0.77318
{'penalty': 'none'}
