In [305]:
import pandas as pd
import numpy as np

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
gender_submission = pd.read_csv('gender_submission.csv')

In [306]:
data = pd.concat([train,test])

In [307]:
data['Sex'].replace(['male','female'], [1,2], inplace = True)

In [308]:

data['Embarked'].isnull().sum()

2

In [309]:
data.loc[data['Embarked'].isnull(), 'Embarked'] = 'S' # As most of the passengers embarked from Southampton
data['Embarked'].replace(['S','Q','C'],['1','2','3'],inplace=True)

In [310]:
data['Family Size'] = data['SibSp'] + data['Parch']

In [311]:
data.pop('SibSp')
data.pop('Parch')

0      0
1      0
2      0
3      0
4      0
      ..
413    0
414    0
415    0
416    0
417    1
Name: Parch, Length: 1309, dtype: int64

In [312]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,Family Size
0,1,0.0,3,"Braund, Mr. Owen Harris",1,22.0,A/5 21171,7.2500,,1,1
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,38.0,PC 17599,71.2833,C85,3,1
2,3,1.0,3,"Heikkinen, Miss. Laina",2,26.0,STON/O2. 3101282,7.9250,,1,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",2,35.0,113803,53.1000,C123,1,1
4,5,0.0,3,"Allen, Mr. William Henry",1,35.0,373450,8.0500,,1,0
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,"Spector, Mr. Woolf",1,,A.5. 3236,8.0500,,1,0
414,1306,,1,"Oliva y Ocana, Dona. Fermina",2,39.0,PC 17758,108.9000,C105,3,0
415,1307,,3,"Saether, Mr. Simon Sivertsen",1,38.5,SOTON/O.Q. 3101262,7.2500,,1,0
416,1308,,3,"Ware, Mr. Frederick",1,,359309,8.0500,,1,0


In [313]:
data['Name'].str.extract('([A-Za-z]+)\.').value_counts()
data['Name'].count()

1309

In [314]:
data['Initial'] = 0

for i in data:
    data['Initial'] = data['Name'].str.extract('([A-Za-z]+)\.')
    
data['Initial'].replace(['Master', 'Rev', 'Dr', 'Col', 'Mlle', 'Ms', 'Major', 'Mme', 'Capt', 'Lady', 'Jonkheer',
                            'Dona', 'Don', 'Countess', 'Sir'],
                           ['Mr', 'Mr', 'Mr', 'Mr', 'Miss', 'Miss', 'Mr', 'Mrs', 'Mr', 
                            'Mrs','Other', 'Miss', 'Mr', 'Mrs', 'Mr'],inplace=True)

In [315]:
age_mean = data.groupby('Initial')['Age'].mean().map(np.ceil)

for idx, val in zip(age_mean.index, age_mean.values):
    data.loc[(data['Age'].isnull())&(data['Initial'] == idx), "Age"] = val


In [316]:
data['Age'].isnull().any()

False

In [317]:
data['Embarked'].isnull().any()

False

In [318]:
data['Cabin'].value_counts()

C23 C25 C27        6
G6                 5
B57 B59 B63 B66    5
C22 C26            4
F33                4
                  ..
A14                1
E63                1
E12                1
E38                1
C105               1
Name: Cabin, Length: 186, dtype: int64

In [319]:
data.loc[data['Fare'].isnull(), 'Fare'] = data.Fare.mean()
data['Fare'].isnull().any()

False

In [320]:
data['Initial'].replace(['Mr', 'Mrs', 'Miss', 'Other'], ['1', '2', '3', '4'], inplace=True)



In [321]:
data[['Sex', 'Age', 'Pclass', 
      'Embarked', 'Initial', 'Family Size']] = data[['Sex', 'Age', 'Pclass', 
                                                     'Embarked', 'Initial', 'Family Size']].astype('int64')

In [322]:
data.drop(['Ticket', 'Cabin','Name'],axis=1,inplace=True)

In [323]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler

robust_scaler = RobustScaler()
minmax_scaler = MinMaxScaler()

data['Age'] = minmax_scaler.fit_transform(data['Age'].values.reshape(-1,1))

data['Fare'] = robust_scaler.fit_transform(data['Fare'].values.reshape(-1,1))



In [324]:
df = data.copy()
training = df[df['PassengerId'] <= 891]
testing  = df[df['PassengerId'] >891]

testing = testing.drop('Survived', axis=1)
testing

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Family Size,Initial
0,892,3,1,0.4250,-0.283372,2,0,1
1,893,3,2,0.5875,-0.318839,1,1,2
2,894,2,1,0.7750,-0.203886,2,0,1
3,895,3,1,0.3375,-0.247729,1,0,1
4,896,3,2,0.2750,-0.092676,1,2,2
...,...,...,...,...,...,...,...,...
413,1305,3,1,0.3875,-0.273927,1,0,1
414,1306,1,2,0.4875,4.039736,3,0,3
415,1307,3,1,0.4750,-0.308146,1,0,1
416,1308,3,1,0.3875,-0.273927,1,0,1


In [325]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn import svm
from sklearn import neighbors
from sklearn import linear_model

In [326]:
X = training.drop('Survived', axis =1)
Y = training['Survived']

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.25, random_state = 42)


In [327]:
## SVM

svm_model = svm.SVC()
svm_model.fit(X_train, Y_train)

svm_predictions = svm_model.predict(X_test)
svm_acc = accuracy_score(Y_test, svm_predictions)

print(classification_report(Y_test, svm_predictions))

              precision    recall  f1-score   support

         0.0       0.60      1.00      0.75       134
         1.0       0.00      0.00      0.00        89

    accuracy                           0.60       223
   macro avg       0.30      0.50      0.38       223
weighted avg       0.36      0.60      0.45       223



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [328]:
## K nearest neighbor

knn_model = neighbors.KNeighborsClassifier(25)
knn_model.fit(X_train, Y_train)

knn_predictions = knn_model.predict(X_test)
knn_acc = accuracy_score(Y_test, knn_predictions)

print(classification_report(Y_test, knn_predictions))

              precision    recall  f1-score   support

         0.0       0.60      0.83      0.69       134
         1.0       0.38      0.16      0.22        89

    accuracy                           0.56       223
   macro avg       0.49      0.49      0.46       223
weighted avg       0.51      0.56      0.51       223



In [329]:
## Linear Regression

lreg_model = linear_model.LogisticRegression()
lreg_model.fit(X_train, Y_train)

lreg_predictions = lreg_model.predict(X_test)

print(classification_report(Y_test, lreg_predictions))


              precision    recall  f1-score   support

         0.0       0.85      0.81      0.83       134
         1.0       0.74      0.79      0.76        89

    accuracy                           0.80       223
   macro avg       0.79      0.80      0.80       223
weighted avg       0.81      0.80      0.80       223



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [330]:
testing['Survived'] = lreg_model.predict(testing)
print(np.array(testing['Survived']) )


[0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1.
 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0.
 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0.
 1. 1. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 1. 0.
 1. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 1. 1.
 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0.
 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0.
 1. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1.
 1. 1. 1. 0. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1.
 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 0.
 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0.
 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0.

In [339]:
output = testing[['PassengerId', 'Survived']]
output.set_index('PassengerId', inplace=True)
output.to_csv('./output_lreg.csv')