# **Imports**

In [None]:
import pandas as pd
from random import randint
import numpy as np
import math

# **Reading the data**

In [None]:
# Load the labelled training rows; `dl` keeps their count so the merged frame
# can later be split back into its train and test parts.
data = pd.read_csv('train.csv')
dl = data.shape[0]
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# Load test.csv into `testd` and display it.
testd = pd.read_csv('test.csv')
testd

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


Here I've merged the train and test data because some categorical columns will be converted to numerical values; if the two sets were encoded separately, the same categorical value could be mapped to a different number in each set.

In [None]:
# Stack the training rows on top of the test rows so both are encoded together.
# DataFrame.append was removed in pandas 2.0; pd.concat performs the same
# row-wise stacking and, like append, keeps each frame's original row labels.
w_d = pd.concat([data, testd])

# **Preprocessing**

I've converted the categorical columns to numerical ones and created a one-hot encoding of the 'Embarked' column.

In [None]:
def preprocessing(data):
  """Encode the categorical columns and return the frame as a numeric array.

  Steps:
    1. Ticket, Sex, Cabin, Embarked and Name are replaced by integer codes
       from `factorize` (missing values receive the code -1).
    2. The Embarked codes are one-hot encoded; the new columns are named after
       the integer codes themselves.
    3. Remaining NaNs in every column except 'Survived' are filled with that
       column's mean.
    4. 'Embarked' is dropped and 'Survived' is moved to the last position.

  Args:
    data: DataFrame containing at least the columns Ticket, Sex, Cabin,
      Embarked, Name and Survived. It is not modified.

  Returns:
    (dn, s): `dn` is a 2-D numpy array of the processed values and `s` is the
    list of column labels matching the columns of `dn` ('Survived' last).

  Raises:
    ValueError: if 'Survived' is not among the columns.
  """
  # Work on a copy so the caller's frame keeps its raw values.
  data = data.copy()

  # Categorical columns to numeric columns!
  for col in ('Ticket', 'Sex', 'Cabin', 'Embarked', 'Name'):
    data[col] = data[col].factorize()[0]

  # Cast the dummies to float: recent pandas returns bool columns, which would
  # turn the final array into object dtype when mixed with numeric columns.
  embarked_dummies = pd.get_dummies(data.Embarked).astype(float)
  data = pd.concat([data, embarked_dummies], axis=1)

  # Some columns (e.g. Age) contain NaN cells; replace them with the column mean.
  # 'Survived' is left alone because its NaNs mark the unlabelled test rows.
  for col in data.columns:
    if col != 'Survived':
      data[col] = data[col].fillna(data[col].mean())

  s = list(data.columns)
  s.remove('Survived')
  s.remove('Embarked')
  s.append('Survived')

  dn = np.array(data[s])
  return dn, s

In [None]:
def normalize(X):
  """Mean-center each column of X and divide it by the column's range (max - min).

  Args:
    X: array-like of numeric values; statistics are taken along axis 0.

  Returns:
    Float array of the same shape as X. A column whose values are all equal
    has zero range; it is divided by 1 instead, so it comes out as all zeros
    rather than NaN.
  """
  X = np.asarray(X, dtype=float)
  span = np.amax(X, axis=0) - np.amin(X, axis=0)
  # Guard against 0/0 for constant columns.
  span = np.where(span == 0, 1.0, span)
  return (X - np.mean(X, axis=0)) / span

In [None]:
from sklearn.model_selection import train_test_split

# Encode the merged frame, then wrap the array in a DataFrame again; this gives
# the rows a fresh 0..N-1 index.
encoded, s = preprocessing(w_d)
w_d = pd.DataFrame(encoded, columns=s)
s.remove('Survived')  # from here on `s` lists the feature columns only

# Scale every feature column (train and test rows together).
w_d[s] = normalize(np.array(w_d[s].values))

# The first `dl` rows are the labelled training rows; the rest came from test.csv.
is_train_row = w_d.index < dl
train = w_d[is_train_row]
test = w_d[~is_train_row]

X_train = np.array(train[s])
y_train = np.array(train['Survived']).reshape((-1, 1))

# Hold out a quarter of the labelled rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42)

X_test = np.array(test[s])

In [None]:
# Display the merged frame after encoding and normalization.
w_d

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,-1,0,1,2,Survived
0,-0.500000,0.352559,-0.499812,-0.355997,-0.098724,0.062643,-0.042781,-0.436869,-0.050837,-0.099879,-0.001528,0.301757,-0.206264,-0.093965,0.0
1,-0.499235,-0.647441,-0.499047,0.644003,0.101702,0.062643,-0.042781,-0.435791,0.074147,-0.094503,-0.001528,-0.698243,0.793736,-0.093965,1.0
2,-0.498471,0.352559,-0.498281,0.644003,-0.048618,-0.062357,-0.042781,-0.434713,-0.049520,-0.099879,-0.001528,0.301757,-0.206264,-0.093965,1.0
3,-0.497706,-0.647441,-0.497515,0.644003,0.064122,0.062643,-0.042781,-0.433636,0.038656,-0.089127,-0.001528,0.301757,-0.206264,-0.093965,1.0
4,-0.496942,0.352559,-0.496749,-0.355997,0.064122,-0.062357,-0.042781,-0.432558,-0.049276,-0.099879,-0.001528,0.301757,-0.206264,-0.093965,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,0.496942,0.352559,0.497125,-0.355997,0.000000,-0.062357,-0.042781,0.560976,-0.049276,-0.099879,-0.001528,0.301757,-0.206264,-0.093965,
1305,0.497706,-0.647441,0.497891,0.644003,0.114229,-0.062357,-0.042781,-0.142688,0.147570,0.900121,-0.001528,-0.698243,0.793736,-0.093965,
1306,0.498471,0.352559,0.498656,-0.355997,0.107965,-0.062357,-0.042781,0.562054,-0.050837,-0.099879,-0.001528,0.301757,-0.206264,-0.093965,
1307,0.499235,0.352559,0.499422,-0.355997,0.000000,-0.062357,-0.042781,0.563131,-0.049276,-0.099879,-0.001528,0.301757,-0.206264,-0.093965,


# **The model**

This function calculates the accuracy of the model.

In [None]:
def calc_accuracy(actual, predicted):
  """Return the fraction of entries where `actual` equals `predicted`.

  `actual` is reshaped to `predicted.shape`; both arrays are then converted to
  Python lists and compared entry by entry along the first axis.
  """
  truth = actual.reshape(predicted.shape).tolist()
  guesses = predicted.tolist()
  hits = sum(1 for t, g in zip(truth, guesses) if t == g)
  return hits / len(truth)

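The following cells train SVM classifiers for several hyperparameter values (the polynomial degree for the kernel SVC, then `C` for the linear SVC) and print the training and validation accuracy for each.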

In [None]:
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.svm import SVC

In [None]:
# Sweep the degree of a polynomial-kernel SVC (the C penalty stays at its default).
cs = list(range(1, 10))

# The estimators expect a 1-D label vector; flatten once instead of per iteration.
y_train_flat = y_train.reshape((y_train.shape[0],))

best_acc = -1      # training accuracy of the degree currently held in best_c
best_c = 0         # selected polynomial degree
best_val_acc = -1  # validation accuracy of the degree currently held in best_c
for c in cs:
  linsvc = SVC(kernel='poly', degree=c)
  linsvc.fit(X_train, y_train_flat)

  t_p = linsvc.predict(X_train)
  acc = calc_accuracy(y_train, t_p)

  t_p_v = linsvc.predict(X_val)
  acc_v = calc_accuracy(y_val, t_p_v)

  # Select on held-out accuracy: picking the highest training accuracy would
  # reward the degree that overfits the most.
  if acc_v > best_val_acc:
    best_val_acc = acc_v
    best_acc = acc
    best_c = c
  print('When degree is: ', c, 'Accuracy in train part is: ', acc, ' and accuracy in val part is : ', acc_v)

When C is:  1 Accuracy in train part is:  0.7874251497005988  and accuracy in val part is :  0.7847533632286996
When C is:  2 Accuracy in train part is:  0.811377245508982  and accuracy in val part is :  0.8071748878923767
When C is:  3 Accuracy in train part is:  0.8188622754491018  and accuracy in val part is :  0.8116591928251121
When C is:  4 Accuracy in train part is:  0.8293413173652695  and accuracy in val part is :  0.8071748878923767
When C is:  5 Accuracy in train part is:  0.8338323353293413  and accuracy in val part is :  0.8161434977578476
When C is:  6 Accuracy in train part is:  0.8203592814371258  and accuracy in val part is :  0.7892376681614349
When C is:  7 Accuracy in train part is:  0.8083832335329342  and accuracy in val part is :  0.7623318385650224
When C is:  8 Accuracy in train part is:  0.812874251497006  and accuracy in val part is :  0.7757847533632287
When C is:  9 Accuracy in train part is:  0.8083832335329342  and accuracy in val part is :  0.77130044843

In [None]:
# Sweep the regularization strength C of a linear SVM.
cs = [0.01, 0.05, 0.1, 0.5, 1]

# The estimators expect a 1-D label vector; flatten once instead of per iteration.
y_train_flat = y_train.reshape((y_train.shape[0],))

best_acc_l = -1      # training accuracy of the C currently held in best_c_L
best_c_L = 0         # selected C value
best_val_acc_l = -1  # validation accuracy of the C currently held in best_c_L
for c in cs:
  linsvc = LinearSVC(C=c)
  linsvc.fit(X_train, y_train_flat)

  t_p = linsvc.predict(X_train)
  acc = calc_accuracy(y_train, t_p)

  t_p_v = linsvc.predict(X_val)
  acc_v = calc_accuracy(y_val, t_p_v)

  # Select on held-out accuracy: picking the highest training accuracy would
  # reward the setting that overfits the most.
  if acc_v > best_val_acc_l:
    best_val_acc_l = acc_v
    best_acc_l = acc
    best_c_L = c
  print('When C is: ', c, 'Accuracy in train part is: ', acc, ' and accuracy in val part is : ', acc_v)

When C is:  0.01 Accuracy in train part is:  0.7949101796407185  and accuracy in val part is :  0.7847533632286996
When C is:  0.05 Accuracy in train part is:  0.7979041916167665  and accuracy in val part is :  0.7802690582959642
When C is:  0.1 Accuracy in train part is:  0.7979041916167665  and accuracy in val part is :  0.7757847533632287
When C is:  0.5 Accuracy in train part is:  0.8008982035928144  and accuracy in val part is :  0.7802690582959642
When C is:  1 Accuracy in train part is:  0.7979041916167665  and accuracy in val part is :  0.7802690582959642


In [None]:
# Choose between the two model families on held-out data: refit the selected
# candidate from each sweep and compare their validation accuracy. Comparing
# training accuracies would favour whichever model memorizes the training rows.
y_train_flat = y_train.reshape((y_train.shape[0],))

linear_candidate = LinearSVC(C=best_c_L)
linear_candidate.fit(X_train, y_train_flat)
linear_val_acc = calc_accuracy(y_val, linear_candidate.predict(X_val))

poly_candidate = SVC(kernel='poly', degree=best_c)
poly_candidate.fit(X_train, y_train_flat)
poly_val_acc = calc_accuracy(y_val, poly_candidate.predict(X_val))

# The linear model wins only when strictly better; ties go to the polynomial model.
linsvc = linear_candidate if linear_val_acc > poly_val_acc else poly_candidate
predicted = linsvc.predict(X_test)

In [None]:
# Display the labels predicted for the test rows.
predicted

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 0., 1., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0.,
       1., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1.,
       1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 1., 0., 1., 0.