In [3]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

In [4]:
titanic = pd.read_csv("0000000000002429_training_titanic_x_y_train.csv")
titanic.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived
count,668.0,536.0,668.0,668.0,668.0,668.0
mean,2.296407,29.70056,0.528443,0.407186,32.064552,0.402695
std,0.831638,14.240257,1.080327,0.854695,45.320835,0.490808
min,1.0,0.67,0.0,0.0,0.0,0.0
25%,2.0,21.0,0.0,0.0,7.925,0.0
50%,3.0,29.0,0.0,0.0,14.75,0.0
75%,3.0,38.25,1.0,0.0,31.275,1.0
max,3.0,80.0,8.0,6.0,512.3292,1.0


In [5]:
#dropping unimportant columns
titanic.drop("Name",axis=1,inplace = True)
titanic.drop("Ticket",axis=1,inplace = True)
titanic.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
0,2,female,29.0,1,0,26.0,,S,1
1,3,male,,0,0,8.05,,S,0
2,2,male,39.0,0,0,26.0,,S,0
3,3,female,29.0,0,4,21.075,,S,0
4,3,male,25.0,0,0,7.05,,S,0


In [6]:
#finding number of null values
print(titanic.isnull().sum())
titanic.shape

Pclass        0
Sex           0
Age         132
SibSp         0
Parch         0
Fare          0
Cabin       514
Embarked      1
Survived      0
dtype: int64


(668, 9)

In [7]:
#since most of the values of Cabin are null
#delete the column Cabin

del titanic["Cabin"]

In [8]:
titanic.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,2,female,29.0,1,0,26.0,S,1
1,3,male,,0,0,8.05,S,0
2,2,male,39.0,0,0,26.0,S,0
3,3,female,29.0,0,4,21.075,S,0
4,3,male,25.0,0,0,7.05,S,0


In [9]:
def embarked(s):
    if s == "S":
        return 0
    elif s == "C":
        return 1
    else:
        return 2

def sex(s):
    if s=="female":
        return 1
    else:
        return 0
    
titanic["Gender"] = titanic.Sex.apply(sex)
titanic["Em"] = titanic.Embarked.apply(embarked)
del titanic["Sex"]
del titanic["Embarked"]
titanic.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived,Gender,Em
0,2,29.0,1,0,26.0,1,1,0
1,3,,0,0,8.05,0,0,0
2,2,39.0,0,0,26.0,0,0,0
3,3,29.0,0,4,21.075,0,1,0
4,3,25.0,0,0,7.05,0,0,0


In [10]:
#rearranging columns
titanic = titanic[["Pclass","Gender","Age","SibSp","Parch","Fare","Em","Survived"]]
titanic.head()

Unnamed: 0,Pclass,Gender,Age,SibSp,Parch,Fare,Em,Survived
0,2,1,29.0,1,0,26.0,0,1
1,3,0,,0,0,8.05,0,0
2,2,0,39.0,0,0,26.0,0,0
3,3,1,29.0,0,4,21.075,0,0
4,3,0,25.0,0,0,7.05,0,0


In [11]:
#replacing Nan of Age by its mean
titanic.Age.fillna(titanic.Age.mean(),inplace = True)
titanic.head()

Unnamed: 0,Pclass,Gender,Age,SibSp,Parch,Fare,Em,Survived
0,2,1,29.0,1,0,26.0,0,1
1,3,0,29.70056,0,0,8.05,0,0
2,2,0,39.0,0,0,26.0,0,0
3,3,1,29.0,0,4,21.075,0,0
4,3,0,25.0,0,0,7.05,0,0


In [12]:
#trying to fit data better by introducing dummy variables
titanic["age*age"] = titanic.Age*titanic.Age
titanic["gen*gen"] = titanic.Gender*titanic.Gender
titanic["s*s"] = titanic.SibSp*titanic.SibSp
titanic["p*p"] = titanic.Parch*titanic.Parch
titanic["Em*Em"] = titanic.Em*titanic.Em
titanic["pclass*pclass"] = titanic.Pclass*titanic.Pclass
titanic["fare*fare"] = titanic.Fare*titanic.Fare

titanic["1"] = titanic.Pclass*titanic.Gender
titanic["2"] = titanic.Pclass*titanic.Age
titanic["3"] = titanic.Pclass*titanic.SibSp
titanic["4"] = titanic.Pclass*titanic.Parch
titanic["5"] = titanic.Pclass*titanic.Fare
titanic["6"] = titanic.Pclass*titanic.Em

titanic["7"] = titanic.Gender*titanic.Age
titanic["8"] = titanic.Gender*titanic.SibSp
titanic["9"] = titanic.Gender*titanic.Parch
titanic["10"] = titanic.Gender*titanic.Fare
titanic["11"] = titanic.Gender*titanic.Em


titanic = titanic[["Pclass","Gender","Age","SibSp","Parch","Fare","Em","age*age","gen*gen","s*s","p*p","Em*Em","1","2","3","4","5","6","7","8","9","10","11","Survived"]]
titanic.head()

Unnamed: 0,Pclass,Gender,Age,SibSp,Parch,Fare,Em,age*age,gen*gen,s*s,...,3,4,5,6,7,8,9,10,11,Survived
0,2,1,29.0,1,0,26.0,0,841.0,1,1,...,2,0,52.0,0,29.0,1,0,26.0,0,1
1,3,0,29.70056,0,0,8.05,0,882.123247,0,0,...,0,0,24.15,0,0.0,0,0,0.0,0,0
2,2,0,39.0,0,0,26.0,0,1521.0,0,0,...,0,0,52.0,0,0.0,0,0,0.0,0,0
3,3,1,29.0,0,4,21.075,0,841.0,1,0,...,0,12,63.225,0,29.0,0,4,21.075,0,0
4,3,0,25.0,0,0,7.05,0,625.0,0,0,...,0,0,21.15,0,0.0,0,0,0.0,0,0


In [13]:
#apply logistic Regression

titanic_data = np.array(titanic)
x = titanic_data[:,0:-1]
y = titanic_data[:,-1].reshape(-1,1)
x.shape,y.shape

((668, 23), (668, 1))

In [14]:
clf = LogisticRegression()
clf.fit(x,y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
clf.predict(x)
clf.score(x,y)

0.81736526946107779

In [None]:
#starting with the testing data
test = pd.read_csv("0000000000002429_test_titanic_x_test.csv")

In [None]:
#dropping unimportant columns
test.drop("Name",axis=1,inplace = True)
test.drop("Ticket",axis=1,inplace = True)
test.head()

In [None]:
del test["Cabin"]
test.head()

In [None]:
def embarked(s):
    if s == "S":
        return 0
    elif s == "C":
        return 1
    else:
        return 2

def sex(s):
    if s=="female":
        return 1
    else:
        return 0
    
test["Gender"] = test.Sex.apply(sex)
test["Em"] = test.Embarked.apply(embarked)
del test["Sex"]
del test["Embarked"]
test.head()

In [None]:
test = test[["Pclass","Gender","Age","SibSp","Parch","Fare","Em"]]
test.Age.fillna(test.Age.mean(),inplace = True)
test.head()
test.shape

In [None]:
test["age*age"] = test.Age*test.Age
test["gen*gen"] = test.Gender*test.Gender
test["s*s"] = test.SibSp*test.SibSp
test["p*p"] = test.Parch*test.Parch
test["Em*Em"] = test.Em*test.Em

test["1"] = test.Pclass*test.Gender
test["2"] = test.Pclass*test.Age
test["3"] = test.Pclass*test.SibSp
test["4"] = test.Pclass*test.Parch
test["5"] = test.Pclass*test.Fare
test["6"] = test.Pclass*test.Em

test["7"] = test.Gender*test.Age
test["8"] = test.Gender*test.SibSp
test["9"] = test.Gender*test.Parch
test["10"] = test.Gender*test.Fare
test["11"] = test.Gender*test.Em


test = test[["Pclass","Gender","Age","SibSp","Parch","Fare","Em","age*age","gen*gen","s*s","p*p","Em*Em","1","2","3","4","5","6","7","8","9","10","11"]]

test.head()

In [None]:
#test_data = np.array(test)
ypred = clf.predict(test)
print(ypred)
np.savetxt("titanic_result.csv",ypred,delimiter = ",")
ypred.shape,
# don't have test labels as the data is from kaggle, so can't calculate accuracy