In [28]:
import pandas as pd

class Titanic:
    def __init__(self, filename, uselessfeatures,categoricalfeatures,targetfeature):
        self.df = pd.read_csv(filename, delimiter=',')
        self.uselessfeatures = uselessfeatures
        self.categoricalfeatures = categoricalfeatures
        self.targetfeature = targetfeature
        
    def showtypes(self):
        print(self.df.dtypes)
        
    def missingdata(self):
        print(round(self.df.isnull().sum()*100/len(self.df),2).sort_values(ascending=False))
        
    def dummymaker(self):
        for categoricalfeature in self.categoricalfeatures:
            encodedfeature = pd.get_dummies(self.df[categoricalfeature],prefix=categoricalfeature,drop_first=True)
            self.df = pd.concat([self.df,encodedfeature],axis=1)
            self.df.drop([categoricalfeature],axis=1, inplace=True)
            
    def removeuselessfeatures(self):
        for uselessfeature in self.uselessfeatures:
            self.df.drop([uselessfeature],axis=1, inplace=True)
            
    def Xy(self):
        y = self.df[self.targetfeature]
        X = self.df[self.df.columns.difference([self.targetfeature])]
        return X, y
    
    def X(self):
        X = self.df
        return X
        
    def removemissingdata(self):
        self.df = self.df.dropna()
        
    def fillna(self, column):
        self.df[column].fillna(self.df[column].mean(), inplace = True) 
        
    def showpassenger(self, i):
        print(self.df.iloc[i])
             
        
        

In [29]:
titanictrain = Titanic(filename = "train.csv", uselessfeatures = ['Name','Ticket','PassengerId','Cabin'], categoricalfeatures = ['Sex','Pclass','Embarked'], targetfeature = 'Survived')

#titanictrain.showtypes()
titanictrain.missingdata()
titanictrain.dummymaker()
titanictrain.removeuselessfeatures()
titanictrain.fillna("Age")
titanictrain.removemissingdata()
#titanictrain.showpassenger(15)
X, y = titanictrain.Xy()

Cabin          77.10
Age            19.87
Embarked        0.22
Fare            0.00
Ticket          0.00
Parch           0.00
SibSp           0.00
Sex             0.00
Name            0.00
Pclass          0.00
Survived        0.00
PassengerId     0.00
dtype: float64


In [30]:
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, stratify=y, test_size=0.3, random_state=100)

In [31]:
from sklearn import svm
clf = svm.SVC()
clf.fit(Xtrain, ytrain)
print(clf.score(Xtest, ytest))

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(Xtrain, ytrain)
print(clf.score(Xtest, ytest))

0.7126865671641791
0.7873134328358209
0.7126865671641791


In [34]:
titanictest = Titanic(filename = "test.csv", uselessfeatures = ['Name','Ticket','PassengerId','Cabin'], categoricalfeatures = ['Sex','Pclass','Embarked'], targetfeature = None)

dfpassengerid = titanictest.df['PassengerId']
#titanictest.showtypes()
titanictest.fillna("Age")
titanictest.fillna("Fare")
titanictest.removeuselessfeatures()
titanictest.dummymaker()
#titanictest.missingdata()
Xtest = titanictest.X()
ypred = clf.predict(Xtest)

dfypred = pd.DataFrame({'Survived': ypred})
dfsubmission = pd.concat([dfpassengerid,dfypred],axis=1)
dfsubmission.to_csv('out.csv', index = False)


Embarked_S    0.0
Embarked_Q    0.0
Pclass_3      0.0
Pclass_2      0.0
Sex_male      0.0
Fare          0.0
Parch         0.0
SibSp         0.0
Age           0.0
dtype: float64
