In [1]:
#Import libraries
import numpy as np
import pandas as pd

In [2]:
#Read the data set
df1 = pd.read_csv('train.csv')

In [3]:
#Have a look on data set
df1

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
#Dropping the unnecessary features as cleaning data helps in better prediction
df1.drop(['PassengerId','Ticket','Cabin','Embarked','SibSp','Parch','Name'], axis=1, inplace=True)

In [5]:
df1.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [6]:
#Counting null values in dataset
df1.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
dtype: int64

In [7]:
#Describing age
df1['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [8]:
#Filling nulls of age column with its mean for better results
df1['Age'].fillna(df1['Age'].mean(),inplace=True)

In [9]:
#checking if there are any nulls further
df1.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
dtype: int64

In [10]:
#Replacing the sex column with dummies in binary format
l_sex_dummies=pd.get_dummies(df1['Sex'],drop_first=True)

In [11]:
#Concatinating the column
df1=pd.concat([df1,l_sex_dummies],axis=1)

In [12]:
df1.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,male
0,0,3,male,22.0,7.25,1
1,1,1,female,38.0,71.2833,0
2,1,3,female,26.0,7.925,0
3,1,1,female,35.0,53.1,0
4,0,3,male,35.0,8.05,1


In [13]:
#Droppin the sex column
df1.drop(['Sex'], axis=1, inplace=True)

In [14]:
df1

Unnamed: 0,Survived,Pclass,Age,Fare,male
0,0,3,22.000000,7.2500,1
1,1,1,38.000000,71.2833,0
2,1,3,26.000000,7.9250,0
3,1,1,35.000000,53.1000,0
4,0,3,35.000000,8.0500,1
...,...,...,...,...,...
886,0,2,27.000000,13.0000,1
887,1,1,19.000000,30.0000,0
888,0,3,29.699118,23.4500,0
889,1,1,26.000000,30.0000,1


In [15]:
#Importing standardscaler function from sklearn library
from sklearn.preprocessing import StandardScaler
sts = StandardScaler()

In [155]:
#Feature scaling Age and fare columns
feature_scale = ['Age','Fare']
df1[feature_scale] = sts.fit_transform(df1[feature_scale])

In [156]:
df1.head()

Unnamed: 0,Survived,Pclass,Age,Fare,male
0,0,3,-0.592481,-0.502445,1
1,1,1,0.638789,0.786845,0
2,1,3,-0.284663,-0.488854,0
3,1,1,0.407926,0.42073,0
4,0,3,0.407926,-0.486337,1


In [157]:
#Dividing the data for training model
y=df1['Survived']
X=df1.drop(['Survived'], axis=1)

In [158]:
#Importing libraries for models
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [159]:
#Creating a dictionary variable of models
model_param = {
    'DecisionTreeClassifier':{
        'model':DecisionTreeClassifier(),
        'param':{
            'criterion': ['gini','entropy']
        }
    },
        'KNeighborsClassifier':{
        'model':KNeighborsClassifier(),
        'param':{
            'n_neighbors': [5,10,15,20,25]
        }
    },
        'SVC':{
        'model':SVC(),
        'param':{
            'kernel':['rbf','linear','sigmoid'],
            'C': [0.1, 1, 10, 100]
         
        }
    }
}

In [161]:
#Iterating a for loop and passing all models in model_param dictionary through GridSearchCV class to get best model among them for this dataset
scores =[]
for model_name, mp in model_param.items():
    model_selection = GridSearchCV(estimator=mp['model'],param_grid=mp['param'],cv=5,return_train_score=False)
    model_selection.fit(X,y)
    scores.append({
        'model' : model_name,
        'best_score': model_selection.best_score_,
        'best_params': model_selection.best_params_
    })

In [162]:
#Creating a dataframe to list out accuracies of all models through which data is passed and SVC is giving highest accuracy, so we use it
df_model_score = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df_model_score

Unnamed: 0,model,best_score,best_params
0,DecisionTreeClassifier,0.780089,{'criterion': 'gini'}
1,KNeighborsClassifier,0.802492,{'n_neighbors': 5}
2,SVC,0.811481,"{'C': 100, 'kernel': 'rbf'}"


In [163]:
svc_model = SVC(C=100, kernel='rbf')

In [164]:
#Fitting data in SVC model
svc_model.fit(X,y)

SVC(C=100)

In [266]:
#Cleaning the test data set
df2=pd.read_csv('test.csv')

In [267]:
df2.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [268]:
#Saving the PassengerId column in variable s
df3=df2
s=df2['PassengerId']

In [269]:
#Dropping the unnecessary features
df3.drop(['PassengerId','Ticket','Cabin','Embarked','SibSp','Parch','Name'], axis=1, inplace=True)
s.head()

0    892
1    893
2    894
3    895
4    896
Name: PassengerId, dtype: int64

In [271]:
df3.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,34.5,7.8292
1,3,female,47.0,7.0
2,2,male,62.0,9.6875
3,3,male,27.0,8.6625
4,3,female,22.0,12.2875


In [272]:
#Counting missing values
df3.isnull().sum()

Pclass     0
Sex        0
Age       86
Fare       1
dtype: int64

In [273]:
#Describing Age column
df3['Age'].describe()

count    332.000000
mean      30.272590
std       14.181209
min        0.170000
25%       21.000000
50%       27.000000
75%       39.000000
max       76.000000
Name: Age, dtype: float64

In [274]:
#Filling missing values in column with its mean
df3['Age'].fillna(df3['Age'].mean(),inplace=True)
df3['Fare'].fillna(df3['Fare'].mean(),inplace=True)
df3.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,34.5,7.8292
1,3,female,47.0,7.0
2,2,male,62.0,9.6875
3,3,male,27.0,8.6625
4,3,female,22.0,12.2875


In [275]:
#Replacing the sex column with dummies in binary format and concatinating it and dropping sex column 
l_sex_dummies=pd.get_dummies(df3['Sex'],drop_first=True)
df3= pd.concat([df3,l_sex_dummies],axis=1)
df3.drop(['Sex'], axis=1, inplace=True )

In [276]:
df3.head()

Unnamed: 0,Pclass,Age,Fare,male
0,3,34.5,7.8292,1
1,3,47.0,7.0,0
2,2,62.0,9.6875,1
3,3,27.0,8.6625,1
4,3,22.0,12.2875,0


In [277]:
#Feature scaling Age and Fare columns for good results 
df3[feature_scale] = sts.fit_transform(df3[feature_scale])

In [278]:
df3.head()

Unnamed: 0,Pclass,Age,Fare,male
0,3,0.334993,-0.498407,1
1,3,1.32553,-0.513274,0
2,2,2.514175,-0.465088,1
3,3,-0.25933,-0.483466,1
4,3,-0.655545,-0.418471,0


In [280]:
#Storing predicted values in variable y_predicted
y_predicted=svc_model.predict(df3)

In [281]:
#Making a dataframe which holds passengerid and predicted values
submission=pd.DataFrame({
    "PassengerId" : s,
     "Survived": y_predicted
})

In [297]:
#Save this data frame in csv file
submission.to_csv('predictions.csv', index=False)

In [298]:
predicted = pd.read_csv('predictions.csv')

In [295]:
#Accuracy i got after submitting predictions.csv in kaggle is 76.794%