In [68]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

In [34]:
dataset = "train.csv"
data = pd.read_csv(dataset)
#printing the head
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [35]:
#check the data
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [36]:
#describe the data
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [37]:
#Name, cabin, fare , ticket no, passenger id not req
col_not_req = ["Name", "Fare", "PassengerId", "Ticket", "Cabin", "Survived"]

target = data["Survived"]

#droping the column
X = data.drop(labels = col_not_req, axis = 1)



In [38]:
#finding the NAN values & storing resulting values in the new dataframe 
missing_data = X.isnull()
missing_data.head(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False


In [39]:
#count missing values in each columns
#value_counts returns the no of unique in list & applying the loop
for column in missing_data.columns.values.tolist():
    print(column)
    print(missing_data[column].value_counts())
    print("  ")

Pclass
False    891
Name: Pclass, dtype: int64
  
Sex
False    891
Name: Sex, dtype: int64
  
Age
False    714
True     177
Name: Age, dtype: int64
  
SibSp
False    891
Name: SibSp, dtype: int64
  
Parch
False    891
Name: Parch, dtype: int64
  
Embarked
False    891
Name: Embarked, dtype: int64
  


In [40]:
#Replacing the missing data with mean
col_list = ["Age"]

#making loop for easy operation
for column in col_list:
    
    #taking average of values from column
    average = X[column].mean()
    
    #replacing the missing values with the average
    X[column].replace(np.nan, average, inplace = True)

In [41]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,male,22.0,1,0,S
1,1,female,38.0,1,0,C
2,3,female,26.0,0,0,S
3,1,female,35.0,1,0,S
4,3,male,35.0,0,0,S


In [42]:
#Counting values in Embarked
print(X["Embarked"].value_counts())

S    646
C    168
Q     77
Name: Embarked, dtype: int64


In [43]:
#rechecking is any missing value present or not
R_missing = X.isnull()

for col in R_missing.columns.values.tolist():
    print(col)
    print(R_missing[col].value_counts())
    print("  ")

Pclass
False    891
Name: Pclass, dtype: int64
  
Sex
False    891
Name: Sex, dtype: int64
  
Age
False    891
Name: Age, dtype: int64
  
SibSp
False    891
Name: SibSp, dtype: int64
  
Parch
False    891
Name: Parch, dtype: int64
  
Embarked
False    891
Name: Embarked, dtype: int64
  


In [44]:
#one hot encoding for SEX & EMBARKED
#getting the dummy variable
dummy_1 = pd.get_dummies(X["Sex"])
dummy_2 = pd.get_dummies(X["Embarked"])
dummy_1.head()
dummy_2.head()

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [45]:
#Replace column name for clarity
dummy_1.rename(columns={'Sex':'Female', 'Sex':'male'}, inplace=True)
dummy_1.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [46]:
#Replace column name for clarity
dummy_2.rename(columns={'Embarked':'C', 'Embarked':'Q','Embarked':'S' }, inplace=True)
dummy_2.head()

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [47]:
#conacting both the list to X
# merge data frame "df" and "dummy_variable_1" 
X = pd.concat([X, dummy_1], axis=1)

# drop original column "Sex" from "X"
X.drop("Sex", axis = 1, inplace=True)

#for embarked
# merge data frame "X" and "dummy_variable_1" 
X = pd.concat([X, dummy_2], axis=1)

# drop original column "Embarked" from "X"
X.drop("Embarked", axis = 1, inplace=True)

X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,female,male,C,Q,S
0,3,22.0,1,0,0,1,0,0,1
1,1,38.0,1,0,1,0,1,0,0
2,3,26.0,0,0,1,0,0,0,1
3,1,35.0,1,0,1,0,0,0,1
4,3,35.0,0,0,0,1,0,0,1


In [48]:
#data is not normalized
#using simple feature scaling
#list = X.columns.values.tolist()
for column in X.columns.values.tolist():
    X[column] = X[column]/X[column].max()
    
X.head(5)    

Unnamed: 0,Pclass,Age,SibSp,Parch,female,male,C,Q,S
0,1.0,0.275,0.125,0.0,0.0,1.0,0.0,0.0,1.0
1,0.333333,0.475,0.125,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,0.325,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.333333,0.4375,0.125,0.0,1.0,0.0,0.0,0.0,1.0
4,1.0,0.4375,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [49]:
#hurray! Now data is ready !
#Splitting data into training & testing set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, target, test_size = 0.2)

In [50]:



#using decision tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

depth = []

for i in range (2, 20):
    #it will put value of depth from 2, 19
    clf = DecisionTreeClassifier(max_depth = i)
    scores = cross_val_score(estimator = clf, X = X_train, y = y_train, cv =5, n_jobs =4)
    depth.append((i, scores.mean()))


In [51]:
depth

[(2, 0.7711119866049443),
 (3, 0.7780655963754555),
 (4, 0.8118290160543682),
 (5, 0.8019600118191669),
 (6, 0.8047867625332416),
 (7, 0.7977937555402345),
 (8, 0.7865556978233034),
 (9, 0.7865556978233035),
 (10, 0.7725105880035457),
 (11, 0.7795134443021766),
 (12, 0.7795429922190485),
 (13, 0.7879641485275288),
 (14, 0.7795528415246725),
 (15, 0.778154240126071),
 (16, 0.7879346006106569),
 (17, 0.7823401950162514),
 (18, 0.7837289471092288),
 (19, 0.7795429922190485)]

In [52]:
#for i = 3, score is max hence max_depth = 4
clf = DecisionTreeClassifier(max_depth = 4)
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [53]:
#now predicting values
prediction = clf.predict(X_test)

from sklearn.metrics import confusion_matrix

In [54]:
#printing
confusion_matrix(y_test, prediction)

array([[107,   7],
       [ 26,  39]], dtype=int64)

In [69]:
#using random forest classifier
from sklearn.ensemble import RandomForestClassifier


In [None]:
estimator = []

for est in range(2, 20):
    for j in range (2, 20):
        clf_e = RandomForestClassifier(n_estimators = est, max_depth = j)
        scores_e = cross_val_score(estimator = clf_e, X = X_train, y = y_train, cv =5, n_jobs =4)
        estimator.append((est, j, scores_e.mean()))

In [None]:
estimator


In [55]:
#importing test data
dataset_test = "test.csv"
data_test = pd.read_csv(dataset_test)
col_not_req_t = ["Name", "Fare", "PassengerId", "Ticket", "Cabin"]
Test = data_test.drop(labels = col_not_req_t, axis = 1)


In [56]:
missing_datat = Test.isnull()
missing_datat.head(5)

for column in missing_datat.columns.values.tolist():
    print(column)
    print(missing_datat[column].value_counts())
    print("  ")

Pclass
False    418
Name: Pclass, dtype: int64
  
Sex
False    418
Name: Sex, dtype: int64
  
Age
False    332
True      86
Name: Age, dtype: int64
  
SibSp
False    418
Name: SibSp, dtype: int64
  
Parch
False    418
Name: Parch, dtype: int64
  
Embarked
False    418
Name: Embarked, dtype: int64
  


In [57]:
#taking average of values from column
averaget = Test["Age"].mean()
    
#replacing the missing values with the average
Test["Age"].replace(np.nan, averaget, inplace = True)

In [58]:
dummy_t1 = pd.get_dummies(Test["Sex"])
dummy_t2 = pd.get_dummies(Test["Embarked"])
dummy_t1.rename(columns={'Sex':'Female', 'Sex':'male'}, inplace=True)
dummy_t2.rename(columns={'Embarked':'C', 'Embarked':'Q','Embarked':'S' }, inplace=True)

In [59]:
#conacting both the list to X
# merge data frame "df" and "dummy_variable_1" 
Test = pd.concat([Test, dummy_t1], axis=1)

# drop original column "Sex" from "X"
Test.drop("Sex", axis = 1, inplace=True)

#for embarked
# merge data frame "X" and "dummy_variable_1" 
Test = pd.concat([Test, dummy_t2], axis=1)

# drop original column "Embarked" from "X"
Test.drop("Embarked", axis = 1, inplace=True)
Test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,female,male,C,Q,S
0,3,34.5,0,0,0,1,0,1,0
1,3,47.0,1,0,1,0,0,0,1
2,2,62.0,0,0,0,1,0,1,0
3,3,27.0,0,0,0,1,0,0,1
4,3,22.0,1,1,1,0,0,0,1


In [60]:
#doing simple scaling 
for column in Test.columns.values.tolist():
    Test[column] = Test[column]/Test[column].max()
    
Test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,female,male,C,Q,S
0,1.0,0.453947,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,1.0,0.618421,0.125,0.0,1.0,0.0,0.0,0.0,1.0
2,0.666667,0.815789,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1.0,0.355263,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1.0,0.289474,0.125,0.111111,1.0,0.0,0.0,0.0,1.0


In [61]:
#predicting the data
pred = clf.predict(Test)

In [63]:
#saving it into a dataframe
pred_

In [66]:
df = pd.DataFrame( index = pred)

In [67]:
df.to_csv("Submission.csv")