# **TITANIC DATASET**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the training and test CSVs from the current working directory.
dataset = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# Keep the test PassengerId column aside: clean() drops it from the frame,
# but the submission file built later still needs it.
test_id = test["PassengerId"]

In [3]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Preview the first rows of the raw training frame.
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
def clean(dataset):
  """Return a cleaned copy of a Titanic frame ready for encoding.

  Drops the "Ticket", "Name", "Cabin" and "PassengerId" columns, fills
  missing values in "SibSp", "Parch", "Fare" and "Age" with that column's
  median, and fills missing "Embarked" values with the placeholder "U".

  Parameters
  ----------
  dataset : pandas.DataFrame
      Frame containing at least the columns named above.

  Returns
  -------
  pandas.DataFrame
      A new frame; the frame passed in is left untouched because ``drop``
      returns a new object.
  """
  dataset = dataset.drop(["Ticket","Name","Cabin","PassengerId"],axis=1)

  numeric_cols = ["SibSp","Parch","Fare","Age"]
  for col in numeric_cols:
    # Assign the filled column back instead of calling
    # dataset[col].fillna(..., inplace=True): the chained in-place form acts
    # on a temporary object and is not guaranteed to update the frame (it
    # does nothing under pandas Copy-on-Write).
    dataset[col] = dataset[col].fillna(dataset[col].median())

  dataset["Embarked"] = dataset["Embarked"].fillna("U")
  return dataset

In [6]:
# Apply the same cleaning to the training and test frames; both names are
# rebound to the frames returned by clean().
dataset = clean(dataset)
test = clean(test)

In [7]:
# Preview the training frame after cleaning.
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [8]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
# Integer-encode the categorical columns. Each column gets its own freshly
# constructed encoder, fitted on the training frame and then reused on the
# test frame so both share the same label -> integer mapping.
# The loop variable is named `col` so it no longer overwrites the column list.
categorical_cols = ["Sex","Embarked"]
for col in categorical_cols:
  le = preprocessing.LabelEncoder()
  dataset[col] = le.fit_transform(dataset[col])
  # transform() raises ValueError if the test column holds a label that was
  # not present in the training column.
  test[col] = le.transform(test[col])

In [9]:
# Preview the training frame after label encoding.
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [10]:
# Separate the feature matrix from the target column.
x = dataset.drop(columns=["Survived"])
y = dataset["Survived"]

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for evaluation; random_state fixes the shuffle.
x_train ,x_test ,y_train ,y_test = train_test_split(x,y,test_size=0.2 , random_state=1)

In [12]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest (100 trees, depth capped at 5, fixed seed) on the training split.
ct = RandomForestClassifier(n_estimators = 100 , max_depth = 5 , random_state =1)
ct.fit(x_train , y_train)

RandomForestClassifier(max_depth=5, random_state=1)

In [13]:
# Predict on the held-out split; despite the variable name, these are
# predictions for x_test, not for the rows the model was trained on.
training_prediction = ct.predict(x_test)

In [14]:
from sklearn.metrics import accuracy_score
# Accuracy of the forest on the held-out split (y_test vs. predictions for x_test).
acc = accuracy_score(y_test,training_prediction)
print(acc)

0.7821229050279329


In [15]:
# Predict survival for the cleaned and encoded competition test frame.
submission_pred = ct.predict(test)

In [16]:
    # Pair each saved test PassengerId with its predicted label.
    submission_columns = {"PassengerId": test_id.values, "Survived": submission_pred}
    df = pd.DataFrame(submission_columns)

In [17]:
# Write the submission file without the DataFrame index column.
df.to_csv("Submission.csv", index=False)

## **Another way: random forest on Pclass, Sex, Parch and SibSp**

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [19]:
# Reload the raw CSVs so this section starts from unmodified data.
data = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# Passenger ids for the submission file.
test_id = test["PassengerId"]

In [20]:
# Dump the raw test frame as plain text.
print(test)

     PassengerId  Pclass                                          Name  \
0            892       3                              Kelly, Mr. James   
1            893       3              Wilkes, Mrs. James (Ellen Needs)   
2            894       2                     Myles, Mr. Thomas Francis   
3            895       3                              Wirz, Mr. Albert   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
..           ...     ...                                           ...   
413         1305       3                            Spector, Mr. Woolf   
414         1306       1                  Oliva y Ocana, Dona. Fermina   
415         1307       3                  Saether, Mr. Simon Sivertsen   
416         1308       3                           Ware, Mr. Frederick   
417         1309       3                      Peter, Master. Michael J   

        Sex   Age  SibSp  Parch              Ticket      Fare Cabin Embarked  
0      male  34.5      0      0 

In [21]:
# Preview the first rows of the raw training frame.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [22]:
# Survival labels of the female passengers, selected in a single .loc call.
women = data.loc[data["Sex"] == "female", "Survived"]
# Fraction (0-1) of those passengers with Survived == 1.
rate_women = sum(women) / len(women)

print("% of women who survived = ", rate_women)

% of women who survived =  0.7420382165605095


In [23]:
# Survival labels of the male passengers, selected in a single .loc call.
men = data.loc[data["Sex"] == "male", "Survived"]
# Fraction (0-1) of those passengers with Survived == 1.
rate_men = sum(men) / len(men)

print("% of men who survived = ", rate_men)

% of men who survived =  0.18890814558058924


In [24]:
from sklearn.ensemble import RandomForestClassifier

# Target and the four raw features used by this model.
y = data["Survived"]
features = ["Pclass","Sex","Parch","SibSp"]

# One-hot encode the non-numeric feature(s) of the training frame.
x = pd.get_dummies(data[features])
# Encode the test frame the same way, then force it onto exactly the training
# columns (same set, same order). The two frames are encoded independently,
# so a category present in only one of them would otherwise give the model a
# differently shaped input at predict time.
x_test = pd.get_dummies(test[features]).reindex(columns=x.columns, fill_value=0)

# Random forest: 100 trees, depth capped at 5, fixed seed for reproducibility.
model = RandomForestClassifier(n_estimators = 100, max_depth = 5 , random_state =1)
model.fit(x,y)


RandomForestClassifier(max_depth=5, random_state=1)

In [25]:
# Dump the one-hot encoded training features as plain text.
print(x)

     Pclass  Parch  SibSp  Sex_female  Sex_male
0         3      0      1           0         1
1         1      0      1           1         0
2         3      0      0           1         0
3         1      0      1           1         0
4         3      0      0           0         1
..      ...    ...    ...         ...       ...
886       2      0      0           0         1
887       1      0      0           1         0
888       3      2      1           1         0
889       1      0      0           0         1
890       3      0      0           0         1

[891 rows x 5 columns]


In [26]:
# Predict survival for the one-hot encoded test features.
predictions = model.predict(x_test)

In [27]:
# Pair each saved test PassengerId with its predicted label.
submission_columns = {"PassengerId": test_id, "Survived": predictions}
df = pd.DataFrame(submission_columns)

In [28]:
# Write the submission file without the DataFrame index column.
df.to_csv("Submission3.csv", index = False)

## **Another sample: decision tree on Age only**

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [30]:
# Reload the raw CSVs so this section starts from unmodified data.
data = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# Passenger ids for the submission file.
test_id = test["PassengerId"]

In [31]:
# Count the missing Age values with a vectorised sum (shown as a plain int).
int(data["Age"].isnull().sum())

177

In [32]:
def clean(dataset):
  """Fill missing "Age" values with the column median.

  No columns are dropped and no other column is touched.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Frame containing an "Age" column.

  Returns
  -------
  pandas.DataFrame
      The same frame object that was passed in, with "Age" filled. The
      input is modified, so ``data = clean(data)`` and a bare
      ``clean(data)`` have the same effect.
  """
  age_like_cols = ["Age"]
  for col in age_like_cols:
    # Assign the filled column back instead of calling
    # dataset[col].fillna(..., inplace=True): the chained in-place form acts
    # on a temporary object and is not guaranteed to update the frame (it
    # does nothing under pandas Copy-on-Write).
    dataset[col] = dataset[col].fillna(dataset[col].median())

  return dataset

In [33]:
# Fill the missing ages in the training frame.
data = clean(data)

In [34]:
# Re-count the missing Age values after cleaning (shown as a plain int).
int(data["Age"].isnull().sum())

0

In [35]:
# Row count of the feature column.
len(data["Age"])

891

In [36]:
# Row count of the target column, for comparison with the Age count above.
len(data["Survived"])

891

In [37]:
# Features stay two-dimensional (a one-column DataFrame), as estimators expect.
x_train = data[["Age"]]
# The target is selected as a 1-D Series: passing a one-column DataFrame as y
# makes scikit-learn emit a DataConversionWarning asking for a 1-D array.
y_train = data["Survived"]

In [38]:
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree with default hyperparameters on x_train / y_train.
# NOTE(review): no random_state is passed here, unlike the
# RandomForestClassifier cells earlier in this notebook. Confirm whether a
# fixed seed is wanted for reproducibility.
dtc = DecisionTreeClassifier()
dtc.fit(x_train,y_train)

DecisionTreeClassifier()

In [39]:
# Fill missing ages in the test frame, using the test frame's own median Age.
test = clean(test)

In [40]:
# Re-read the passenger ids; this section's clean() keeps the PassengerId column.
test_id = test["PassengerId"]

In [41]:
# Single-feature test matrix, using the same column as x_train.
x_test = test[["Age"]]

In [42]:
# Predict survival for the test passengers from Age alone and print the raw label array.
prediction = dtc.predict(x_test)
print(prediction)

[0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 1
 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0
 1 0 1 0 0 0 1 0 1 1 1 1 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0
 1 0 0 1 0 1 1 0 0 1 1 1 0 1 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 1 1 0 1 0 1 0 0 0 0 0 0 0 1 1
 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 1 1 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0
 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 1 0 1 0 0 1 1 0 0
 0 0 0 0 1 0 1 1 1 1 0 0 1 0 1 1 0 0 0 0 0 0 1 1 0 0 0 1 0 1 1 1 0 1 1 0 0
 1 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1
 0 1 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 1 0 1 1 0 0 0
 0 0 1 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
 0 1 1 1 0 0 1 0 0 1 1]


In [43]:
# Number of predictions produced.
len(prediction)

418

In [44]:
# Pair each test PassengerId with its predicted label.
submission_columns = {"PassengerId": test_id, "Survived": prediction}
df = pd.DataFrame(submission_columns)

In [45]:
# Write the submission file without the DataFrame index column.
df.to_csv("Submission4.csv",index=False)