In [79]:
import pandas as pd
import numpy as np

In [80]:
# Load the Titanic passenger data from CSV into a DataFrame.
# NOTE(review): '/content/...' is an absolute path (looks like a Colab location - confirm);
# the file must exist there for the notebook to run.
df=pd.read_csv('/content/titanic.csv')

In [81]:
# (rows, columns) of the loaded frame.
df.shape

(891, 12)

In [82]:
# List the column labels available in the loaded data.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [83]:
# Per-column dtype and non-null count; shows which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


***Dropping Columns That Aren't Useful***

In [84]:
#2. Drop columns that aren't useful
# Name, Ticket and Cabin are removed by label; the remaining columns are kept as they are.
cols = ['Name', 'Ticket', 'Cabin']
df = df.drop(columns=cols)

In [85]:
# Re-check the schema now that Name, Ticket and Cabin have been removed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 62.8+ KB


***Dropping Rows with Missing Values***

In [86]:
#3 Drop Rows with Missing Values (intentionally left disabled)
# dropna() removes rows or columns that contain missing values (NaN) from a DataFrame.
# The call below stays commented out so that no rows are discarded at this step.
# df=df.dropna()

THE PROBLEM WITH DROPPING ROWS -
If we drop every row that has a missing value, the data set shrinks from 891 rows to 712, which means we are wasting data. Machine learning models need data to train and perform well. So, let’s preserve the data and make use of it as much as we can: instead of dropping rows, the missing Age values are filled in further below.

In [87]:
#4. Creating Dummy Variables
# Build one block of indicator columns per categorical column
# (one indicator column for every distinct value found in that column).
cols = ['Pclass', 'Sex', 'Embarked']
dummies = [pd.get_dummies(df[name]) for name in cols]

# Put the per-column indicator blocks side by side in a single frame.
titanic_dummies = pd.concat(dummies, axis=1)

In [88]:
# Append the indicator columns to the right of the existing columns (column-wise concatenation).
df = pd.concat([df, titanic_dummies], axis=1)

In [89]:
# Pclass, Sex and Embarked are now represented by their indicator columns,
# so the original (redundant) columns are removed from the frame.
df = df.drop(columns=['Pclass', 'Sex', 'Embarked'])


In [90]:
# Verify that the indicator columns were added and that Pclass, Sex and Embarked are gone.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Age          714 non-null    float64
 3   SibSp        891 non-null    int64  
 4   Parch        891 non-null    int64  
 5   Fare         891 non-null    float64
 6   1            891 non-null    uint8  
 7   2            891 non-null    uint8  
 8   3            891 non-null    uint8  
 9   female       891 non-null    uint8  
 10  male         891 non-null    uint8  
 11  C            891 non-null    uint8  
 12  Q            891 non-null    uint8  
 13  S            891 non-null    uint8  
dtypes: float64(2), int64(4), uint8(8)
memory usage: 48.9 KB


In [91]:
#5. Take Care of Missing Data
# Age is the only remaining column with missing values. They are filled with
# pandas' interpolate(), which replaces each NaN with a value interpolated
# linearly between the neighbouring non-missing entries of the column.
# NOTE(review): a median fill would be an alternative; the row order used for
# interpolation does not obviously carry meaning for age - confirm the choice.
df = df.assign(Age=df['Age'].interpolate(method='linear'))


In [92]:
# Confirm that Age no longer has missing values after the fill.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Age          891 non-null    float64
 3   SibSp        891 non-null    int64  
 4   Parch        891 non-null    int64  
 5   Fare         891 non-null    float64
 6   1            891 non-null    uint8  
 7   2            891 non-null    uint8  
 8   3            891 non-null    uint8  
 9   female       891 non-null    uint8  
 10  male         891 non-null    uint8  
 11  C            891 non-null    uint8  
 12  Q            891 non-null    uint8  
 13  S            891 non-null    uint8  
dtypes: float64(2), int64(4), uint8(8)
memory usage: 48.9 KB


In [93]:
#6. Convert the Data Frame to NumPy
# X starts out as every column of df (14 of them, Survived still included at this point);
# y is the model output: the Survived column on its own.
X = df.to_numpy()
y = df['Survived'].to_numpy()

In [94]:
# X still contains the Survived target, which must not be used as a feature.
# Rebuild X from df with that column dropped by name rather than deleting
# position 1 from the existing array: the result is the same on a fresh run,
# but re-running this cell can no longer strip an additional feature column,
# and the step no longer depends on Survived sitting at index 1 (the second column).
# NOTE(review): PassengerId is still part of X; it looks like a plain row
# identifier - consider whether it should be a feature at all.
X = df.drop(columns=['Survived']).values

In [95]:
# Display the prepared frame (the notebook elides the middle rows in its output).
df

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,1,2,3,female,male,C,Q,S
0,1,0,22.0,1,0,7.2500,0,0,1,0,1,0,0,1
1,2,1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0
2,3,1,26.0,0,0,7.9250,0,0,1,1,0,0,0,1
3,4,1,35.0,1,0,53.1000,1,0,0,1,0,0,0,1
4,5,0,35.0,0,0,8.0500,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,27.0,0,0,13.0000,0,1,0,0,1,0,0,1
887,888,1,19.0,0,0,30.0000,1,0,0,1,0,0,0,1
888,889,0,22.5,1,2,23.4500,0,0,1,1,0,0,0,1
889,890,1,26.0,0,0,30.0000,1,0,0,0,1,1,0,0


***Dividing the DataSet into Training Data and Test Data***

In [96]:
#7. Divide the Data Set Into Training Data and Test Data
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [97]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Baseline model: a single decision tree trained on the training split.
# random_state is pinned because the tree builder permutes features randomly
# at each split; without a seed the printed accuracy changes from run to run.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Score the predictions for the held-out rows against the true labels.
p = model.predict(X_test)
score = accuracy_score(y_test, p)
print(score)

0.7425373134328358


In [98]:
from sklearn.ensemble import RandomForestClassifier

# Second model: a forest of 100 trees with a fixed seed, trained and scored
# on the same split as the previous model.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# fit() returns the fitted estimator, so prediction can be chained onto it.
p = model.fit(X_train, y_train).predict(X_test)
score = accuracy_score(y_test, p)
print(score)

0.8134328358208955
