In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors  import KNeighborsClassifier

# **Reading the training and the testing data**

In [6]:
# Load both Titanic CSV files, taking the first row as the header.
# `train` holds the labelled rows; `pred` holds the rows to be scored later.
train, pred = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ("../input/titanic/train.csv", "../input/titanic/test.csv")
)

# **Displaying the first 5 rows of the training and testing data**

In [7]:
# Preview the first rows of the training frame to check the parsed columns.
train.head()

In [8]:
# Preview the first rows of the frame that will be scored.
pred.head()

In [9]:
# Row counts of both frames (len() of a DataFrame is its number of rows).
print(
    "Number of rows in Train dataset: ", len(train),
    " Number of rows in Test dataset: ", len(pred),
)

In [10]:
# Number of columns in train and test; each count is read from its own frame.
print("Number of columns in Train dataset: ", train.shape[1], " Number of columns in Test dataset: ", pred.shape[1])

In [11]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [12]:
# Describe the training frame; include="all" also summarises the non-numeric columns.
train.describe(include="all")

# **Cleaning the dataset:**
1.     Removing the Null values
2.     Dropping the unwanted columns
3.     Adding New columns if needed
4.     Formatting the columns

In [16]:
# Null statistics per column.
def percentagenull(df):
    """Return the null count and null percentage of every column of ``df``.

    The result is a DataFrame indexed by column name with two columns,
    'the Count' and 'Percentage of null'. Each series is sorted in
    descending order before the two are joined.
    """
    null_mask = df.isna()
    null_counts = null_mask.sum()
    null_share = (null_counts / null_mask.count()) * 100
    ordered_counts = null_counts.sort_values(ascending=False)
    ordered_share = null_share.sort_values(ascending=False)
    return pd.concat(
        [ordered_counts, ordered_share],
        axis=1,
        keys=['the Count', 'Percentage of null'],
    )
    

In [17]:
# Null count and percentage for every column of the training frame.
percentagenull(train)

In [18]:
# Null count and percentage for every column of the frame to be scored.
percentagenull(pred)

In [19]:
# Cabin is removed from both frames: a large share of its values is null, and the
# author treats the cabin as recoverable from the fare and the passenger class.
train.drop(columns='Cabin', inplace=True)
pred.drop(columns='Cabin', inplace=True)

In [20]:
# Fill the missing ages with the mean age. The result is assigned back rather than
# calling replace(..., inplace=True) on the column selection: under pandas
# Copy-on-Write that in-place call only changes a temporary and leaves `train` as is.
train["Age"] = train["Age"].fillna(train["Age"].mean())
# Should now be 0.
train["Age"].isna().sum()

In [21]:
# Fill the missing embarkation ports with the most frequent port. Assigning the
# result back keeps the fill effective under pandas Copy-on-Write, where
# replace(..., inplace=True) on a column selection would not modify `train`.
train["Embarked"] = train["Embarked"].fillna(train["Embarked"].mode()[0])
# Should now be 0.
train["Embarked"].isna().sum()

In [22]:
# Same fills for the frame to be scored: mode for Embarked, mean for Age and Fare.
# Each result is assigned back because replace(..., inplace=True) on a column
# selection does not modify `pred` under pandas Copy-on-Write.
pred["Embarked"] = pred["Embarked"].fillna(pred["Embarked"].mode()[0])
pred["Age"] = pred["Age"].fillna(pred["Age"].mean())
pred["Fare"] = pred["Fare"].fillna(pred["Fare"].mean())

In [23]:
# Remaining null count per column of pred after the fills.
pred.isna().sum()

In [24]:
# Ticket and Name are removed from both frames: the author finds no recurring
# pattern in either column.
train.drop(columns=['Ticket', 'Name'], inplace=True)
pred.drop(columns=['Ticket', 'Name'], inplace=True)


# **Simple visualization of Sex and the Survival count**

In [25]:
# Count passengers for every (Sex, Survived) pair so that each sex gets one bar
# per outcome. Grouping by Survived alone plotted the Survived value itself next
# to a single count, which did not match the "Sex" axis label.
dd = train.groupby(['Sex', 'Survived']).size().unstack(fill_value=0)
dd.plot(kind='bar', figsize=(10, 5))
plt.xlabel("Sex")
plt.ylabel("Number of passengers")
plt.legend(title="Survived")
plt.show()


In [26]:
# One-hot encode Sex and Embarked. The columns are named explicitly so the
# encoding does not depend on which columns happen to have object dtype and each
# prefix is guaranteed to line up with its column.
train = pd.get_dummies(train, columns=["Sex", "Embarked"], prefix=["Sex", "Embarked"])
pred = pd.get_dummies(pred, columns=["Sex", "Embarked"], prefix=["Sex", "Embarked"])


# **Adding a new column Relatives**

In [28]:
# Combine SibSp and Parch into a single Relatives column (the callers drop the two
# source columns afterwards).
def merges(df):
    """Return a copy of ``df`` with a ``Relatives`` column equal to SibSp + Parch.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``SibSp`` and ``Parch`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows and index as ``df`` and ``Relatives``
        appended as the last column. ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``SibSp`` or ``Parch`` is missing.
    """
    # The sum is computed on the frame's own index, so rows stay aligned even when
    # the index is not 0..n-1 (e.g. after filtering rows).
    return df.assign(Relatives=df["SibSp"] + df["Parch"])

In [29]:
# Add Relatives to the training frame, then discard the two columns it was built from.
train = merges(train).drop(columns=['SibSp', 'Parch'])

In [30]:
# Check the training frame after adding Relatives and dropping SibSp/Parch.
train.head()

In [31]:
# Same transformation for the frame to be scored, followed by a quick preview.
pred = merges(pred).drop(columns=['SibSp', 'Parch'])
pred.head()

# **Feature Importance:**
1.     Using the ExtraTreesClassifier feature importance
2.     Using a heatmap to detect the correlation between features

In [32]:
# Target vector first, then every remaining column as the feature matrix.
y = train['Survived']
x = train.drop(columns='Survived')

In [33]:
# Fit a randomized tree ensemble to rank the features. The seed is fixed because
# the ensemble is stochastic: without it the importances change on every run.
model = ExtraTreesClassifier(random_state=1)
model.fit(x, y)
print(model.feature_importances_)

In [34]:
# Label each importance with its feature name and draw them as horizontal bars.
fi = pd.Series(data=model.feature_importances_, index=x.columns)
fi.plot.barh()

In [35]:
# Annotated heatmap of the pairwise correlations between the columns of train.
plt.figure(figsize=(40, 15))
sns.heatmap(data=train.corr(), annot=True)

In [36]:
# Scatter plus fitted regression line of Survived against PassengerId.
# x and y are passed by keyword (as column names of `data`) because current
# seaborn releases no longer accept them positionally.
sns.regplot(x='PassengerId', y='Survived', data=train)

In [37]:
# Dropping PassengerId, as its correlation with Survived is near zero.
# `x` was already split off from `train`, and `pred` must keep the same columns as
# `x`, so the column is removed from all three; dropping it from `train` alone
# would leave it among the features the models are trained on.
train.drop('PassengerId', axis=1, inplace=True)
x = x.drop(columns='PassengerId')
pred = pred.drop(columns='PassengerId')

# **Preprocessing the data**

In [43]:
# Hold out 20% of the rows for evaluation (an integer test_size=20 keeps only 20 rows).
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
# Learn the scaling from the training split only and reuse it for the other sets:
# a scaler re-fitted on X_test or pred would put them on a different scale from
# the one the models are trained on.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
pred = scaler.transform(pred)

# **Logistic Regression:**

In [41]:
# Grid search over penalty type, tolerance and regularisation strength.
# The saga solver is used because it supports l1, l2 and elasticnet; the default
# lbfgs solver rejects l1 and elasticnet, so those candidates could never be fitted.
lr = LogisticRegression(solver="saga", max_iter=5000, random_state=1)
tolerances = (0.1, 0.01, 0.001, 0.0001)
strengths = (10.0, 1.0, 0.1, 0.01)
params = [
    {"penalty": ("l1", "l2"), "tol": tolerances, "C": strengths},
    # elasticnet additionally needs l1_ratio, the mix between the l1 and l2 terms.
    {"penalty": ("elasticnet",), "l1_ratio": (0.25, 0.5, 0.75), "tol": tolerances, "C": strengths},
]
modelLR = GridSearchCV(lr, params, cv=10)
modelLR.fit(X_train, y_train)

In [42]:
# Held-out accuracy of the tuned logistic regression. Arguments are in
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
print(accuracy_score(y_test, modelLR.predict(X_test)))

# **SVM:**

In [45]:
# Support vector classifier with an RBF kernel, fitted on the training split.
modelSVM = svm.SVC(kernel='rbf').fit(X_train, y_train)
modelSVM

In [46]:
# Held-out accuracy of the SVM. Arguments are in (y_true, y_pred) order;
# accuracy is symmetric, so the value is unchanged.
print(accuracy_score(y_test, modelSVM.predict(X_test)))

# **Decision Tree Classifier**

In [48]:
# Entropy-based decision tree. The seed is fixed so that ties between equally good
# splits are broken the same way on every run and the reported score is reproducible.
modelDTC = DecisionTreeClassifier(criterion="entropy", random_state=1)
modelDTC.fit(X_train, y_train)

In [49]:
# Held-out accuracy of the decision tree. Arguments are in (y_true, y_pred) order;
# accuracy is symmetric, so the value is unchanged.
print(accuracy_score(y_test, modelDTC.predict(X_test)))

# **KNeighborsClassifier**

In [50]:
# k-nearest-neighbours classifier voting over the 4 closest training rows.
n = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)
n

In [51]:
# Held-out accuracy of the k-nearest-neighbours model. Arguments are in
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
print(accuracy_score(y_test, n.predict(X_test)))

In [63]:
# Collect the held-out accuracy of every fitted model in one table,
# shown from the lowest score to the highest.
fitted_models = {
    "Logistic Regression:": modelLR,
    "SVM": modelSVM,
    "Decision Tree Classifier": modelDTC,
    "KNeighborsClassifier": n,
}
report = pd.DataFrame({
    "Model": list(fitted_models),
    "Accuracy score": [
        accuracy_score(estimator.predict(X_test), y_test)
        for estimator in fitted_models.values()
    ],
})
report.sort_values(by="Accuracy score")

# **Prediction using Logistic Regression:**

In [64]:
# Score the prepared `pred` matrix with the tuned logistic regression
# (`clf` is not defined anywhere in the notebook; the fitted model is `modelLR`).
prediction = modelLR.predict(pred)

In [65]:
# Show the predicted labels for the rows of pred.
prediction

In [67]:
# Re-read the raw test file for the passenger ids: by this point `pred` has been
# rebound to the scaled feature array.
pred1 = pd.read_csv('../input/titanic/test.csv')
# The id column keeps the exact spelling used by the input file ("PassengerId").
sub = pd.DataFrame({'PassengerId': pred1["PassengerId"], 'Survived': prediction})

In [68]:
# Display the submission frame before it is written out.
sub

In [69]:
# Write the submission file without the DataFrame index column.
sub.to_csv(path_or_buf="submission.csv", index=False)

# **THANK YOU SO MUCH FOR VIEWING MY NOTEBOOK. YOUR FEEDBACK IS VERY IMPORTANT FOR MY IMPROVEMENT!**