In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import warnings

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)



# Read data

In [None]:
# Load the training CSV into a DataFrame and preview its first rows.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.head()  

In [None]:
# Load the test CSV into a DataFrame and preview its first rows.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.head()

In [None]:
from matplotlib import pyplot as plt

# Bar chart of how many rows carry each value of the Survived column.
# The column is passed by keyword (x=..., data=...): seaborn no longer accepts
# the plotted variable as the first positional argument.
# A single Axes is used so the figure is not half empty.
fig, ax = plt.subplots(figsize=(7.5, 5))
sns.countplot(x="Survived", data=train_data, ax=ax)
ax.set_title('Number of passenger Survived');

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
# Non-numeric columns (such as Name or Ticket, still present at this point) are
# excluded explicitly: DataFrame.corr() raises on them in pandas >= 2.0.
numeric_train = train_data.select_dtypes(include="number")
sns.heatmap(numeric_train.corr(), annot=True);

# Data cleaning

In [None]:
# Count the missing (NaN) values in each column of the training frame.
train_data.isna().sum()

In [None]:
# Count the missing (NaN) values in each column of the test frame.
test_data.isna().sum()

In [None]:
# Drop the columns that are not used as features and impute the missing values.
# Frames are rebound instead of mutated with inplace=True: chained
# `df['col'].fillna(..., inplace=True)` is deprecated and does not update the
# parent frame under pandas copy-on-write.
DROPPED_COLUMNS = ["Name", "Cabin", "Ticket"]

# errors="ignore" keeps the cell re-runnable once the columns are already gone.
train_data = train_data.drop(columns=DROPPED_COLUMNS, errors="ignore")
test_data = test_data.drop(columns=DROPPED_COLUMNS, errors="ignore")

# Imputation values are derived from the training frame only and reused for the
# test frame, so no statistic of the test data leaks into preprocessing.
# NOTE(review): "Q" is a hard-coded fill for Embarked; confirm it is the intended
# choice rather than the most frequent port.
fill_values = {
    "Age": train_data["Age"].mean(),
    "Fare": train_data["Fare"].mean(),
    "Embarked": "Q",
}

train_data = train_data.fillna(value=fill_values)
test_data = test_data.fillna(value=fill_values)

In [None]:
# Summarise the training frame (columns, non-null counts, dtypes) after the cleaning cell above.
train_data.info()

In [None]:
# Summarise the test frame (columns, non-null counts, dtypes) after the cleaning cell above.
test_data.info()

In [None]:
# Preview the first rows of the test frame as it stands after the cleaning cell above.
test_data.head()

In [None]:
# Preview the first rows of the training frame as it stands after the cleaning cell above.
train_data.head()

# Train and Test data split

In [None]:
# Target vector: the Survived column of the training frame.
y_train = train_data["Survived"]

# Feature matrix: every other column, with string columns one-hot encoded.
# NOTE(review): PassengerId is still among the features; confirm whether an
# identifier column should be fed to the models.
x_train = pd.get_dummies(train_data.drop(columns="Survived"))

# Encode the test frame, then force it onto exactly the training columns (same
# set, same order). Encoding the two frames independently can yield different
# dummy columns if a category appears in only one of them; columns missing from
# the test frame are filled with 0 and test-only columns are dropped.
x_test = pd.get_dummies(test_data).reindex(columns=x_train.columns, fill_value=0)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation (fixed seed for repeatability).
# The target is read from train_data rather than from y_train because this cell
# rebinds y_train: on a second run the original expression would pair the full
# x_train with an already-split y_train and fail on mismatched lengths.
# Note that X_test / y_test are the held-out labelled rows, distinct from the
# lower-case x_test built from the test CSV.
X_train, X_test, y_train, y_test = train_test_split(
    x_train, train_data["Survived"], test_size=0.2, random_state=2
)

# Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split (fit() returns the estimator).
linreg = LinearRegression().fit(X_train, y_train)

# For a regressor, score() is the coefficient of determination (R-squared).
r2_train = linreg.score(X_train, y_train)
r2_test = linreg.score(X_test, y_test)
print(f"R-Squared for Train set: {r2_train:.3f}")
print(f"R-Squared for test set: {r2_test:.3f}")

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with weak regularisation (C=50) and a generous iteration cap.
logreg = LogisticRegression(max_iter=10000, C=50)
logreg.fit(X_train, y_train)

# For a classifier, score() returns mean accuracy, not R-squared, so the
# output is labelled accordingly.
print("Accuracy for Train set: {:.3f}".format(logreg.score(X_train, y_train)))
print("Accuracy for test set: {:.3f}" .format(logreg.score(X_test, y_test)))

# KNN Method

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# 7-nearest-neighbour classifier fitted on the training split.
knnclf = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)

# Predict the held-out rows and report the fraction classified correctly.
y_pred = knnclf.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", knn_accuracy)

# Support Vector Machine (SVM)

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM with weak regularisation (C=50), fitted on the unscaled features.
# random_state is pinned because the underlying solver uses a random number
# generator, so repeated runs could otherwise report different scores.
svmclf = LinearSVC(C=50, random_state=2)
svmclf.fit(X_train, y_train)

# score() on a classifier is mean accuracy.
print('Accuracy of Linear SVC classifier on training set: {:.2f}'
     .format(svmclf.score(X_train, y_train)))
print('Accuracy of Linear SVC classifier on test set: {:.2f}'
     .format(svmclf.score(X_test, y_test)))

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree (at most 3 levels) fitted on the training split.
dtclf = DecisionTreeClassifier(max_depth=3)
dtclf.fit(X_train, y_train)

# Mean accuracy on the training split and on the held-out split.
tree_train_acc = dtclf.score(X_train, y_train)
tree_test_acc = dtclf.score(X_test, y_test)
print(f'Accuracy of Decision Tree classifier on training set: {tree_train_acc:.2f}')
print(f'Accuracy of Decision Tree classifier on test set: {tree_test_acc:.2f}')

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest of 100 depth-limited trees with a fixed seed.
# max_features='sqrt' replaces 'auto': for classifiers the two were equivalent,
# but 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
# now raises a parameter validation error.
rf_clf = RandomForestClassifier(
    random_state=2,
    criterion='gini',
    max_depth=6,
    max_features='sqrt',
    n_estimators=100,
)
rf_clf.fit(X_train, y_train)
predictions = rf_clf.predict(X_test)

# Fraction of held-out rows classified correctly (displayed as the cell output).
accuracy_score(y_test, predictions)

# Min-max feature scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training split only.
scaler = MinMaxScaler().fit(X_train)

# Apply that same training-derived scaling to both splits so the held-out
# rows are transformed with the parameters computed on the training rows.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Logistic regression refitted on the min-max scaled features (default C).
logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_train_scaled, y_train)

# For a classifier, score() returns mean accuracy, not R-squared, so the
# output is labelled accordingly.
print("Accuracy for Train set: {:.3f}".format(logreg.score(X_train_scaled, y_train)))
print("Accuracy for test set: {:.3f}" .format(logreg.score(X_test_scaled, y_test)))

In [None]:
# 7-nearest-neighbour classifier refitted on the min-max scaled features.
knnclf = KNeighborsClassifier(n_neighbors=7).fit(X_train_scaled, y_train)

# Predict the scaled held-out rows and report the fraction classified correctly.
y_pred = knnclf.predict(X_test_scaled)
knn_scaled_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", knn_scaled_accuracy)

# Support Vector Machine with RBF kernel

In [None]:
from sklearn.svm import SVC

# Kernel SVM (SVC's default kernel is RBF) with gamma=0.1.
# NOTE(review): this model is fitted on the unscaled X_train although the
# scaled matrices are available at this point; confirm that is intended.
svcclf = SVC(gamma=0.1)
svcclf.fit(X_train, y_train)

# The messages previously said "Linear SVC", which mislabelled this RBF-kernel
# model and made its output indistinguishable from the LinearSVC cells.
print('Accuracy of RBF SVC classifier on training set: {:.2f}'
     .format(svcclf.score(X_train, y_train)))
print('Accuracy of RBF SVC classifier on test set: {:.2f}'
     .format(svcclf.score(X_test, y_test)))

In [None]:
# Linear SVM with default settings, refitted on the min-max scaled features.
svmclf = LinearSVC().fit(X_train_scaled, y_train)

# Mean accuracy on the scaled training split and the scaled held-out split.
svm_train_acc = svmclf.score(X_train_scaled, y_train)
svm_test_acc = svmclf.score(X_test_scaled, y_test)
print(f'Accuracy of Linear SVC classifier on training set: {svm_train_acc:.2f}')
print(f'Accuracy of Linear SVC classifier on test set: {svm_test_acc:.2f}')