<a href="https://colab.research.google.com/github/Nataliia5722/AI/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np # linear algebra
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

[External data: Local Files, Drive, Sheets, and Cloud Storage](https://colab.research.google.com/notebooks/io.ipynb?authuser=2)


Mounting Google Drive locally

In [None]:
# Mount Google Drive only when the notebook actually runs inside Colab.
# The dataset is downloaded from a public URL in the next cell, so outside
# Colab the mount is skipped instead of aborting the run with an ImportError.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running on Colab; nothing to mount
else:
    drive.mount('/content/drive')

In [None]:
# Location of the Titanic passenger CSV (public gist, pinned to one revision).
TITANIC_CSV_URL = "https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/fa71405126017e6a37bea592440b4bee94bf7b9e/titanic.csv"

# Raw, unprocessed passenger table used by the rest of the notebook.
data = pd.read_csv(TITANIC_CSV_URL)

Run the cell above to load the dataset, then inspect it in the cells below.

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# (rows, columns) of the raw frame.
data.shape

In [None]:
# Column dtypes and non-null counts; shows which columns contain missing values.
data.info()

In [None]:
# Summary statistics of the raw frame.
data.describe()

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import torch

# Report the library versions this notebook is running with. The labels are
# padded by hand so the colons line up in the printed output.
for padded_label, library in (
    ('matplotlib', matplotlib),
    ('pytorch   ', torch),
    ('pandas    ', pd),
    ('numpy     ', np),
):
    print(f'{padded_label}: {library.__version__}')

### Data formatting

- One-hot encode: 'Sex', 'Embarked'
- Remove: 'Name', 'Ticket', 'Cabin'
- Fill null values with the mean of the associated column.
- Scale every remaining feature to the [0, 1] range (min-max normalization).

In [None]:
from sklearn import preprocessing

def data_normalizer(features):
    """Min-max scale every column of ``features``.

    Parameters
    ----------
    features : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column names and the same row index as
        ``features``. The scaler is fitted on the frame that is passed in.
    """
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(features.values)
    # Keep the caller's index: the label Series extracted from the same frame
    # keeps it too, so rows stay aligned even when the index is not 0..n-1.
    return pd.DataFrame(x_scaled, columns=features.columns, index=features.index)

# Apply some data formatting
def format_data(data):
    """Turn the raw passenger frame into normalized model inputs.

    Steps, in order: one-hot encode 'Sex' and 'Embarked', drop 'Name',
    'Ticket' and 'Cabin', replace remaining nulls with their column mean,
    then pass the features through ``data_normalizer``.

    Returns
    -------
    (X, labels) when the frame has a 'Survived' column, where ``labels`` is
    that column and ``X`` is every other column, normalized. Otherwise only
    the normalized frame is returned.
    """
    encoded = pd.get_dummies(data, columns=['Sex', 'Embarked'])
    # These columns would need extra processing before they could be used.
    trimmed = encoded.drop(columns=['Name', 'Ticket', 'Cabin'])
    filled = trimmed.fillna(trimmed.mean())

    if 'Survived' not in filled.columns:
        return data_normalizer(filled)

    labels = filled['Survived']
    X = data_normalizer(filled.drop(columns=['Survived']))
    return X, labels

# Run the formatting pipeline on the raw frame. Because `data` contains a
# 'Survived' column, `format_data` returns the normalized features together
# with that column as the labels.
features, labels = format_data(data)
# Summary statistics of the formatted features.
features.describe()

In [None]:
# Preview the first rows of the formatted feature frame.
features.head()

### Split into train and test sets

In [None]:
# Split the data set into training and testing: 20% of the rows are held out,
# stratified on the labels, with a fixed random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=2, stratify=labels)

In [None]:
# Number of training labels.
y_train.shape

In [None]:
# Number of held-out test labels.
y_test.shape

### Prepare inputs for model

In [None]:
# Convert the split pandas objects into PyTorch tensors: float32 features for
# the network input, int64 class indices for the loss.
# NOTE(review): the names are rebound from pandas objects to tensors, so this
# cell is meant to run exactly once after the split cell.
X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.int64)
y_test = torch.tensor(y_test.values, dtype=torch.int64)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(torch.nn.Module):
    """Fully connected classifier: input -> 270 -> 50 -> 2.

    Each hidden stage is Linear -> BatchNorm1d -> dropout -> ReLU.

    Parameters
    ----------
    input_features : int
        Number of feature columns in each input row.
    dropout_p : float, optional
        Dropout probability used after both hidden stages (default 0.1).

    ``forward`` takes a ``(batch, input_features)`` float tensor and returns
    raw class scores (logits) of shape ``(batch, 2)``. No sigmoid/softmax is
    applied: ``nn.CrossEntropyLoss`` applies log-softmax itself, and squashing
    the scores beforehand only flattens the loss and its gradients. The
    predicted class is ``argmax`` over dim 1 either way.
    """

    def __init__(self, input_features, dropout_p=0.1):
        super(Model, self).__init__()
        self.dropout_p = dropout_p
        self.fc1 = nn.Linear(input_features, 270)
        self.bn1 = nn.BatchNorm1d(270)
        self.fc2 = nn.Linear(270, 50)
        self.bn2 = nn.BatchNorm1d(50)
        self.fc3 = nn.Linear(50, 2)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        # F.dropout defaults to training=True, so the module's mode has to be
        # passed explicitly; otherwise units keep being dropped at random
        # after model.eval() and predictions are not reproducible.
        x = F.dropout(x, p=self.dropout_p, training=self.training)
        x = F.relu(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.dropout(x, p=self.dropout_p, training=self.training)
        x = F.relu(x)
        # Return logits directly (see class docstring).
        return self.fc3(x)


In [None]:
# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Using {} device'.format(device))

In [None]:
# Build the network with one input per feature column and place it on the
# selected device.
model = Model(X_train.shape[1]).to(device)
print(model)
# NOTE(review): the step size is hard-coded to 0.1 here; the `learning_rate`
# variable assigned in a later cell is never passed to this optimizer.
optimizer = torch.optim.Adam(model.parameters(), lr=0.1, betas=(0.9, 0.99))
# Loss applied to the training mini-batches.
criterion = nn.CrossEntropyLoss()

In [None]:
# With these values (and with the commented-out alternatives below), the
# resulting accuracy is 0.83.
batch_size = 25
num_epochs = 20
# NOTE(review): `learning_rate` is not read by any visible cell -- the
# optimizer is created earlier with a hard-coded lr=0.1. Confirm which value
# is intended.
learning_rate = 0.02
#batch_size = 39
#num_epochs = 20
#learning_rate = 0.05
#batch_size = 90
#num_epochs = 90
#learning_rate = 0.1
# Number of full mini-batches per epoch (floor division, so rows beyond
# batch_no * batch_size are not counted).
batch_no = len(X_train) // batch_size
print(batch_no)

In [None]:
# Training metrics get one entry per optimizer step (every mini-batch of
# every epoch).
train_loss = np.zeros((num_epochs*batch_no,))
train_accuracy = np.zeros((num_epochs*batch_no,))
# Validation metrics are written at index `epoch` by the training loop, so
# one slot per epoch is all that is needed. Extra slots would stay at zero
# and show up as a long flat tail in the validation plots.
valid_loss = np.zeros((num_epochs,))
valid_accuracy = np.zeros((num_epochs,))

In [None]:
import torch.nn as nn
# Loss used for the held-out evaluation inside the training loop. It is the
# same loss class as `criterion`, which is applied to the training batches.
loss_fn   = nn.CrossEntropyLoss()

In [None]:
# Move all tensors to the model's device once, up front. The held-out set has
# to live on the same device as the model, otherwise the validation forward
# pass fails as soon as a GPU is in use.
x_train = X_train.to(device)
y_train_dev = y_train.to(device)
x_valid = X_test.to(device)
y_valid = y_test.to(device)

# Validation metrics are recorded once per epoch; allocate them here with one
# slot per epoch so that re-running this cell always starts from clean buffers.
valid_loss = np.zeros(num_epochs)
valid_accuracy = np.zeros(num_epochs)

p = 0  # running index of the optimizer step across all epochs

for epoch in range(num_epochs):
    if epoch % 5 == 0:
        print('Epoch {}'.format(epoch+1))

    # Training mode: BatchNorm uses batch statistics and updates its running
    # averages.
    model.train()
    # Mini batch learning (rows past batch_no * batch_size are not visited).
    for i in range(batch_no):
        start = i * batch_size
        end = start + batch_size
        x_var = x_train[start:end]
        y_var = y_train_dev[start:end]

        # Forward
        optimizer.zero_grad()
        pred = model(x_var)
        loss = criterion(pred, y_var)

        train_loss[p] = loss.item()
        train_accuracy[p] = (torch.argmax(pred, dim=1) == y_var).float().mean().item()

        # Backward + optimize
        loss.backward()
        optimizer.step()
        p += 1

    # Evaluate once per epoch, after the last mini-batch. Inference mode keeps
    # the held-out rows from leaking into the BatchNorm running statistics.
    model.eval()
    with torch.no_grad():
        y_pred = model(x_valid)
        valid_loss[epoch] = loss_fn(y_pred, y_valid).item()
        valid_accuracy[epoch] = (torch.argmax(y_pred, dim=1) == y_valid).float().mean().item()

In [None]:
# Evaluate the model
test_var = X_test.to(device)
# Put the network in inference mode so BatchNorm uses its running statistics
# instead of the statistics of the evaluation batch.
model.eval()
with torch.no_grad():
    result = model(test_var)
# Keep the predictions under their own name: unpacking into `labels` would
# overwrite the target Series that the train/test split cell depends on.
values, predicted = torch.max(result, 1)
num_right = np.sum(predicted.cpu().numpy() == y_test.cpu().numpy())
print('Accuracy {:.2f}'.format(num_right / len(y_test)))


In [None]:
# Training curves: accuracy on top, loss below, sharing one x axis.
fig, [ax1, ax2] = plt.subplots(2, figsize=[12, 6], sharex=True)

ax1.plot(train_accuracy)
ax1.set_ylabel('train_accuracy')
ax2.plot(train_loss)
ax2.set_ylabel('train_loss')
# One point is recorded per mini-batch update, so the x axis counts
# optimizer steps rather than epochs.
ax2.set_xlabel("training step (mini-batch)");

In [None]:
# Validation curves: accuracy on top, loss below, sharing one x axis.
fig, (ax1, ax2) = plt.subplots(2, figsize=(12, 6), sharex=True)

# Validation metrics are written at index `epoch`, so only the first
# `num_epochs` entries carry data; any slots beyond that are untouched zeros
# and are left out of the plot.
ax1.plot(valid_accuracy[:num_epochs])
ax1.set_ylabel("valid_accuracy")
ax2.plot(valid_loss[:num_epochs])
ax2.set_ylabel("valid_loss")
ax2.set_xlabel("epochs");

In [None]:
# Predict on the held-out set in inference mode and without tracking
# gradients; the input is moved to the model's device first so the cell also
# works when the model lives on a GPU.
model.eval()
with torch.no_grad():
    y_pred = model(X_test.to(device))
# Bring the predicted class indices back to the CPU so the scikit-learn
# metrics in the following cells can consume them.
y_hat = torch.argmax(y_pred, dim=1).cpu()
len(y_test)

In [None]:
# `plot_confusion_matrix` is no longer imported here: it was removed in
# scikit-learn 1.2 and was not used in this cell, so the import only made the
# cell fail on current installs.
from sklearn.metrics import confusion_matrix

# confusion matrix
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]
# Display names for the two classes of the 'Survived' target.
# NOTE(review): assumes 0 = did not survive and 1 = survived -- confirm.
class_names = ["Not survived", "Survived"]
conf = confusion_matrix(y_test, y_hat)
conf

In [None]:

# Precision and recall
from sklearn.metrics import (accuracy_score, classification_report, f1_score,
                             precision_score, recall_score)

# Score the predictions once per averaging mode and reuse the numbers in all
# of the print-outs below.
averaged = {
    mode: {
        'precision': precision_score(y_test, y_hat, average=mode),
        'recall': recall_score(y_test, y_hat, average=mode),
        'f1': f1_score(y_test, y_hat, average=mode),
    }
    for mode in ('micro', 'macro', 'weighted')
}
micro = averaged['micro']
macro = averaged['macro']
weighted = averaged['weighted']

# Weighted scores at full precision.
print(f"precision: {weighted['precision']}")
print(f"recall: {weighted['recall']}")
print(f"f1 score: {weighted['f1']}")
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, y_hat)))

# The same scores per averaging mode, rounded to two decimals.
print('Micro Precision: {:.2f}'.format(micro['precision']))
print('Micro Recall: {:.2f}'.format(micro['recall']))
print('Micro F1-score: {:.2f}\n'.format(micro['f1']))

print('Macro Precision: {:.2f}'.format(macro['precision']))
print('Macro Recall: {:.2f}'.format(macro['recall']))
print('Macro F1-score: {:.2f}\n'.format(macro['f1']))

print('Weighted Precision: {:.2f}'.format(weighted['precision']))
print('Weighted Recall: {:.2f}'.format(weighted['recall']))
print('Weighted F1-score: {:.2f}'.format(weighted['f1']))

# Per-class breakdown.
print('\nClassification Report\n')
print(classification_report(y_test, y_hat))