In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

In [None]:
def load_clean_data(path):
  """Load a Titanic-style CSV and return a cleaned frame with encoded categories.

  Steps, in order:
    1. Read the CSV at ``path``.
    2. Drop the ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` columns.
    3. Drop every row that still contains a missing value.
    4. Encode ``Sex`` (female -> 0, male -> 1) and
       ``Embarked`` (S -> 0, C -> 1, Q -> 2).

  Args:
    path: Anything ``pd.read_csv`` accepts (file path or file-like object).

  Returns:
    A new ``pd.DataFrame``. ``Sex`` and ``Embarked`` have an integer dtype
    when every value is one of the known categories; values outside the
    mapping are passed through unchanged.

  Raises:
    KeyError: If one of the dropped or encoded columns is absent.
  """
  category_codes = {
      'Sex': {'female': 0, 'male': 1},
      'Embarked': {'S': 0, 'C': 1, 'Q': 2},
  }

  raw = pd.read_csv(path)
  # Chained, non-mutating calls: nothing is modified in place, so re-running
  # the cell is safe.
  cleaned = raw.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin']).dropna()

  # Element-wise lookup instead of Series.replace: replace() on an object
  # column relies on pandas' silent downcasting (deprecated since pandas 2.2),
  # while map() infers the result dtype directly from the mapped values.
  encoded = {
      column: cleaned[column].map(lambda value, codes=codes: codes.get(value, value))
      for column, codes in category_codes.items()
  }
  # assign() returns a new frame, which also avoids SettingWithCopyWarning on
  # the dropna() result.
  return cleaned.assign(**encoded)



In [None]:
# Colab-only cell: `google.colab` is importable only inside a Google Colab
# runtime. Mounts Google Drive at /content/drive; the CSV paths used later in
# this notebook are located under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def report_metrics(title, predictions, y_true):
  """Print and return confusion-matrix metrics for binary 0/1 predictions.

  Args:
    title: Heading printed before the metrics (e.g. the model name).
    predictions: NumPy array of predicted labels; 1 is the positive class,
      0 the negative class.
    y_true: NumPy array of ground-truth labels, aligned with ``predictions``.

  Returns:
    dict with keys ``TP``, ``TN``, ``FP``, ``FN``, ``accuracy``,
    ``precision`` and ``recall``. A metric whose denominator is zero is
    reported as the sentinel value -1.
  """
  predicted_positive = predictions == 1
  predicted_negative = predictions == 0

  TP = np.count_nonzero(y_true[predicted_positive] == 1)
  TN = np.count_nonzero(y_true[predicted_negative] == 0)
  FP = np.count_nonzero(y_true[predicted_positive] == 0)
  FN = np.count_nonzero(y_true[predicted_negative] == 1)

  nb_test = y_true.shape[0]
  print(title)
  print('Total test samples: {}'.format(nb_test))
  print('TP = {}, TN = {}, FP = {}, FN = {}'.format(TP, TN, FP, FN))

  # -1 marks an undefined metric (zero denominator).
  accuracy = (TP + TN) / nb_test if nb_test > 0 else -1
  precision = TP / (TP + FP) if TP + FP > 0 else -1
  recall = TP / (TP + FN) if TP + FN > 0 else -1
  print('A = {:.2f}, P = {:.2f}, R = {:.2f}'.format(accuracy, precision, recall))

  return {
      'TP': TP, 'TN': TN, 'FP': FP, 'FN': FN,
      'accuracy': accuracy, 'precision': precision, 'recall': recall,
  }


train_data_path = '/content/drive/MyDrive/Colab Notebooks/Domaci2/titanic/train.csv'
test_data_path = '/content/drive/MyDrive/Colab Notebooks/Domaci2/titanic/test.csv'

# Load and clean both splits.
df_train = load_clean_data(train_data_path)
df_test = load_clean_data(test_data_path)

# Author's note: the model works better when this column is dropped.
# df_train.drop(['Embarked'], axis=1, inplace=True)
# df_test.drop(['Embarked'], axis=1, inplace=True)

# Training split: separate the label column from the feature columns.
y_train = df_train.Survived.to_numpy()
df_train = df_train.drop(columns=['Survived'])
x_train = df_train.to_numpy()

# Test split: same separation.
y_test = df_test.Survived.to_numpy()
df_test = df_test.drop(columns=['Survived'])
x_test = df_test.to_numpy()

# The models receive plain arrays, so features are matched by position only.
# Fail loudly if the two splits do not expose the same columns in the same order.
if list(df_train.columns) != list(df_test.columns):
  raise ValueError(
      'Train/test feature columns differ: {} vs {}'.format(
          list(df_train.columns), list(df_test.columns)))

# Fit and evaluate the random forest.
model = RandomForestClassifier(n_estimators=100, random_state = 0)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
rf_scores = report_metrics('RandomForest', predictions, y_test)

# Fit and evaluate bagging over decision trees
# (each estimator is trained on 80% of the training samples).
tree = DecisionTreeClassifier()
bag = BaggingClassifier(tree, n_estimators=100, max_samples=0.8, random_state=1)
bag.fit(x_train, y_train)
predictions = bag.predict(x_test)
bag_scores = report_metrics('Bagging', predictions, y_test)











RandomForest
Total test samples: 331
TP = 97, TN = 172, FP = 32, FN = 30
A = 0.81, P = 0.75, R = 0.76
Bagging
Total test samples: 331
TP = 99, TN = 177, FP = 27, FN = 28
A = 0.83, P = 0.79, R = 0.78
