In [6]:
# importing libraries
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

# Load the two CSV files into DataFrames: `train` is the file that carries the
# 'Survived' column used as the target later on, `test` is the second file.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/mnt/train.csv', '/mnt/test.csv')
)

In [7]:
# data prep
# Pull the target out of `train` WITHOUT mutating `train`. An in-place drop makes
# this cell fail with KeyError on a second run, because 'Survived' is already gone.
y_train = train['Survived']
train_features = train.drop(columns=['Survived'])

# Stack the train rows on top of the test rows so both receive exactly the same
# column handling below.
full = pd.concat([train_features, test])

# dropping noise
# NOTE(review): only the columns listed here are removed. Any identifier column
# in the CSVs (presumably PassengerId in the Titanic files) stays in `full` and
# is therefore fed to the models as a feature; confirm that is intended.
full = full.drop(columns=['Name', 'Age', 'SibSp', 'Ticket', 'Cabin', 'Parch', 'Embarked'])

# One-hot encode 'Sex', then replace every remaining missing value with 0.0.
full = pd.get_dummies(full, columns=['Sex'])
full = full.fillna(value=0.0)

In [9]:
# scaling
# `full` holds the train rows followed by the test rows, so the first len(train)
# rows are the training rows. Deriving the boundary from `train` avoids the
# hard-coded row count (891), which silently breaks if the input file changes.
n_train = len(train)
X_train = full.values[:n_train]
X_test = full.values[n_train:]

# Fit the min-max scaler on the training rows only, then apply the same fitted
# scaling to the test rows.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# train/test split
# Hold out 30% of the labelled rows for validation (fixed seed for repeatability).
# NOTE(review): this rebinds X_train, X_test and y_train. The X_test produced by
# the scaling cell is replaced by the validation split, and re-running this cell
# without re-running the earlier ones splits the already-split data again.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=42,
)

In [13]:
# Sweep the boosting learning rate for a small model (20 estimators, depth 2) and
# print the accuracy on the training split and on the held-out split per value.
learning_rates = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2]
for lr in learning_rates:
    gb_clf = GradientBoostingClassifier(
        n_estimators=20,
        learning_rate=lr,
        max_depth=2,
        random_state=0,  # fixed seed so every run of the sweep is comparable
    )
    gb_clf.fit(X_train, y_train)

    # Score both splits first, then report them together.
    train_acc = gb_clf.score(X_train, y_train)
    valid_acc = gb_clf.score(X_test, y_test)

    print("Learning rate: ", lr)
    print("Accuracy score (training): {0:.3f}".format(train_acc))
    print("Accuracy score (validation): {0:.3f}".format(valid_acc))

Learning rate:  0.05
Accuracy score (training): 0.793
Accuracy score (validation): 0.772
Learning rate:  0.075
Accuracy score (training): 0.809
Accuracy score (validation): 0.772
Learning rate:  0.1
Accuracy score (training): 0.809
Accuracy score (validation): 0.772
Learning rate:  0.25
Accuracy score (training): 0.849
Accuracy score (validation): 0.806
Learning rate:  0.5
Accuracy score (training): 0.856
Accuracy score (validation): 0.806
Learning rate:  0.75
Accuracy score (training): 0.868
Accuracy score (validation): 0.806
Learning rate:  1
Accuracy score (training): 0.878
Accuracy score (validation): 0.817
Learning rate:  1.25
Accuracy score (training): 0.872
Accuracy score (validation): 0.780
Learning rate:  1.5
Accuracy score (training): 0.884
Accuracy score (validation): 0.791
Learning rate:  1.75
Accuracy score (training): 0.854
Accuracy score (validation): 0.769
Learning rate:  2
Accuracy score (training): 0.327
Accuracy score (validation): 0.299


In [14]:
# choosing learning rate = 1 as it yielded the best results
# (highest validation accuracy in the learning-rate sweep above).
# NOTE(review): max_features=2 is set here but was not set in the sweep, so this
# is not exactly the configuration that was compared; confirm it is intended.
gb_clf2 = GradientBoostingClassifier(
    n_estimators=20,
    learning_rate=1,
    max_features=2,
    max_depth=2,
    random_state=0,
)
gb_clf2.fit(X_train, y_train)

# Predict on the held-out split and summarise the errors per class.
preds = gb_clf2.predict(X_test)

print('Confusion matrix: ')
print(confusion_matrix(y_test, preds))
print('CLF Report')
print(classification_report(y_test, preds))

Confusion matrix: 
[[139  18]
 [ 36  75]]
CLF Report
              precision    recall  f1-score   support

           0       0.79      0.89      0.84       157
           1       0.81      0.68      0.74       111

    accuracy                           0.80       268
   macro avg       0.80      0.78      0.79       268
weighted avg       0.80      0.80      0.80       268



**XGBOOST**

In [15]:
!pip install xgboost



In [16]:
from xgboost import XGBClassifier

# Baseline XGBoost classifier with the library's default hyperparameters,
# fitted on the training split (fit returns the fitted estimator itself).
xgb_clf = XGBClassifier().fit(X_train, y_train)

# Score on the held-out split; the bare name on the last line displays the value.
score = xgb_clf.score(X_test, y_test)
score

0.7947761194029851

In [20]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Read the comma-separated file into a numeric array. Columns 0-7 are used as
# the features and column 8 as the target.
db = loadtxt('/mnt/diabetes.csv', delimiter=',')
X, y = db[:, :8], db[:, 8]

# 10-fold cross-validation of a default XGBoost classifier; the printed figure
# is the mean of the ten per-fold scores, as a percentage.
model = XGBClassifier()
kfold = KFold(n_splits=10, random_state=None)
results = cross_val_score(model, X, y, cv=kfold)
print(f'Accuracy: {results.mean()*100:.2f}%')

Accuracy: 74.35%


In [21]:
# stratified kfold CV implementation
from sklearn.model_selection import StratifiedKFold

# Same feature/target columns as the plain K-fold run: columns 0-7 are the
# features, column 8 is the target.
X2 = db[:, 0:8]
y2 = db[:, 8]

model2 = XGBClassifier()
kfold = StratifiedKFold(n_splits=10, random_state=None)
# Evaluate the estimator created in THIS cell (model2). Passing `model` from the
# previous cell left model2 unused and made this cell depend on that leftover
# variable.
results = cross_val_score(model2, X2, y2, cv=kfold)
print(f'Accuracy: {results.mean()*100:.2f}%')

Accuracy: 73.17%
