In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_moons

In [2]:
moons = make_moons()

In [3]:
features = moons[0]
labels = moons[1]

In [4]:
from sklearn.model_selection import train_test_split

In [60]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2)

In [6]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [37]:
log_clf = LogisticRegression()
rdc_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
        estimators=[('lr', log_clf), ('rf', rdc_clf), ('sc', svm_clf)],
        voting='hard')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('sc', SVC())])

In [38]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rdc_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, ':', accuracy_score(y_pred, y_test))

LogisticRegression : 0.9
RandomForestClassifier : 1.0
SVC : 1.0
VotingClassifier : 1.0


In [15]:
train = pd.read_csv('titanic_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')

In [26]:
train.shape

(891, 7)

In [18]:
train_features = train.drop(['Survived'], axis=1)
train_label = train['Survived']

In [53]:
X_train, X_test, y_train, y_test = train_test_split(train_features, train_label, test_size=0.2)

In [32]:
log_clf = LogisticRegression(C = 10, max_iter = 100)
rdc_clf = RandomForestClassifier()
svm_clf = SVC(C= 0.1, kernel= 'linear')

In [33]:
voting_clf = VotingClassifier(
        estimators=[('lr', log_clf), ('rf', rdc_clf), ('sc', svm_clf)],
        voting='hard')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=10)),
                             ('rf', RandomForestClassifier()),
                             ('sc', SVC(C=0.1, kernel='linear'))])

In [35]:
for clf in (log_clf, rdc_clf,svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, ':', accuracy_score(y_pred, y_test))

LogisticRegression : 0.8044692737430168
RandomForestClassifier : 0.7988826815642458
SVC : 0.8100558659217877
VotingClassifier : 0.7988826815642458


In [61]:
# Bagging and Pasting

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
        DecisionTreeClassifier(),n_estimators=500, max_samples=0.7,
        bootstrap=True, n_jobs = -1, oob_score=True) # bootstrap should be False for Pasting
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
print(bag_clf.oob_score_)
print(accuracy_score(y_pred, y_test))

0.95
1.0


In [63]:
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, 
                                 max_leaf_nodes=16,oob_score=True)
rnd_clf.fit(X_train, y_train)
y_pred = rnd_clf.predict(X_test)
print(rnd_clf.oob_score_)
print(accuracy_score(y_pred, y_test))

0.975
1.0


In [64]:
# Exercise
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)

In [70]:
X = mnist['data']
y = mnist['target']

In [77]:
from sklearn.model_selection import train_test_split

In [78]:
X_train, X_test, X_val, y_train, y_test, y_val = X[:50000], X[50000:60000], X[60000:], y[:50000], y[50000:60000], y[60000:]

In [79]:
print(X_train.shape)
print(X_test.shape)
print(X_val.shape)
print(y_train.shape)
print(y_test.shape)
print(y_val.shape)

(50000, 784)
(10000, 784)
(10000, 784)
(50000,)
(10000,)
(10000,)


In [80]:
from sklearn.linear_model import SGDClassifier

In [83]:
log_clf = LogisticRegression()
sgd_clf = SGDClassifier()
svm_clf = SVC()
rnf_clf = RandomForestClassifier()

In [84]:
from sklearn.metrics import precision_score, recall_score
from time import time

In [91]:
def fit_predict_evaluate(model):
    start = time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    end = time()
    print(model.__class__.__name__,':','accuracy',accuracy_score(y_pred, y_val),':', 'time',end-start)

In [92]:
for model in (log_clf, sgd_clf, svm_clf, rnf_clf):
    fit_predict_evaluate(model)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression : accuracy 0.9243 : time 17.87313413619995
SGDClassifier : accuracy 0.8875 : time 99.86070775985718
SVC : accuracy 0.9785 : time 328.19522619247437
RandomForestClassifier : accuracy 0.9686 : time 38.94805645942688


In [93]:
for model in (log_clf, sgd_clf, svm_clf, rnf_clf):
    pred = model.predict(X_test)
    print(model.__class__.__name__, accuracy_score(pred, y_test))

LogisticRegression 0.9272
SGDClassifier 0.8937
SVC 0.9802
RandomForestClassifier 0.973


In [94]:
voting_clf = VotingClassifier(
        estimators=[('lr',log_clf), ('sgd', sgd_clf), ('rf', rnf_clf)],
        voting='hard'
)

voting_clf.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('sgd', SGDClassifier()),
                             ('rf', RandomForestClassifier())])

In [95]:
vt_pred  = voting_clf.predict(X_val)
accuracy_score(vt_pred, y_val)

0.9394

In [96]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.fit_transform(X_val)
X_test_scaled = scaler.fit_transform(X_test)

In [98]:
svm_clf = SVC()
svm_clf.fit(X_train_scaled, y_train)
svm_pred = svm_clf.predict(X_val_scaled)
print(accuracy_score(svm_pred, y_val))

0.9634
