In [1]:
import numpy as np
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [3]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],voting='hard')

In [4]:
voting_clf.fit(X_train, y_train)

In [5]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.904
SVC 0.896
VotingClassifier 0.904


In [7]:
#Building an Ensemble Classifier 
iris = datasets.load_iris()

In [25]:
irisX = iris.data[:, [0, 1]]
irisy = iris.target
X_iris_train, X_iris_test, y_iris_train, y_iris_test = train_test_split(irisX, irisy, random_state=42)

In [26]:
voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],voting='hard')

In [27]:
voting_clf.fit(X_iris_train, y_iris_train)

In [28]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_iris_train, y_iris_train)
    y_iris_pred = clf.predict(X_iris_test)
    print(clf.__class__.__name__, accuracy_score(y_iris_test, y_iris_pred))

LogisticRegression 0.8421052631578947
RandomForestClassifier 0.8157894736842105
SVC 0.8421052631578947
VotingClassifier 0.8421052631578947


In [32]:
# Now running again but with soft voting
svm_clf.probability=True

In [35]:
soft_voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],voting='soft')

In [36]:
soft_voting_clf.fit(X_iris_train, y_iris_train)

In [38]:
for clf in (log_clf, rnd_clf, svm_clf, soft_voting_clf):
    clf.fit(X_iris_train, y_iris_train)
    soft_y_iris_pred = clf.predict(X_iris_test)
    print(clf.__class__.__name__, accuracy_score(y_iris_test, soft_y_iris_pred))

LogisticRegression 0.8421052631578947
RandomForestClassifier 0.7894736842105263
SVC 0.8421052631578947
VotingClassifier 0.7894736842105263


In [43]:
#Bagging
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1, random_state=42)

bag_clf.fit(X_train, y_train)

bagy_pred = bag_clf.predict(X_test)

print(accuracy_score(y_test, bagy_pred))

0.904


In [42]:
#Bagging with Iris
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1, random_state=42)

bag_clf.fit(X_iris_train, y_iris_train)

iris_bag_y_pred = bag_clf.predict(X_iris_test)

print(accuracy_score(y_iris_test, iris_bag_y_pred))

0.7368421052631579


In [120]:
#Out of bag Evaluation
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=25, max_samples=50,
    bootstrap=True, n_jobs=-1, oob_score=True, random_state=42)

bag_clf.fit(X_iris_train, y_iris_train)

bag_clf.oob_score_
# Score is marginally lower than accuracy score on the test set

0.6607142857142857

In [123]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_iris_train, y_iris_train)
y_iris_dtpred = dt_clf.predict(X_iris_test)
print(accuracy_score(y_iris_test, y_iris_dtpred))

0.6578947368421053


In [45]:
#Random Forests 
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)

rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

print(accuracy_score(y_test, y_pred_rf))

0.92


In [46]:
#Now with Iris
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)

rnd_clf.fit(X_iris_train, y_iris_train)

y_pred_iris_rf = rnd_clf.predict(X_iris_test)

print(accuracy_score(y_iris_test, y_pred_iris_rf))
#Accuracy notably lower than the other scores from the various models, worth commenting on

0.7631578947368421


In [48]:
#Feature Importance
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)

rnd_clf.fit(irisX, irisy)

for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.6457252651455682
sepal width (cm) 0.35427473485443184


In [49]:
#Now with the entire data set not just the restricted X and y already created
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)

rnd_clf.fit(iris["data"], iris["target"])

for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)
#comparitively when using the whole dataset the two sepal measurements are far less important to the data as a whole.

sepal length (cm) 0.11249225099876375
sepal width (cm) 0.02311928828251033
petal length (cm) 0.4410304643639577
petal width (cm) 0.4233579963547682


In [51]:
#Boosting
#Part 1, AdaBoost
from sklearn.ensemble import AdaBoostClassifier

In [53]:
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200,
                            algorithm="SAMME.R", learning_rate=0.5)
ada_clf.fit(X_iris_train, y_iris_train)

In [56]:
y_ada_pred = ada_clf.predict(X_iris_test)

In [57]:
accuracy_ada = accuracy_score(y_iris_test, y_ada_pred)
print("Accuracy:", accuracy_ada)

Accuracy: 0.7368421052631579


In [60]:
#Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor

In [124]:
gbrt = GradientBoostingRegressor(max_depth=1, n_estimators=3, learning_rate=0.5)
gbrt.fit(X_iris_train, y_iris_train)

In [125]:
y_gbrt_pred = gbrt.predict(X_iris_test)

In [126]:
from sklearn.metrics import mean_squared_error
mse_gbrt = mean_squared_error(y_iris_test, y_gbrt_pred)
print("Mean Squared Error:", mse_gbrt)

Mean Squared Error: 0.26965714130477203


In [94]:
#Principal Components Analysis
from sklearn.decomposition import PCA
X, y = datasets.load_iris(return_X_y=True)
pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)

In [95]:
pca.explained_variance_ratio_
#Around 92% of thge variance lies upon the prinicpal component and the second around 5.3% this leaves around 2.5% for the third, meaning it probably carries little information 

array([0.92461872, 0.05306648])

In [127]:
pca_4 = PCA(n_components=0.5)


X4D_train = pca_4.fit_transform(X)
explained_four_variance_ratio = pca_4.explained_variance_ratio_

In [128]:
print("Explained variance ratio:", explained_four_variance_ratio)
#Now able to view the tiny PCs when expanding the number of components

Explained variance ratio: [0.92461872]


In [99]:
#Now preserve 95% variance
pca = PCA()
pca.fit(X)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1

In [100]:
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X)

In [101]:
print("Explained Variance Ratios:")
print(cumsum)
print("\nNumber of Principal Components required for 95% variance preservation:", d)

Explained Variance Ratios:
[0.92461872 0.97768521 0.99478782 1.        ]

Number of Principal Components required for 95% variance preservation: 2


In [103]:
#One-Hot Encoding
import pandas as pd
df = pd.DataFrame({'no_legs': [2, 4, 6, 0],
                   'no_wings': [2, 0, 0, 0],
                   'species': ['bird', 'mammal', 'insect', 'fish'],
                   'cuteness_factor': [4, 7, 1, 2]})

In [104]:
zoo_df = df.sample(frac=2.5, replace=True, random_state=1)

In [105]:
animal_cat = zoo_df[["species"]]
animal_cat

Unnamed: 0,species
1,mammal
3,fish
0,bird
0,bird
3,fish
1,mammal
3,fish
1,mammal
3,fish
0,bird


In [106]:
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
animal_cat_encoded = ordinal_encoder.fit_transform(animal_cat)
animal_cat_encoded

array([[2.],
       [1.],
       [0.],
       [0.],
       [1.],
       [2.],
       [1.],
       [2.],
       [1.],
       [0.]])

In [107]:
animal_categories = ordinal_encoder.categories_

from sklearn.preprocessing import OneHotEncoder
cat_encoder = OneHotEncoder()
animal_cat_1hot = cat_encoder.fit_transform(animal_cat)
animal_categories = ordinal_encoder.categories_
animal_cat_1hot

<10x3 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [108]:
animal_cat_1hot.toarray()

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [109]:
cat_encoder = OneHotEncoder(sparse=False)
animal_cat_1hot = cat_encoder.fit_transform(animal_cat)
animal_cat_1hot



array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [110]:
enc_data = pd.DataFrame(animal_cat_1hot)
enc_data.columns = animal_categories[0]
enc_data.index = zoo_df.index
enc_data  #Have a look at the data

Unnamed: 0,bird,fish,mammal
1,0.0,0.0,1.0
3,0.0,1.0,0.0
0,1.0,0.0,0.0
0,1.0,0.0,0.0
3,0.0,1.0,0.0
1,0.0,0.0,1.0
3,0.0,1.0,0.0
1,0.0,0.0,1.0
3,0.0,1.0,0.0
0,1.0,0.0,0.0


In [111]:
zoo_encoded_df = pd.concat([zoo_df, enc_data], axis =1, ignore_index = False)
zoo_encoded_df

Unnamed: 0,no_legs,no_wings,species,cuteness_factor,bird,fish,mammal
1,4,0,mammal,7,0.0,0.0,1.0
3,0,0,fish,2,0.0,1.0,0.0
0,2,2,bird,4,1.0,0.0,0.0
0,2,2,bird,4,1.0,0.0,0.0
3,0,0,fish,2,0.0,1.0,0.0
1,4,0,mammal,7,0.0,0.0,1.0
3,0,0,fish,2,0.0,1.0,0.0
1,4,0,mammal,7,0.0,0.0,1.0
3,0,0,fish,2,0.0,1.0,0.0
0,2,2,bird,4,1.0,0.0,0.0


In [112]:
zoo_encoded_df.drop(columns=['species'], inplace = True)

In [113]:
zoo_encoded_df.drop(columns=['mammal'], inplace = True)

In [114]:
zoo_encoded_df

Unnamed: 0,no_legs,no_wings,cuteness_factor,bird,fish
1,4,0,7,0.0,0.0
3,0,0,2,0.0,1.0
0,2,2,4,1.0,0.0
0,2,2,4,1.0,0.0
3,0,0,2,0.0,1.0
1,4,0,7,0.0,0.0
3,0,0,2,0.0,1.0
1,4,0,7,0.0,0.0
3,0,0,2,0.0,1.0
0,2,2,4,1.0,0.0


In [129]:
#Using pandas get dummies
import pandas as pd
df = pd.DataFrame({'no_legs': [2, 4, 6, 0],
                   'no_wings': [2, 0, 0, 0],
                   'species': ['bird', 'mammal', 'insect', 'fish'],
                   'cuteness_factor': [4, 7, 1, 2]})
zoo_df = df.sample(frac=2.5, replace=True, random_state=1)
animal_cat = zoo_df[["species"]]
pd.get_dummies(zoo_df[["species"]])

Unnamed: 0,species_bird,species_fish,species_mammal
1,0,0,1
3,0,1,0
0,1,0,0
0,1,0,0
3,0,1,0
1,0,0,1
3,0,1,0
1,0,0,1
3,0,1,0
0,1,0,0


In [130]:
pd.get_dummies(zoo_df[["species"]], drop_first=True)

Unnamed: 0,species_fish,species_mammal
1,0,1
3,1,0
0,0,0
0,0,0
3,1,0
1,0,1
3,1,0
1,0,1
3,1,0
0,0,0


In [131]:
pd.get_dummies(zoo_df, columns=["species"], drop_first=True)

Unnamed: 0,no_legs,no_wings,cuteness_factor,species_fish,species_mammal
1,4,0,7,0,1
3,0,0,2,1,0
0,2,2,4,0,0
0,2,2,4,0,0
3,0,0,2,1,0
1,4,0,7,0,1
3,0,0,2,1,0
1,4,0,7,0,1
3,0,0,2,1,0
0,2,2,4,0,0
