In [53]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [16]:
# Author's context note: "meanings" (possibly k-means -- TODO confirm) will give
# us several hundred variables, and each cluster will hold a set of data points,
# every data point carrying a value for each of those variables.

# A synthetic stand-in was tried first: 20 data points with 5 integer variables
# each, drawn from [10, 50):
#   example_data = np.random.randint(10, 50, 100).reshape(20, 5)
#   example_data[0:10:]

# Real data instead: the iris measurements hosted by the UCI repository.
# The file ships without a header row, so the column names are supplied here.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'Class',
]
dataset = pd.read_csv(url, header=None, names=names)

# Preview the first five rows.
dataset.head(5)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [17]:
# Split the dataset into the feature matrix and the matching class labels.
# `columns=` replaces the positional axis argument (`drop('Class', 1)`), which
# pandas deprecated in 1.3 and no longer accepts from 2.0 onwards.
features_X = dataset.drop(columns='Class')
labels_Y = dataset['Class']

In [None]:
# PCA over the complete feature set

# Standardise every feature column to zero mean and unit variance; the fitted
# scaler is kept so later data can be transformed the same way.
scaler = StandardScaler().fit(features_X)
scaled_data = scaler.transform(features_X)

# No n_components is given, so all components are kept. Swap in
# "PCA(n_components=2)" once the variance ratios below have been checked.
pca = PCA()
pca_data = pca.fit_transform(scaled_data)

# Share of the total variance explained by each principal component.
pca.explained_variance_ratio_

In [None]:
# Train a decision tree on the PCA-projected features of the full dataset.
classifier = DecisionTreeClassifier()
classifier.fit(pca_data, labels_Y)

# Placeholder for unseen samples: one row per sample, holding the same columns
# as features_X in the same order.
newdata = []

new_samples = np.asarray(newdata, dtype=float)
if new_samples.size == 0:
    # scikit-learn transformers raise ValueError on empty input, so with
    # nothing to score we produce empty results instead of crashing the cell.
    scaled_new_data = np.empty((0, features_X.shape[1]))
    pca_new_data = np.empty((0, pca.n_components_))
    pred_labels = np.empty((0, len(classifier.classes_)))
else:
    # Reuse the already-fitted scaler and PCA: transform only, no re-fit.
    # Wrapping the samples in a DataFrame with the training column names keeps
    # scikit-learn from warning about missing feature names, and raises early
    # if the number of columns does not match.
    new_frame = pd.DataFrame(new_samples, columns=features_X.columns)
    scaled_new_data = scaler.transform(new_frame)
    pca_new_data = pca.transform(scaled_new_data)

    # Despite its name, pred_labels holds per-class probabilities (columns
    # ordered as classifier.classes_), not hard labels.
    pred_labels = classifier.predict_proba(pca_new_data)


In [21]:
"""
Keeping it just in case
"""

# Hold-out evaluation: fit the scaler, PCA and a random forest on 80% of the
# data and score them on the remaining 20%.
#
# NOTE: this cell rebinds `scaler`, `pca` and `classifier`, replacing the
# objects fitted on the full dataset in the cells above. Re-run those cells
# before using them to transform or predict on new data.

# Splitting the dataset into the training set and test set such that it can be used for classification
X_train, X_test, y_train, y_test = train_test_split(features_X, labels_Y, test_size=0.2, random_state=0)

# Scaling features such that they all have a mean of 0 and a variance of 1.
# The scaler is fitted on the training split only, so no test-set statistics
# leak into the preprocessing.
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

pca = PCA() # can be replaced with "PCA(n_components=2)" if data is too much
pca_X_train = pca.fit_transform(scaled_X_train)
pca_X_test = pca.transform(scaled_X_test)

# How much of the variance each principal component accounts for. This is
# printed explicitly because a bare expression in the middle of a cell is
# never displayed.
print("Explained variance ratio:")
print(pca.explained_variance_ratio_)

# Training, Making Predictions and Performance Evaluation
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(pca_X_train, y_train)

# Predicting the test set results and making performance evaluation
y_pred = classifier.predict(pca_X_test)

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:")
print(cm)
print("Accuracy:")
print(accuracy_score(y_test, y_pred))
