In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

In [11]:
# Load the training features (X) and their labels (y) from CSV into DataFrames.
X, y = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../Dataset/x_train_all.csv',
        '../../Dataset/y_train_all.csv',
    )
)

In [12]:
# Report the dimensions of the full feature and label frames.
# Each tuple is one printed line; its items are passed to print() as-is.
for _row in (
    ("========================================",),
    ("Shape of X is  :", X.shape),
    ("Shape of y is  :", y.shape),
    ("========================================",),
):
    print(*_row)

Shape of X is  : (9690, 2304)
Shape of y is  : (9690, 1)


In [13]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified. If the classes are imbalanced
# (SMOTE is applied further down), consider stratify=y — confirm before changing,
# since it would alter every downstream result.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Report the dimensions of every piece of the train/test split.
_rule = "========================================"

print(_rule)
print("=   Shape to train and test Dataset    =")
print(_rule)
for _caption, _part in (
    ("= Shape of X_train is  :", X_train),
    ("= Shape of y_train is  :", y_train),
    ("= Shape of X_test is   :", X_test),
    ("= Shape of y_test is   :", y_test),
):
    print(_caption, _part.shape)
print(_rule)

=   Shape to train and test Dataset    =
= Shape of X_train is  : (7752, 2304)
= Shape of y_train is  : (7752, 1)
= Shape of X_test is   : (1938, 2304)
= Shape of y_test is   : (1938, 1)


In [15]:
# Oversample the training split with SMOTE (seeded for reproducibility).
# Only X_train / y_train are resampled; the test split is not touched here.
smote = SMOTE(
    sampling_strategy='auto',
    random_state=42,
)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Display the shapes of the resampled features and labels as the cell output.
tuple(frame.shape for frame in (X_resampled, y_resampled))

((17930, 2304), (17930, 1))

In [16]:
# Report the dimensions of the SMOTE-resampled training data.
_rule = "========================================"

print(_rule)
print("=====   Shape to train Dataset     =====")
print(_rule)
for _caption, _part in (
    ("= Shape of X_train is  :", X_resampled),
    ("= Shape of y_train is  :", y_resampled),
):
    print(_caption, _part.shape)
print(_rule)

=====   Shape to train Dataset     =====
= Shape of X_train is  : (17930, 2304)
= Shape of y_train is  : (17930, 1)


In [18]:
# Gaussian mixture on the original (non-oversampled) training split.
# fit() returns the fitted estimator, so construction and fitting are chained.
gmm_xt = GaussianMixture(n_components=10, random_state=42).fit(X_train)

# Hard component label per row, followed by the per-component membership
# probabilities for the same rows.
labels_gmm_xt = gmm_xt.predict(X_train)
probs_xtrain = gmm_xt.predict_proba(X_train)

In [22]:
# Mean silhouette coefficient of the GMM labels over the X_train rows.
silhouette_avg_xt = silhouette_score(
    X=X_train,
    labels=labels_gmm_xt,
)
print("The average silhouette_score with X_train is :", silhouette_avg_xt)

The average silhouette_score with X_train is : 0.14229382846914454


In [19]:
# Gaussian mixture on the SMOTE-oversampled training data.
# fit() returns the fitted estimator, so construction and fitting are chained.
gmm_xos = GaussianMixture(n_components=10, random_state=42).fit(X_resampled)

# Hard component label per row, followed by the per-component membership
# probabilities for the same rows.
labels_gmm_xos = gmm_xos.predict(X_resampled)
probs_xos = gmm_xos.predict_proba(X_resampled)

In [23]:
# Mean silhouette coefficient of the GMM labels over the X_resampled rows.
silhouette_avg_xos = silhouette_score(
    X=X_resampled,
    labels=labels_gmm_xos,
)
print("The average silhouette_score with X_resampled is :", silhouette_avg_xos)

The average silhouette_score with X_resampled is : 0.160883057347202


In [27]:
# Expectation-Maximization summary for the mixture fitted on X_train.
#
# The hard labels and the silhouette score for gmm_xt on X_train were already
# computed in the earlier cells (labels_gmm_xt, silhouette_avg_xt). Calling
# gmm_xt.predict(X_train) and silhouette_score(X_train, ...) again only
# reproduces the same values while repeating a costly pairwise-distance pass,
# so the existing results are reused here.

# Copy so later in-place edits to one array cannot silently change the other.
cluster_assingment_xt = labels_gmm_xt.copy()
# Guard against stale notebook state: there must be one label per X_train row.
assert len(cluster_assingment_xt) == len(X_train), "labels_gmm_xt is out of sync with X_train"

silhouette_em_xt = silhouette_avg_xt

# Fitted parameters of the mixture components.
cluster_means_xt = gmm_xt.means_
cluster_covariances_xt = gmm_xt.covariances_

print("========================================")
print("=====   Expectation-Maximization   =====")
print("========================================")
print("= cluster_assingment_xt is  :", cluster_assingment_xt.shape)
print("= cluster_means_xt is       :", cluster_means_xt.shape)
print("= cluster_covariances_xt is :", cluster_covariances_xt.shape)
print("========================================")
print("The average silhouette_score with X_train is :", silhouette_em_xt)
print("========================================")

=====   Expectation-Maximization   =====
= cluster_assingment_xt is  : (7752,)
= cluster_means_xt is       : (10, 2304)
= cluster_covariances_xt is : (10, 2304, 2304)
The average silhouette_score with X_train is : 0.14229382846914454


In [28]:
# Expectation-Maximization summary for the mixture fitted on X_resampled.
#
# The hard labels and the silhouette score for gmm_xos on X_resampled were
# already computed in the earlier cells (labels_gmm_xos, silhouette_avg_xos).
# Calling gmm_xos.predict(X_resampled) and silhouette_score(X_resampled, ...)
# again only reproduces the same values while repeating a costly
# pairwise-distance pass, so the existing results are reused here.

# Copy so later in-place edits to one array cannot silently change the other.
cluster_assingment_xos = labels_gmm_xos.copy()
# Guard against stale notebook state: there must be one label per X_resampled row.
assert len(cluster_assingment_xos) == len(X_resampled), "labels_gmm_xos is out of sync with X_resampled"

silhouette_em_xos = silhouette_avg_xos

# Fitted parameters of the mixture components.
cluster_means_xos = gmm_xos.means_
cluster_covariances_xos = gmm_xos.covariances_

print("========================================")
print("=====   Expectation-Maximization   =====")
print("========================================")
print("= cluster_assingment_xos is  :", cluster_assingment_xos.shape)
print("= cluster_means_xos is       :", cluster_means_xos.shape)
print("= cluster_covariances_xos is :", cluster_covariances_xos.shape)
print("========================================")
print("The average silhouette_score with X_resampled is :", silhouette_em_xos)
print("========================================")


=====   Expectation-Maximization   =====
= cluster_assingment_xos is  : (17930,)
= cluster_means_xos is       : (10, 2304)
= cluster_covariances_xos is : (10, 2304, 2304)
The average silhouette_score with X_resampled is : 0.160883057347202
