# IMPORTS

In [None]:
# NECESSARY IMPORTS
import numpy, scipy, pandas as pd
from scipy.io.arff import loadarff
import matplotlib.pyplot as plt
import numpy as np
import random

# FOR DIMENSION REDUCTION, CLASSIFICATION MODEL AND CLUSTERING
from sklearn.decomposition import FastICA
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

from sklearn.cluster import KMeans


# FOR MAKING NETWORK AND OTHER STUFFS
import networkx as nx


# LOADING TRAINING DATASET AND TEST DATASET

In [None]:
# Earthquakes dataset in ARFF format; both splits live in the same folder.
_e_dir = "DataSets/Earthquakes"
e_train_f = _e_dir + "/Earthquakes_TRAIN.arff"  # training split
e_test_f = _e_dir + "/Earthquakes_TEST.arff"  # testing split

In [None]:
# Read the training ARFF file into a DataFrame.
e_train, e_train_meta = loadarff(e_train_f)
df = pd.DataFrame(e_train)

# The last column is the class label; everything before it is feature data.
n_features = df.shape[1] - 1
dataa = df.iloc[:, :n_features]
tagg = df.iloc[:, n_features:]  # kept as a one-column DataFrame


In [None]:
# Read the testing ARFF file into a DataFrame.
e_test, e_test_meta = loadarff(e_test_f)
df_test = pd.DataFrame(e_test)

# The last column is the class label; everything before it is feature data.
n_features_test = df_test.shape[1] - 1
dataa_test = df_test.iloc[:, :n_features_test]
tagg_test = df_test.iloc[:, n_features_test:]  # kept as a one-column DataFrame


# PRE-PROCESSING

In [None]:
# Decode the byte-string class labels to integers.
# The explicit cast guarantees an integer dtype: depending on the pandas
# version, replace() alone may leave the column as object dtype.
tagg = tagg.replace({b'1': 1, b'0': 0}).astype(int)

In [None]:
# Decode the byte-string class labels to integers.
# The explicit cast guarantees an integer dtype: depending on the pandas
# version, replace() alone may leave the column as object dtype.
tagg_test = tagg_test.replace({b'1': 1, b'0': 0}).astype(int)

In [None]:
# Feature matrices as plain NumPy arrays.
x_train = np.array(dataa)
x_test = np.array(dataa_test)

# tagg / tagg_test are one-column DataFrames, so np.array() gives shape (n, 1).
# Flatten to 1-D: scikit-learn estimators expect a 1-D target and otherwise
# emit a DataConversionWarning on every fit.
y_train = np.array(tagg).ravel()
y_test = np.array(tagg_test).ravel()

In [None]:
# # STANDARDIZING DATA # THIS CAUSES PROBLEMS WHEN BUILDING THE NETWORK
# sc = StandardScaler()
 
# x_train = sc.fit_transform(x_train)
# x_test = sc.transform(x_test)

In [None]:
# Project onto the first 100 principal components.
# fit_transform() already fits the model, so the separate fit() call was a
# redundant second decomposition. The seed keeps the projection repeatable
# in case PCA selects a randomized SVD solver.
pca = PCA(n_components=100, random_state=0)
X_pca = pca.fit_transform(x_train)
X_pca_test = pca.transform(x_test)  # reuse the components learned on the training split

In [None]:
# Start from the PCA projections; the FastICA cell that follows reassigns both names.
X_ica, X_ica_test = X_pca, X_pca_test

In [None]:
# TAKING 6 COMPONENTS

# FastICA starts from a random unmixing matrix; without a fixed seed the
# components (and every score computed from them below) change on each run.
ica = FastICA(n_components=6, random_state=0)
X_ica = ica.fit_transform(x_train)
X_ica_test = ica.transform(x_test)  # apply the unmixing learned on the training split

# Alternative: run ICA on the PCA projections instead of the raw features.
# X_ica = ica.fit_transform(X_pca)
# X_ica_test = ica.transform(X_pca_test)

# APPLYING DIFFERENT CLASSIFIERS

### APPLYING RANDOM FOREST CLASSIFIER

In [None]:
# Random forest limited to depth-5 trees, seeded for repeatability.
r_forest = RandomForestClassifier(random_state=0, max_depth=5)
r_forest.fit(X_ica, y_train)

# Accuracy on the test features, plus the predictions used by the report cell.
score = r_forest.score(X_ica_test, y_test)
r_predicted = r_forest.predict(X_ica_test)
rf_score_ = np.mean(score)

print('Accuracy : %.3f' % (100 * rf_score_))

In [None]:
# Confusion matrix and per-class precision/recall for the random forest.
print(confusion_matrix(y_true=y_test, y_pred=r_predicted))
print(classification_report(y_true=y_test, y_pred=r_predicted))

### APPLYING GRADIENT BOOSTING CLASSIFIER

In [None]:
# Gradient boosting with library defaults, seeded for repeatability.
g_boost = GradientBoostingClassifier(random_state=3)
g_boost.fit(X_ica, y_train)

# Accuracy on the test features, plus the predictions used by the report cell.
g_score = g_boost.score(X_ica_test, y_test)
predicted = g_boost.predict(X_ica_test)
g_boost_score = np.mean(g_score)

print('Accuracy : %.3f' % (100 * g_boost_score))


In [None]:
# Confusion matrix and per-class precision/recall for gradient boosting.
print(confusion_matrix(y_true=y_test, y_pred=predicted))
print(classification_report(y_true=y_test, y_pred=predicted))

### APPLYING LOGISTIC REGRESSION

In [None]:
# Logistic regression with library defaults, seeded for repeatability.
log_reg = LogisticRegression(random_state=3)
log_reg.fit(X_ica, y_train)

# Accuracy on the test features, plus the predictions used by the report cell.
lg_score = log_reg.score(X_ica_test, y_test)
lg_predicted = log_reg.predict(X_ica_test)
lg_score_ = np.mean(lg_score)

print('Accuracy : %.3f' % (100 * lg_score_))


In [None]:
# Confusion matrix and per-class precision/recall for logistic regression.
print(confusion_matrix(y_true=y_test, y_pred=lg_predicted))
print(classification_report(y_true=y_test, y_pred=lg_predicted))

### APPLYING GAUSSIAN PROCESS REGRESSOR

In [None]:
# GaussianProcessRegressor is a regressor: it predicts continuous values, and
# its score() is the coefficient of determination (R^2), not an accuracy.
gp = GaussianProcessRegressor(random_state=2)
gp.fit(X_ica, y_train)
gp_predicted = gp.predict(X_ica_test)

# R^2 of the continuous predictions against the 0/1 labels.
gp_score = gp.score(X_ica_test, y_test)
gp_score_ = np.mean(gp_score)

# The labels were encoded as 0/1 during pre-processing, so thresholding the
# regression output at 0.5 gives class predictions and a real accuracy that is
# comparable with the classifiers in the other cells.
gp_accuracy_ = np.mean(
    (np.ravel(gp_predicted) >= 0.5).astype(int) == np.ravel(y_test).astype(int)
)

print('Accuracy : %.3f' % (gp_accuracy_*100))
print('R^2 : %.3f' % gp_score_)

### APPLYING DECISION TREE CLASSIFIER

In [None]:
# Single decision tree with library defaults, seeded for repeatability.
dt = DecisionTreeClassifier(random_state=6)
dt.fit(X_ica, y_train)

# Predictions used by the report cell, then accuracy on the test features.
predicted = dt.predict(X_ica_test)
dt_score = dt.score(X_ica_test, y_test)
dt_score_ = np.mean(dt_score)

print('Accuracy : %.3f' % (100 * dt_score_))

In [None]:
# Confusion matrix and per-class precision/recall for the decision tree.
print(confusion_matrix(y_true=y_test, y_pred=predicted))
print(classification_report(y_true=y_test, y_pred=predicted))

### APPLYING SVC

In [None]:
# Support vector classifier with a polynomial kernel.
svc_model = SVC(kernel='poly', C=.3, gamma=1)
svc_model.fit(X_ica, y_train)

# Accuracy on the test features, plus the predictions used by the report cell.
svc_score = svc_model.score(X_ica_test, y_test)
prediction = svc_model.predict(X_ica_test)
svc_score_ = np.mean(svc_score)

print('Accuracy : %.3f' % (100 * svc_score_))


In [None]:
# Confusion matrix and per-class precision/recall for the SVC.
print(confusion_matrix(y_true=y_test, y_pred=prediction))
print(classification_report(y_true=y_test, y_pred=prediction))

### APPLYING ADABOOST CLASSIFIER

In [None]:
# AdaBoost with 200 estimators and a learning rate of 0.2, seeded for repeatability.
abc = AdaBoostClassifier(random_state=0, n_estimators=200, learning_rate=0.2)
model = abc.fit(X_ica, y_train)  # fit() result is kept under the name `model`

# Predict the test split and measure accuracy against the true labels.
y_pred = model.predict(X_ica_test)
ada_score_ = metrics.accuracy_score(y_test, y_pred)

print('Accuracy : %.3f' % (100 * ada_score_))


In [None]:
# Confusion matrix and per-class precision/recall for AdaBoost.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

### APPLYING KNN CLASSIFIER

In [None]:
 
# k-nearest-neighbours classifier voting over 4 neighbours.
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_ica, y_train)

# Accuracy on the test features, plus the predictions used by the report cell.
knn_score_ = np.mean(knn.score(X_ica_test, y_test))
knn_p = knn.predict(X_ica_test)

print('Accuracy : %.3f' % (100 * knn_score_))


In [None]:
# Confusion matrix and per-class precision/recall for KNN.
print(confusion_matrix(y_true=y_test, y_pred=knn_p))
print(classification_report(y_true=y_test, y_pred=knn_p))

### COMPARING CLASSIFICATION METHODS

In [None]:
# Pair every score with its label in one place so the tick labels cannot drift
# out of sync with the values. The labels were previously produced by splitting
# a hand-typed string on ",", which left leading spaces and a misspelled
# "ada_acore_" entry.
score_by_model = {
    "rf_score_": rf_score_,
    "g_boost_score": g_boost_score,
    "lg_score_": lg_score_,
    "dt_score_": dt_score_,
    "svc_score_": svc_score_,
    "ada_score_": ada_score_,
    "knn_score_": knn_score_,
}
x_ax = list(score_by_model)
strr = ", ".join(x_ax)  # kept for compatibility with the original variable
x = list(range(len(x_ax)))

# Scores are fractions in [0, 1]; plot them as percentages.
scores = np.array(list(score_by_model.values()))*100

plt.figure(figsize=(10, 10))
# plt.ylim([-1, 100])
plt.xticks(x, x_ax)
plt.plot(x, scores)
plt.scatter(x, scores)
# Write each model's label just above its point.
for i, txt in enumerate(x_ax):
    plt.annotate(txt, (x[i]-0.2 ,scores[i]+0.1))
plt.grid()

In [None]:
# Bar chart of the same percentages, with each bar's value written above it.
plt.figure(figsize=(10, 10))
plt.bar(x_ax, scores)

for x_pos, pct in zip(x, scores):
    plt.annotate(f"{pct:.2f}%", (x_pos - 0.25, pct + 0.5))

# NETWORK

In [None]:
# First ICA component of every sample: training rows first, then test rows.
mergg_0 = np.concatenate((np.array([]), X_ica[:, 0], X_ica_test[:, 0]))

# Second ICA component, in the same row order.
mergg_1 = np.concatenate((np.array([]), X_ica[:, 1], X_ica_test[:, 1]))

# One (component 0, component 1) pair per sample.
mergg = np.array([pair for pair in zip(mergg_0, mergg_1)])

In [None]:
# The network is built on the merged 2-D points (train followed by test).
points_for_network = mergg
# Alternatives tried earlier:
# points_for_network[:] = X_ica[:, 0:2]
# points_for_network[:] = list(points_for_network_2)

In [None]:
def get_distance(a, b, dis_type = 0):
    '''
    Distance between two points given as equal-length coordinate sequences.

    a, b     : indexable sequences of numbers (tuples, lists, 1-D arrays);
               coordinates are read for every index of `a`.
    dis_type : falsy (default 0) for Manhattan distance,
               truthy (e.g. 1) for Euclidean distance.

    Returns the distance as a number (0 for empty inputs).
    '''
    # Per-coordinate absolute differences, shared by both metrics.
    diffs = [np.abs(a[i] - b[i]) for i in range(len(a))]

    if dis_type:
        return sum(d**2 for d in diffs)**0.5  # Euclidean distance
    return sum(diffs)  # Manhattan distance

In [None]:
# Manhattan distance for every unordered pair of points (each pair once),
# ordered by the earlier point's index and then the later point's index.
distt = [
    get_distance(later, earlier)
    for idx, earlier in enumerate(points_for_network[:-1])
    for later in points_for_network[idx + 1:]
]

In [None]:
# SANITY CHECK: mean, maximum, minimum and standard deviation of the pairwise distances
np.mean(distt), max(distt), min(distt), np.std(distt) 

In [None]:
# Pairwise distances from largest to smallest.
dec = sorted(distt, reverse=True)

# Gap between each distance and the next smaller one.
difff = [larger - smaller for larger, smaller in zip(dec, dec[1:])]
# print(sorted(difff, reverse= True)[:10])

In [None]:
# Undirected graph with one node per point; each node is the point's coordinate tuple.
G = nx.Graph()
G.add_nodes_from(tuple(point) for point in points_for_network)

# Node list in insertion order, used for index-based pair iteration below.
nodess = list(G.nodes)

In [None]:
# Connect every pair of distinct nodes whose Manhattan distance is below the
# mean pairwise distance.
meann = np.mean(distt)
_maxx = max(distt)  # only needed by the disabled upper-bound filter below

node_count = len(nodess)
for i in range(node_count):
    # G is undirected, so each unordered pair is visited once (j > i). The
    # resulting edges and their insertion order are the same as scanning every
    # ordered pair, with half the distance computations and no self-pairs.
    for j in range(i + 1, node_count):
        _d = get_distance(nodess[i], nodess[j])
        # if _d >= _maxx :
        #     continue
        if _d < meann:
            G.add_edge(nodess[i], nodess[j])

In [None]:
# Draw the distance-threshold network: red nodes, thin black edges.
plt.figure(figsize=(10, 9))
# nx.draw_networkx_edges(G, pos = pos, alpha=0.1)

# IF WE STANDARDIZE THE DATA, WE SHALL SEE 1 ISOLATED NODE.
# NOTE(review): `label = True` is presumably meant to show node labels; in
# networkx that option is `with_labels`, while `label` appears to be the
# legend label. Confirm the intent before changing it.
nx.draw(G, width = 0.5, node_size = 25, edge_color = "black", node_color = "red", label = True)
plt.show()

In [None]:
# nx.diameter() raises on a disconnected graph (its diameter is infinite), and
# an isolated node is a documented possibility for this network. In that case
# report the diameter of the largest connected component instead.
if nx.is_connected(G):
    net_diag = nx.diameter(G)
else:
    largest_component = max(nx.connected_components(G), key=len)
    net_diag = nx.diameter(G.subgraph(largest_component))

In [None]:
# (node, degree) pairs for every node in the graph.
nodes_degree_in_graph = list(G.degree)
# print(len(nodes_degree_in_graph))

In [None]:
# Degree of every node. The values must NOT be de-duplicated: wrapping them in
# set() kept one copy of each distinct degree, so the histogram showed distinct
# degree values rather than the degree distribution, and the "average degree"
# was the mean of the distinct values rather than the mean over all nodes.
bar_x = [degree for _node, degree in nodes_degree_in_graph]
# print(bar_x)
plt.hist(bar_x, bins = 30)
plt.show()
net_avg_deg = np.average(bar_x)


In [None]:
# Mean of the per-node clustering coefficients, expressed as a percentage.
_clustering_by_node = dict(nx.clustering(G))
net_avg_cluster = np.average(list(_clustering_by_node.values())) * 100


In [None]:
# Summary of the network statistics computed in the cells above.
print(f"In the network, it has {G.number_of_nodes()} nodes and it has {G.number_of_edges()} edges")
print(f"Diameter of network : {net_diag}")
print(f"Average degree : {net_avg_deg}")
print(f"Average clustering : {net_avg_cluster}%")

In [None]:
# Two-cluster k-means fitted on the merged (train followed by test) 2-D points.
kmeans = KMeans(random_state=4, n_clusters=2)
kmeans.fit(mergg)
# kmeans.fit(x_train)

# Cluster assignment for each split, using the same two ICA components.
# kmeans_predict = kmeans.predict(x_test)
kmeans_predict = kmeans.predict(X_ica[:, :2])
kmeans_predict_test = kmeans.predict(X_ica_test[:, :2])

# Cluster id of every fitted point, as a plain Python list in mergg row order.
labell = list(kmeans.labels_)

In [None]:
# Number of points that received a cluster label (one per row of the data k-means was fitted on).
len(kmeans.labels_)

In [None]:
# True class labels as plain Python lists. tagg / tagg_test are one-column
# DataFrames, so select that column by position: this matches how the labels
# were split off and does not depend on the ARFF attribute being named "target".
y = tagg.iloc[:, 0].tolist()
yy_test = tagg_test.iloc[:, 0].tolist()

In [None]:
# labell holds one cluster id per fitted point: training rows first, then test rows.
train_clusters = labell[:len(y)]
test_clusters = labell[len(y):len(y) + len(yy_test)]

# Count samples whose cluster id equals their class label.
correct_labels = int(sum(truth == cluster for truth, cluster in zip(y, train_clusters)))
correct_labels_test = int(sum(truth == cluster for truth, cluster in zip(yy_test, test_clusters)))

total_samples = len(y) + len(yy_test)

# k-means numbers its clusters arbitrarily, so cluster 0 may correspond to
# class 1 and vice versa. With two clusters and 0/1 class labels the only
# other mapping is the swapped one, which matches exactly the samples the
# identity mapping misses. Keep whichever mapping agrees with more samples.
if 2 * (correct_labels + correct_labels_test) < total_samples:
    correct_labels = len(y) - correct_labels
    correct_labels_test = len(yy_test) - correct_labels_test

print("Result: %d out of %d samples were correctly labeled." % (correct_labels + correct_labels_test, total_samples))
print('Accuracy score: {0:0.2f}'.format((correct_labels+correct_labels_test)/(float(total_samples))))

In [None]:
u_labels = np.unique(tagg)  # distinct class labels (kept for reference)

# Plot 1: the merged points before clustering.
plt.scatter(mergg[:, 0], mergg[:, 1], c = "blue", s = 50, label = "original", alpha=0.7)
# plt.scatter(x_test[:, 0], x_test[:, 1], c = "blue", s = 100, label = "original", alpha=0.8)

plt.show()

# Plot 2: the same points coloured by k-means cluster.
# Draw each cluster once so every legend entry describes exactly the points it
# labels. Looping over the class labels redrew the identical full scatter once
# per label and tagged it with a class value unrelated to the colouring.
for cluster_id in np.unique(kmeans.labels_):
    in_train = kmeans_predict == cluster_id
    in_test = kmeans_predict_test == cluster_id
    colour = f"C{cluster_id}"  # one matplotlib cycle colour per cluster
    plt.scatter(X_ica[in_train, 0], X_ica[in_train, 1],
                c=colour, label=f"train, cluster {cluster_id}")
    plt.scatter(X_ica_test[in_test, 0], X_ica_test[in_test, 1],
                c=colour, marker="x", label=f"test, cluster {cluster_id}")

# Cluster centres as large translucent black dots.
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5, label="centers")


plt.legend()
plt.show()