In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

In [2]:
all_data = pd.read_csv("tweets_data.csv")

In [3]:
seed = 7

X = all_data.clean_text
y = all_data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

In [4]:
x_train_row = X_train.copy()

In [5]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=1000) # limit number of features to avoid overfitting
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [6]:
X_train.shape

(6090, 1000)

In [7]:
X_train = X_train.toarray()
X_test = X_test.toarray()

In [8]:
y_train = y_train.values
y_test = y_test.values

### Original NN

In [17]:
from time import time

In [15]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np

layer_1 = [2, 3, 4, 5]
layer_2 = [2, 3, 4, 5]

for i in layer_1:
    for j in layer_2:
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(i, j), random_state=1) 
        scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
        print(" layer_1: ", i, "layer_2: ", j,  " accuracy: ", np.median(scores))

 layer_1:  2 layer_2:  2  accuracy:  0.7438423645320197
 layer_1:  2 layer_2:  3  accuracy:  0.7561576354679803
 layer_1:  2 layer_2:  4  accuracy:  0.7536945812807881
 layer_1:  2 layer_2:  5  accuracy:  0.5665024630541872
 layer_1:  3 layer_2:  2  accuracy:  0.7495894909688013
 layer_1:  3 layer_2:  3  accuracy:  0.7504105090311987
 layer_1:  3 layer_2:  4  accuracy:  0.7520525451559934
 layer_1:  3 layer_2:  5  accuracy:  0.7504105090311987
 layer_1:  4 layer_2:  2  accuracy:  0.7413793103448276
 layer_1:  4 layer_2:  3  accuracy:  0.7610837438423645
 layer_1:  4 layer_2:  4  accuracy:  0.7282430213464697
 layer_1:  4 layer_2:  5  accuracy:  0.7487684729064039
 layer_1:  5 layer_2:  2  accuracy:  0.7536945812807881
 layer_1:  5 layer_2:  3  accuracy:  0.5665024630541872
 layer_1:  5 layer_2:  4  accuracy:  0.7553366174055829
 layer_1:  5 layer_2:  5  accuracy:  0.7389162561576355


In [46]:
# predict on the test data using the best parameters
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(2,3), random_state=1) 
now0 = time()
clf.fit(X_train, y_train)
print("training time: ", time()-now0)
print("Training accuracy: ", accuracy_score(y_train, clf.predict(X_train)))
now = time()
pr = clf.predict(X_test)
print("prediction time: ", time()-now)
print("Testing accuracy: ", accuracy_score(y_test, pr))

training time:  1.2049107551574707
Training accuracy:  0.8249589490968802
prediction time:  0.0017518997192382812
Testing accuracy:  0.783322390019698


In [41]:
X_train.shape

(6090, 1000)

### k-means clustering and Expectation Maximization on row data

In [22]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
%matplotlib inline
from sklearn.metrics import accuracy_score
from sklearn.decomposition import IncrementalPCA 
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import metrics
import matplotlib.cm as cm
import seaborn as sns
sns.set_style('whitegrid')
from sklearn.decomposition import PCA

In [80]:
clusterer = KMeans(n_clusters=150, random_state=10)
now = time()
cluster_labels = clusterer.fit_predict(X_train)
time() - now

21.079972743988037

In [81]:
now = time()
cluster_labels_test = clusterer.predict(X_test)
time() - now

0.01359105110168457

In [82]:
from sklearn.preprocessing import OneHotEncoder

In [83]:
cluster_labels

array([ 1, 27, 92, ..., 34, 75, 34], dtype=int32)

In [86]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(cluster_labels.reshape(-1, 1))
cl_l_onhot = enc.transform(cluster_labels.reshape(-1, 1)).toarray()

In [87]:
cl_l_test_onhot = enc.transform(cluster_labels_test.reshape(-1, 1)).toarray()

In [88]:
layer_1 = [2, 3, 4, 5]
layer_2 = [2, 3, 4, 5]

for i in layer_1:
    for j in layer_2:
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(i, j), random_state=1) 
        scores = cross_val_score(clf, cl_l_onhot, y_train, cv=5, scoring='accuracy')
        print(" layer_1: ", i, "layer_2: ", j,  " accuracy: ", np.median(scores))

 layer_1:  2 layer_2:  2  accuracy:  0.5665024630541872
 layer_1:  2 layer_2:  3  accuracy:  0.7151067323481116
 layer_1:  2 layer_2:  4  accuracy:  0.7151067323481116
 layer_1:  2 layer_2:  5  accuracy:  0.7151067323481116
 layer_1:  3 layer_2:  2  accuracy:  0.5665024630541872
 layer_1:  3 layer_2:  3  accuracy:  0.7142857142857143
 layer_1:  3 layer_2:  4  accuracy:  0.7167487684729064
 layer_1:  3 layer_2:  5  accuracy:  0.7151067323481116
 layer_1:  4 layer_2:  2  accuracy:  0.7167487684729064
 layer_1:  4 layer_2:  3  accuracy:  0.7151067323481116
 layer_1:  4 layer_2:  4  accuracy:  0.7167487684729064
 layer_1:  4 layer_2:  5  accuracy:  0.7167487684729064
 layer_1:  5 layer_2:  2  accuracy:  0.5665024630541872
 layer_1:  5 layer_2:  3  accuracy:  0.7151067323481116
 layer_1:  5 layer_2:  4  accuracy:  0.7151067323481116
 layer_1:  5 layer_2:  5  accuracy:  0.7151067323481116


In [89]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(2,3), random_state=1) 
now1 = time()
clf.fit(cl_l_onhot, y_train)
print("training time: ", time()-now1)
print("Training accuracy: ", accuracy_score(y_train, clf.predict(cl_l_onhot)))
now2 = time()
pr = clf.predict(cl_l_test_onhot)
print("prediction time: ", time()-now2)
print("Testing accuracy: ", accuracy_score(y_test, pr))

training time:  0.14440107345581055
Training accuracy:  0.7307060755336617
prediction time:  0.0005950927734375
Testing accuracy:  0.7150361129349967


In [91]:
clusterer = GaussianMixture(n_components=147, covariance_type='spherical', random_state=10)
now = time()
cluster_labels = clusterer.fit_predict(X_train)
time() - now

3.593796968460083

In [92]:
now = time()
cluster_labels_test = clusterer.predict(X_test)
time() - now

0.017373085021972656

In [93]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(cluster_labels.reshape(-1, 1))
cl_l_onhot = enc.transform(cluster_labels.reshape(-1, 1)).toarray()
cl_l_test_onhot = enc.transform(cluster_labels_test.reshape(-1, 1)).toarray()

In [94]:
layer_1 = [2, 3, 4, 5]
layer_2 = [2, 3, 4, 5]

for i in layer_1:
    for j in layer_2:
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(i, j), random_state=1) 
        scores = cross_val_score(clf, cl_l_onhot, y_train, cv=5, scoring='accuracy')
        print(" layer_1: ", i, "layer_2: ", j,  " accuracy: ", np.median(scores))

 layer_1:  2 layer_2:  2  accuracy:  0.6921182266009852
 layer_1:  2 layer_2:  3  accuracy:  0.6904761904761905
 layer_1:  2 layer_2:  4  accuracy:  0.5665024630541872
 layer_1:  2 layer_2:  5  accuracy:  0.6904761904761905
 layer_1:  3 layer_2:  2  accuracy:  0.5665024630541872
 layer_1:  3 layer_2:  3  accuracy:  0.6904761904761905
 layer_1:  3 layer_2:  4  accuracy:  0.6904761904761905
 layer_1:  3 layer_2:  5  accuracy:  0.6904761904761905
 layer_1:  4 layer_2:  2  accuracy:  0.5665024630541872
 layer_1:  4 layer_2:  3  accuracy:  0.6912972085385879
 layer_1:  4 layer_2:  4  accuracy:  0.6929392446633826
 layer_1:  4 layer_2:  5  accuracy:  0.6912972085385879
 layer_1:  5 layer_2:  2  accuracy:  0.6912972085385879
 layer_1:  5 layer_2:  3  accuracy:  0.6912972085385879
 layer_1:  5 layer_2:  4  accuracy:  0.6921182266009852
 layer_1:  5 layer_2:  5  accuracy:  0.6904761904761905


In [95]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(2,3), random_state=1) 
now1 = time()
clf.fit(cl_l_onhot, y_train)
print("training time: ", time()-now1)
print("Training accuracy: ", accuracy_score(y_train, clf.predict(cl_l_onhot)))
now2 = time()
pr = clf.predict(cl_l_test_onhot)
print("prediction time: ", time()-now2)
print("Testing accuracy: ", accuracy_score(y_test, pr))

training time:  0.31490397453308105
Training accuracy:  0.7018062397372742
prediction time:  0.0006358623504638672
Testing accuracy:  0.6736703873933026


### PCA

In [23]:
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import kurtosis

In [115]:
pca = PCA(random_state=0)
pca.fit(X_train)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=0,
  svd_solver='auto', tol=0.0, whiten=False)

In [116]:
now = time()
pca = PCA(random_state=0)
pca.fit(X_train)
transformed_data = pca.transform(X_train)[:, :600]
time() - now

0.5621092319488525

In [117]:
now = time()
transformed_data_test = pca.transform(X_test)[:, :600]
time() - now

0.050328969955444336

In [36]:
layer_1 = [2, 3, 4, 5]
layer_2 = [2, 3, 4, 5]

for i in layer_1:
    for j in layer_2:
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(i, j), random_state=1) 
        scores = cross_val_score(clf, transformed_data, y_train, cv=5, scoring='accuracy')
        print(" layer_1: ", i, "layer_2: ", j,  " accuracy: ", np.median(scores))

 layer_1:  2 layer_2:  2  accuracy:  0.7602627257799671
 layer_1:  2 layer_2:  3  accuracy:  0.7430213464696224
 layer_1:  2 layer_2:  4  accuracy:  0.7635467980295566
 layer_1:  2 layer_2:  5  accuracy:  0.7594417077175698
 layer_1:  3 layer_2:  2  accuracy:  0.7487684729064039
 layer_1:  3 layer_2:  3  accuracy:  0.7446633825944171
 layer_1:  3 layer_2:  4  accuracy:  0.7405582922824302
 layer_1:  3 layer_2:  5  accuracy:  0.7487684729064039
 layer_1:  4 layer_2:  2  accuracy:  0.7479474548440066
 layer_1:  4 layer_2:  3  accuracy:  0.7668308702791461
 layer_1:  4 layer_2:  4  accuracy:  0.5665024630541872
 layer_1:  4 layer_2:  5  accuracy:  0.7307060755336617
 layer_1:  5 layer_2:  2  accuracy:  0.5665024630541872
 layer_1:  5 layer_2:  3  accuracy:  0.7684729064039408
 layer_1:  5 layer_2:  4  accuracy:  0.7438423645320197
 layer_1:  5 layer_2:  5  accuracy:  0.7569786535303776


In [78]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(2,3), random_state=1) 
now1 = time()
clf.fit(transformed_data, y_train)
print("training time: ", time()-now1)
print("Training accuracy: ", accuracy_score(y_train, clf.predict(transformed_data)))
now2 = time()
pr = clf.predict(transformed_data_test)
print("prediction time: ", time()-now2)
print("Testing accuracy: ", accuracy_score(y_test, pr))

training time:  5.614315748214722
Training accuracy:  0.870607553366174
prediction time:  0.0056989192962646484
Testing accuracy:  0.7636244254760342


In [47]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(2,2), random_state=1) 
now1 = time()
clf.fit(transformed_data, y_train)
print("training time: ", time()-now1)
print("Training accuracy: ", accuracy_score(y_train, clf.predict(transformed_data)))
now2 = time()
pr = clf.predict(transformed_data_test)
print("prediction time: ", time()-now2)
print("Testing accuracy: ", accuracy_score(y_test, pr))

training time:  5.832362651824951
Training accuracy:  0.8303776683087027
prediction time:  0.003262042999267578
Testing accuracy:  0.7760998030203545


In [118]:
clusterer = KMeans(n_clusters=140, random_state=10)
now = time()
cluster_labels = clusterer.fit_predict(transformed_data)
time() - now

12.151318073272705

In [119]:
now = time()
cluster_labels_test = clusterer.predict(transformed_data_test)
time() - now

0.014474868774414062

In [120]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(cluster_labels.reshape(-1, 1))
cl_l_onhot = enc.transform(cluster_labels.reshape(-1, 1)).toarray()
cl_l_test_onhot = enc.transform(cluster_labels_test.reshape(-1, 1)).toarray()

In [121]:
layer_1 = [2, 3, 4, 5]
layer_2 = [2, 3, 4, 5]

for i in layer_1:
    for j in layer_2:
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(i, j), random_state=1) 
        scores = cross_val_score(clf, cl_l_onhot, y_train, cv=5, scoring='accuracy')
        print(" layer_1: ", i, "layer_2: ", j,  " accuracy: ", np.median(scores))

 layer_1:  2 layer_2:  2  accuracy:  0.7093596059113301
 layer_1:  2 layer_2:  3  accuracy:  0.7192118226600985
 layer_1:  2 layer_2:  4  accuracy:  0.7167487684729064
 layer_1:  2 layer_2:  5  accuracy:  0.7192118226600985
 layer_1:  3 layer_2:  2  accuracy:  0.7175697865353038
 layer_1:  3 layer_2:  3  accuracy:  0.7142857142857143
 layer_1:  3 layer_2:  4  accuracy:  0.7126436781609196
 layer_1:  3 layer_2:  5  accuracy:  0.7118226600985221
 layer_1:  4 layer_2:  2  accuracy:  0.7126436781609196
 layer_1:  4 layer_2:  3  accuracy:  0.7142857142857143
 layer_1:  4 layer_2:  4  accuracy:  0.7151067323481116
 layer_1:  4 layer_2:  5  accuracy:  0.7192118226600985
 layer_1:  5 layer_2:  2  accuracy:  0.7175697865353038
 layer_1:  5 layer_2:  3  accuracy:  0.7167487684729064
 layer_1:  5 layer_2:  4  accuracy:  0.7118226600985221
 layer_1:  5 layer_2:  5  accuracy:  0.7200328407224958


In [122]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(2,3), random_state=1) 
now1 = time()
clf.fit(cl_l_onhot, y_train)
print("training time: ", time()-now1)
print("Training accuracy: ", accuracy_score(y_train, clf.predict(cl_l_onhot)))
now2 = time()
pr = clf.predict(cl_l_test_onhot)
print("prediction time: ", time()-now2)
print("Testing accuracy: ", accuracy_score(y_test, pr))

training time:  0.23170089721679688
Training accuracy:  0.725615763546798
prediction time:  0.0005669593811035156
Testing accuracy:  0.716349310571241


In [128]:
clusterer = GaussianMixture(n_components=160, covariance_type='spherical', random_state=10)
now = time()
cluster_labels = clusterer.fit_predict(transformed_data)
time() - now

4.8777830600738525

In [129]:
now = time()
cluster_labels_test = clusterer.predict(transformed_data_test)
time() - now

0.014285802841186523

In [130]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(cluster_labels.reshape(-1, 1))
cl_l_onhot = enc.transform(cluster_labels.reshape(-1, 1)).toarray()
cl_l_test_onhot = enc.transform(cluster_labels_test.reshape(-1, 1)).toarray()

In [126]:
layer_1 = [2, 3, 4, 5]
layer_2 = [2, 3, 4, 5]

for i in layer_1:
    for j in layer_2:
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(i, j), random_state=1) 
        scores = cross_val_score(clf, cl_l_onhot, y_train, cv=5, scoring='accuracy')
        print(" layer_1: ", i, "layer_2: ", j,  " accuracy: ", np.median(scores))

 layer_1:  2 layer_2:  2  accuracy:  0.6970443349753694
 layer_1:  2 layer_2:  3  accuracy:  0.7019704433497537
 layer_1:  2 layer_2:  4  accuracy:  0.7060755336617406
 layer_1:  2 layer_2:  5  accuracy:  0.7060755336617406
 layer_1:  3 layer_2:  2  accuracy:  0.7060755336617406
 layer_1:  3 layer_2:  3  accuracy:  0.5665024630541872
 layer_1:  3 layer_2:  4  accuracy:  0.6995073891625616
 layer_1:  3 layer_2:  5  accuracy:  0.7036124794745484
 layer_1:  4 layer_2:  2  accuracy:  0.5665024630541872
 layer_1:  4 layer_2:  3  accuracy:  0.6995073891625616
 layer_1:  4 layer_2:  4  accuracy:  0.7019704433497537
 layer_1:  4 layer_2:  5  accuracy:  0.7077175697865353
 layer_1:  5 layer_2:  2  accuracy:  0.6962233169129721
 layer_1:  5 layer_2:  3  accuracy:  0.7060755336617406
 layer_1:  5 layer_2:  4  accuracy:  0.5665024630541872
 layer_1:  5 layer_2:  5  accuracy:  0.7060755336617406


In [131]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(2,3), random_state=1) 
now1 = time()
clf.fit(cl_l_onhot, y_train)
print("training time: ", time()-now1)
print("Training accuracy: ", accuracy_score(y_train, clf.predict(cl_l_onhot)))
now2 = time()
pr = clf.predict(cl_l_test_onhot)
print("prediction time: ", time()-now2)
print("Testing accuracy: ", accuracy_score(y_test, pr))

training time:  0.2621641159057617
Training accuracy:  0.7229885057471265
prediction time:  0.0006170272827148438
Testing accuracy:  0.7019041365725541


### ICA

In [96]:
ica = FastICA(random_state=0,n_components=7,  tol=1.0, max_iter=10000)
now = time()
ica.fit(X_train)
td = ica.transform(X_train)
print(time()-now)

0.609138011932373


In [97]:
now = time()
td_test = ica.transform(X_test)
print(time()-now)

0.004333972930908203


In [50]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(2,3), random_state=1) 
now1 = time()
clf.fit(td, y_train)
print("training time: ", time()-now1)
print("Training accuracy: ", accuracy_score(y_train, clf.predict(td)))
now2 = time()
pr = clf.predict(td_test)
print("prediction time: ", time()-now2)
print("Testing accuracy: ", accuracy_score(y_test, pr))

training time:  0.010246992111206055
Training accuracy:  0.5665024630541872
prediction time:  0.0008528232574462891
Testing accuracy:  0.5856861457649376


In [51]:
layer_1 = [2, 3, 4, 5]
layer_2 = [2, 3, 4, 5]

for i in layer_1:
    for j in layer_2:
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(i, j), random_state=1) 
        scores = cross_val_score(clf, td, y_train, cv=5, scoring='accuracy')
        print(" layer_1: ", i, "layer_2: ", j,  " accuracy: ", np.median(scores))

 layer_1:  2 layer_2:  2  accuracy:  0.6379310344827587
 layer_1:  2 layer_2:  3  accuracy:  0.5665024630541872
 layer_1:  2 layer_2:  4  accuracy:  0.6535303776683087
 layer_1:  2 layer_2:  5  accuracy:  0.6395730706075534
 layer_1:  3 layer_2:  2  accuracy:  0.6223316912972086
 layer_1:  3 layer_2:  3  accuracy:  0.5665024630541872
 layer_1:  3 layer_2:  4  accuracy:  0.6379310344827587
 layer_1:  3 layer_2:  5  accuracy:  0.638752052545156
 layer_1:  4 layer_2:  2  accuracy:  0.6535303776683087
 layer_1:  4 layer_2:  3  accuracy:  0.6543513957307061
 layer_1:  4 layer_2:  4  accuracy:  0.6379310344827587
 layer_1:  4 layer_2:  5  accuracy:  0.6568144499178982
 layer_1:  5 layer_2:  2  accuracy:  0.6444991789819376
 layer_1:  5 layer_2:  3  accuracy:  0.5665024630541872
 layer_1:  5 layer_2:  4  accuracy:  0.6559934318555009
 layer_1:  5 layer_2:  5  accuracy:  0.6371100164203612


In [52]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(2,4), random_state=1) 
now1 = time()
clf.fit(td, y_train)
print("training time: ", time()-now1)
print("Training accuracy: ", accuracy_score(y_train, clf.predict(td)))
now2 = time()
pr = clf.predict(td_test)
print("prediction time: ", time()-now2)
print("Testing accuracy: ", accuracy_score(y_test, pr))

training time:  0.16428184509277344
Training accuracy:  0.6482758620689655
prediction time:  0.0006070137023925781
Testing accuracy:  0.6506894287590282


In [98]:
clusterer = KMeans(n_clusters=3, random_state=10)
now = time()
cluster_labels = clusterer.fit_predict(td)
time() - now

0.0600428581237793

In [99]:
now = time()
cluster_labels_test = clusterer.predict(td_test)
time() - now

0.0012378692626953125

In [100]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(cluster_labels.reshape(-1, 1))
cl_l_onhot = enc.transform(cluster_labels.reshape(-1, 1)).toarray()
cl_l_test_onhot = enc.transform(cluster_labels_test.reshape(-1, 1)).toarray()

In [107]:
layer_1 = [2, 3, 4, 5, 10, 50]
layer_2 = [2, 3, 4, 5, 10, 50]

for i in layer_1:
    for j in layer_2:
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(i, j), random_state=1) 
        scores = cross_val_score(clf, cl_l_onhot, y_train, cv=5, scoring='accuracy')
        print(" layer_1: ", i, "layer_2: ", j,  " accuracy: ", np.median(scores))

 layer_1:  2 layer_2:  2  accuracy:  0.5665024630541872
 layer_1:  2 layer_2:  3  accuracy:  0.5665024630541872
 layer_1:  2 layer_2:  4  accuracy:  0.5665024630541872
 layer_1:  2 layer_2:  5  accuracy:  0.5665024630541872
 layer_1:  2 layer_2:  10  accuracy:  0.5665024630541872
 layer_1:  2 layer_2:  50  accuracy:  0.5665024630541872
 layer_1:  3 layer_2:  2  accuracy:  0.5665024630541872
 layer_1:  3 layer_2:  3  accuracy:  0.5665024630541872
 layer_1:  3 layer_2:  4  accuracy:  0.5665024630541872
 layer_1:  3 layer_2:  5  accuracy:  0.5665024630541872
 layer_1:  3 layer_2:  10  accuracy:  0.5665024630541872
 layer_1:  3 layer_2:  50  accuracy:  0.5665024630541872
 layer_1:  4 layer_2:  2  accuracy:  0.5665024630541872
 layer_1:  4 layer_2:  3  accuracy:  0.5665024630541872
 layer_1:  4 layer_2:  4  accuracy:  0.5665024630541872
 layer_1:  4 layer_2:  5  accuracy:  0.5665024630541872
 layer_1:  4 layer_2:  10  accuracy:  0.5665024630541872
 layer_1:  4 layer_2:  50  accuracy:  0.566

In [108]:
clusterer = GaussianMixture(n_components=20, covariance_type='spherical', random_state=10)
now = time()
cluster_labels = clusterer.fit_predict(td)
time() - now

0.09441614151000977

In [109]:
now = time()
cluster_labels_test = clusterer.predict(td_test)
time() - now

0.0013310909271240234

In [110]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(cluster_labels.reshape(-1, 1))
cl_l_onhot = enc.transform(cluster_labels.reshape(-1, 1)).toarray()
cl_l_test_onhot = enc.transform(cluster_labels_test.reshape(-1, 1)).toarray()

In [111]:
layer_1 = [2, 3, 4, 5, 10, 50]
layer_2 = [2, 3, 4, 5, 10, 50]

for i in layer_1:
    for j in layer_2:
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(i, j), random_state=1) 
        scores = cross_val_score(clf, cl_l_onhot, y_train, cv=5, scoring='accuracy')
        print(" layer_1: ", i, "layer_2: ", j,  " accuracy: ", np.median(scores))

 layer_1:  2 layer_2:  2  accuracy:  0.5665024630541872
 layer_1:  2 layer_2:  3  accuracy:  0.5665024630541872
 layer_1:  2 layer_2:  4  accuracy:  0.6428571428571429
 layer_1:  2 layer_2:  5  accuracy:  0.645320197044335
 layer_1:  2 layer_2:  10  accuracy:  0.645320197044335
 layer_1:  2 layer_2:  50  accuracy:  0.6428571428571429
 layer_1:  3 layer_2:  2  accuracy:  0.6428571428571429
 layer_1:  3 layer_2:  3  accuracy:  0.6428571428571429
 layer_1:  3 layer_2:  4  accuracy:  0.59688013136289
 layer_1:  3 layer_2:  5  accuracy:  0.645320197044335
 layer_1:  3 layer_2:  10  accuracy:  0.645320197044335
 layer_1:  3 layer_2:  50  accuracy:  0.6042692939244664
 layer_1:  4 layer_2:  2  accuracy:  0.645320197044335
 layer_1:  4 layer_2:  3  accuracy:  0.645320197044335
 layer_1:  4 layer_2:  4  accuracy:  0.645320197044335
 layer_1:  4 layer_2:  5  accuracy:  0.645320197044335
 layer_1:  4 layer_2:  10  accuracy:  0.6428571428571429
 layer_1:  4 layer_2:  50  accuracy:  0.6453201970443

In [114]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(2,4), random_state=1) 
now1 = time()
clf.fit(cl_l_onhot, y_train)
print("training time: ", time()-now1)
print("Training accuracy: ", accuracy_score(y_train, clf.predict(cl_l_onhot)))
now2 = time()
pr = clf.predict(cl_l_test_onhot)
print("prediction time: ", time()-now2)
print("Testing accuracy: ", accuracy_score(y_test, pr))

training time:  0.1144707202911377
Training accuracy:  0.6435139573070607
prediction time:  0.0005688667297363281
Testing accuracy:  0.6362442547603414


### Randomized Projections

In [53]:
from sklearn.random_projection import SparseRandomProjection
from scipy.linalg import pinv

In [60]:
srp = SparseRandomProjection(random_state=0, eps=0.3)
now = time()
transformed_data = srp.fit_transform(X_train)
time()-now

0.18739724159240723

In [61]:
now = time()
transformed_data_test = srp.transform(X_test)
time()-now

0.0325319766998291

In [62]:
transformed_data.shape

(6090, 968)

In [63]:
transformed_data_test.shape

(1523, 968)

In [64]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(2,3), random_state=1) 
now1 = time()
clf.fit(transformed_data, y_train)
print("training time: ", time()-now1)
print("Training accuracy: ", accuracy_score(y_train, clf.predict(transformed_data)))
now2 = time()
pr = clf.predict(transformed_data_test)
print("prediction time: ", time()-now2)
print("Testing accuracy: ", accuracy_score(y_test, pr))

training time:  1.0942189693450928
Training accuracy:  0.8458128078817734
prediction time:  0.0019609928131103516
Testing accuracy:  0.7826657912015759


In [65]:
layer_1 = [2, 3, 4, 5]
layer_2 = [2, 3, 4, 5]

for i in layer_1:
    for j in layer_2:
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(i, j), random_state=1) 
        scores = cross_val_score(clf, transformed_data, y_train, cv=5, scoring='accuracy')
        print(" layer_1: ", i, "layer_2: ", j,  " accuracy: ", np.median(scores))

 layer_1:  2 layer_2:  2  accuracy:  0.5665024630541872
 layer_1:  2 layer_2:  3  accuracy:  0.7463054187192119
 layer_1:  2 layer_2:  4  accuracy:  0.7471264367816092
 layer_1:  2 layer_2:  5  accuracy:  0.7536945812807881
 layer_1:  3 layer_2:  2  accuracy:  0.7446633825944171
 layer_1:  3 layer_2:  3  accuracy:  0.7454844006568144
 layer_1:  3 layer_2:  4  accuracy:  0.7413793103448276
 layer_1:  3 layer_2:  5  accuracy:  0.7446633825944171
 layer_1:  4 layer_2:  2  accuracy:  0.7463054187192119
 layer_1:  4 layer_2:  3  accuracy:  0.7463054187192119
 layer_1:  4 layer_2:  4  accuracy:  0.7684729064039408
 layer_1:  4 layer_2:  5  accuracy:  0.7463054187192119
 layer_1:  5 layer_2:  2  accuracy:  0.5665024630541872
 layer_1:  5 layer_2:  3  accuracy:  0.5665024630541872
 layer_1:  5 layer_2:  4  accuracy:  0.7495894909688013
 layer_1:  5 layer_2:  5  accuracy:  0.7446633825944171


In [66]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(4,4), random_state=1) 
now1 = time()
clf.fit(transformed_data, y_train)
print("training time: ", time()-now1)
print("Training accuracy: ", accuracy_score(y_train, clf.predict(transformed_data)))
now2 = time()
pr = clf.predict(transformed_data_test)
print("prediction time: ", time()-now2)
print("Testing accuracy: ", accuracy_score(y_test, pr))

training time:  1.0972130298614502
Training accuracy:  0.7998357963875206
prediction time:  0.0016188621520996094
Testing accuracy:  0.7721602101116218


### L1-based feature selection)

In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel

In [68]:
m = LogisticRegression(random_state=0, C=1.0).fit(X_train, y_train)
now = time()
model = SelectFromModel(m, prefit=True)
transformed_data = model.transform(X_train)
time() - now

0.039406776428222656

In [69]:
now = time()
transformed_data_test = model.transform(X_test)
time() - now

0.009242773056030273

In [70]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(2,3), random_state=1) 
now1 = time()
clf.fit(transformed_data, y_train)
print("training time: ", time()-now1)
print("Training accuracy: ", accuracy_score(y_train, clf.predict(transformed_data)))
now2 = time()
pr = clf.predict(transformed_data_test)
print("prediction time: ", time()-now2)
print("Testing accuracy: ", accuracy_score(y_test, pr))

training time:  0.022879838943481445
Training accuracy:  0.5665024630541872
prediction time:  0.0014598369598388672
Testing accuracy:  0.5856861457649376


In [71]:
layer_1 = [2, 3, 4, 5]
layer_2 = [2, 3, 4, 5]

for i in layer_1:
    for j in layer_2:
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(i, j), random_state=1) 
        scores = cross_val_score(clf, transformed_data, y_train, cv=5, scoring='accuracy')
        print(" layer_1: ", i, "layer_2: ", j,  " accuracy: ", np.median(scores))

 layer_1:  2 layer_2:  2  accuracy:  0.5665024630541872
 layer_1:  2 layer_2:  3  accuracy:  0.5665024630541872
 layer_1:  2 layer_2:  4  accuracy:  0.7857142857142857
 layer_1:  2 layer_2:  5  accuracy:  0.7865353037766831
 layer_1:  3 layer_2:  2  accuracy:  0.7881773399014779
 layer_1:  3 layer_2:  3  accuracy:  0.7816091954022989
 layer_1:  3 layer_2:  4  accuracy:  0.7931034482758621
 layer_1:  3 layer_2:  5  accuracy:  0.7931034482758621
 layer_1:  4 layer_2:  2  accuracy:  0.7865353037766831
 layer_1:  4 layer_2:  3  accuracy:  0.5665024630541872
 layer_1:  4 layer_2:  4  accuracy:  0.7848932676518884
 layer_1:  4 layer_2:  5  accuracy:  0.7922824302134647
 layer_1:  5 layer_2:  2  accuracy:  0.7857142857142857
 layer_1:  5 layer_2:  3  accuracy:  0.7873563218390804
 layer_1:  5 layer_2:  4  accuracy:  0.7963875205254516
 layer_1:  5 layer_2:  5  accuracy:  0.7848932676518884


In [75]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5,4), random_state=1) 
now1 = time()
clf.fit(transformed_data, y_train)
print("training time: ", time()-now1)
print("Training accuracy: ", accuracy_score(y_train, clf.predict(transformed_data)))
now2 = time()
pr = clf.predict(transformed_data_test)
print("prediction time: ", time()-now2)
print("Testing accuracy: ", accuracy_score(y_test, pr))

training time:  0.5543522834777832
Training accuracy:  0.8313628899835797
prediction time:  0.0007219314575195312
Testing accuracy:  0.7721602101116218
