### Load Data

In [1]:
from src import vectorization
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.multioutput import MultiOutputClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score
import datetime

In [2]:
vectorizer = vectorization.Vectorizer('messages_info.db')

X_train, X_test, y_train, y_test = vectorizer.fit(workers=10)

100%|███████████████████████████████████████████| 31/31 [00:41<00:00,  1.35s/it]


### Visualization

In [3]:
X = np.vstack([X_train, X_test])
X.shape

(26216, 300)

In [4]:
X_reduced = PCA(n_components=50).fit_transform(X)
visual = TSNE(n_components=2,
              perplexity=X_reduced.shape[0]/y_train.shape[1],
              learning_rate=500,
              early_exaggeration=50,  ### Early exaggertaion or learning_rate are too high
              n_iter=1500,
              metric='cosine',
              verbose=2).fit_transform(X_reduced)

visual = pd.DataFrame(visual, columns=['x', 'y'])

fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(x='x', y='y', data=visual, ax=ax)

[t-SNE] Computing 2248 nearest neighbors...
[t-SNE] Indexed 26216 samples in 0.002s...
[t-SNE] Computed neighbors for 26216 samples in 26.230s...
[t-SNE] Computed conditional probabilities for sample 1000 / 26216
[t-SNE] Computed conditional probabilities for sample 2000 / 26216
[t-SNE] Computed conditional probabilities for sample 3000 / 26216
[t-SNE] Computed conditional probabilities for sample 4000 / 26216
[t-SNE] Computed conditional probabilities for sample 5000 / 26216
[t-SNE] Computed conditional probabilities for sample 6000 / 26216
[t-SNE] Computed conditional probabilities for sample 7000 / 26216
[t-SNE] Computed conditional probabilities for sample 8000 / 26216
[t-SNE] Computed conditional probabilities for sample 9000 / 26216
[t-SNE] Computed conditional probabilities for sample 10000 / 26216
[t-SNE] Computed conditional probabilities for sample 11000 / 26216
[t-SNE] Computed conditional probabilities for sample 12000 / 26216
[t-SNE] Computed conditional probabilities for 

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\cagli\Anaconda3\envs\dashmote\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-7f6df7c684af>", line 8, in <module>
    verbose=2).fit_transform(X_reduced)
  File "C:\Users\cagli\Anaconda3\envs\dashmote\lib\site-packages\sklearn\manifold\t_sne.py", line 895, in fit_transform
    embedding = self._fit(X)
  File "C:\Users\cagli\Anaconda3\envs\dashmote\lib\site-packages\sklearn\manifold\t_sne.py", line 813, in _fit
    skip_num_points=skip_num_points)
  File "C:\Users\cagli\Anaconda3\envs\dashmote\lib\site-packages\sklearn\manifold\t_sne.py", line 864, in _tsne
    **opt_args)
  File "C:\Users\cagli\Anaconda3\envs\dashmote\lib\site-packages\sklearn\manifold\t_sne.py", line 357, in _gradient_descent
    inc = update * grad < 0.0
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most rece

KeyboardInterrupt: 

### Model Testing (Supervised)

In [None]:
labels = ['Logistic Regression', 'SVC', 'AdaBoost', 'GradientBoosting']

In [None]:
logreg = MultiOutputClassifier(LogisticRegression(solver='lbfgs', n_jobs=12))
logreg_start = datetime.datetime.now()
logreg.fit(X_train, y_train)
logreg_end = datetime.datetime.now()
y_hat_logreg = logreg.predict(X_test)

logreg_results = {
    'Training Time': str(logreg_end-logreg_start),
    'Average F1-Score': np.mean([f1_score(y_test.values[:, category], y_hat_logreg[:, category], average='weighted') for category in range(y_test.shape[1])])
}
logreg_results

In [None]:
svc = MultiOutputClassifier(SVC(gamma='scale'))
svc_start = datetime.datetime.now()
svc.fit(X_train, y_train)
svc_end = datetime.datetime.now()
y_hat_svc = svc.predict(X_test)

svc_results = {
    'Training Time': str(svc_end-svc_start),
    'Average F1-Score': np.mean([f1_score(y_test.values[:, category], y_hat_svc[:, category], average='weighted') for category in range(y_test.shape[1])])
}
svc_results

In [None]:
adaboost = MultiOutputClassifier(AdaBoostClassifer())
adaboost_start = datetime.datetime.now()
adaboost.fit(X_train, y_train)
adaboost_end = datetime.datetime.now()
y_hat_adaboost = adaboost.predict(X_test)

adaboost_results = {
    'Training Time': str(adaboost_end-adaboost_start),
    'Average F1-score': np.mean([f1_score(y_test.values[:, category], y_hat_adaboost[:, category], average='weighted') for category in range(y_test.shape[1])])
}
adaboost_results

In [None]:
gradboost = MultiOutputClassifier(GradientBoostingClassifier())
gradboost_start = datetime.datetime.now()
gradboost.fit(X_train, y_train)
gradboost_end = datetime.datetime.now()
y_hat_gradboost = gradboost.predict(X_train, y_train)

gradboost_results = {
    'Training Time': str(gradboost_end-gradboost_start),
    'Average F1-Score': np.mean([f1_score(y_test.values[:, category], y_hat_gradboost[:, category], average='weighted') for category in range(y_test.shape[1])])
}
gradboost_results

### Model Testing (Unsupervised)

Test here:
- KMeans
- GMM 

With one cluster for each category in y_train

Also test OPTICS