**Note:** The previous baseline model used `CountVectorizer` to encode the text data, which results in an embedding whose dimension equals the size of the whole vocabulary. With such a large corpus and so many unique words, that model is very hard to train, so word2vec is used instead in the baseline model to give a more realistic result.

In [None]:
import tensorflow as tf

# Ask TensorFlow which GPU device it sees and stop the notebook unless it is
# exactly the first GPU device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
# Mount Google Drive inside the Colab runtime; the dataset read further down
# is loaded from a path under this mount point.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### import packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import feature_extraction,feature_selection

import tensorflow as tf
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.callbacks import History

from wordcloud import WordCloud, STOPWORDS

### read data

In [None]:
# Load the comma-separated dataset from the mounted Drive folder. Later cells
# read its 'cleaned_text' and 'label' columns.
df = pd.read_csv(
    '/content/drive/My Drive/data/dataset.csv',
    sep=',',
)

In [None]:
# Two text encoders with default settings: raw term counts and TF-IDF weights.
# The classes are the ones imported directly from sklearn.feature_extraction.text.
vectorizer_count = CountVectorizer()
vectorizer_tfidf = TfidfVectorizer()

In [None]:
# Learn the vocabulary and encode the whole corpus in one call, then show how
# many distinct terms the count vectorizer found.
X_train_c = vectorizer_count.fit_transform(df['cleaned_text'])
dic_vocabulary_c = vectorizer_count.vocabulary_
len(dic_vocabulary_c)

104456

In [None]:
# Visualise how sparse the count matrix is: True (zero entry) vs False for 100
# randomly drawn vocabulary columns.
# The column range comes from X_train_c itself (the original referenced an
# undefined `X`), and the sparse matrix is sliced *before* densifying so only
# the 100 sampled columns are materialised instead of the full vocabulary.
sample_cols = np.random.randint(0, X_train_c.shape[1], 100)
sns.heatmap(X_train_c[:, sample_cols].toarray()==0, vmin=0, vmax=1, cbar=False).set_title('Sparse Matrix Sample')

In [None]:
# Learn the vocabulary and encode the whole corpus with TF-IDF weights in one
# call, then show the vocabulary size.
X_train_t = vectorizer_tfidf.fit_transform(df['cleaned_text'])
dic_vocabulary_t = vectorizer_tfidf.vocabulary_
len(dic_vocabulary_t)

104456

In [None]:
# Sparsity sample for the TF-IDF matrix fitted in the previous cell.
# Fixes: plot X_train_t (the original re-plotted the count matrix here) and take
# the column range from the plotted matrix instead of an undefined `X`.
# Slicing the sparse matrix before densifying keeps memory use to 100 columns.
sample_cols = np.random.randint(0, X_train_t.shape[1], 100)
sns.heatmap(X_train_t[:, sample_cols].toarray()==0, vmin=0, vmax=1, cbar=False).set_title('Sparse Matrix Sample')

### perform feature selection

Use a chi-square test to select the features that are related to the label.

In [None]:
## no need to run this cell
def optimize_vectorizer(vectorizer, X_train, y=None, p_value_limit=0.95):
  """Select vocabulary terms that a chi-square test links to the label.

  For every distinct label value a one-vs-rest chi-square test is run on
  ``X_train``; a term is kept when its score ``1 - p`` exceeds
  ``p_value_limit`` for at least one label value.

  Args:
    vectorizer: fitted vectorizer whose feature names describe the columns
      of ``X_train``.
    X_train: document-term matrix produced by ``vectorizer``.
    y: labels aligned with the rows of ``X_train``. Defaults to the
      notebook-level ``df["label"]`` (the original behaviour).
    p_value_limit: minimum ``1 - p`` score a term needs to be kept.

  Returns:
    List of unique feature names, ordered by label value and then by
    descending score.
  """
  if y is None:
    y = df["label"]

  # get_feature_names() was removed in scikit-learn 1.2; use its replacement
  # when available and fall back for older installations.
  if hasattr(vectorizer, "get_feature_names_out"):
    X_names = vectorizer.get_feature_names_out()
  else:
    X_names = vectorizer.get_feature_names()

  # Collect one filtered frame per label value and concatenate once:
  # DataFrame.append was removed in pandas 2.0 and growing a frame inside a
  # loop re-copies it on every iteration.
  per_label = []
  for cat in np.unique(y):
    chi2, p = feature_selection.chi2(X_train, y==cat)
    scores = pd.DataFrame({"feature":X_names, "score":1-p, "y":cat})
    per_label.append(scores[scores["score"]>p_value_limit])

  dtf_features = pd.concat(per_label).sort_values(["y","score"],
                                                  ascending=[True,False])
  X_names = dtf_features["feature"].unique().tolist()
  return X_names

In [None]:
# Chi-square feature selection for the count vectorizer: keep every term whose
# score (1 - p) against at least one label value exceeds p_value_limit, then
# rebuild the vectorizer restricted to that vocabulary.
# NOTE(review): the test uses the labels of the whole dataset, including rows
# that later end up in the test split, so downstream scores may be optimistic.
y = df["label"]
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement.
if hasattr(vectorizer_count, "get_feature_names_out"):
  v_c = vectorizer_count.get_feature_names_out()
else:
  v_c = vectorizer_count.get_feature_names()
p_value_limit = 0.95
# Build one filtered frame per label value and concatenate once
# (DataFrame.append was removed in pandas 2.0).
selected_frames = []
for cat in np.unique(y):
  chi2, p = feature_selection.chi2(X_train_c, y==cat)
  scores = pd.DataFrame({"feature":v_c, "score":1-p, "y":cat})
  selected_frames.append(scores[scores["score"]>p_value_limit])
dtf_features = pd.concat(selected_frames).sort_values(["y","score"],
                                                      ascending=[True,False])
v_c = dtf_features["feature"].unique().tolist()

vectorizer_count = feature_extraction.text.CountVectorizer(vocabulary=v_c)

In [None]:
# Chi-square feature selection for the TF-IDF vectorizer; reuses `y` and
# `p_value_limit` defined in the count-vectorizer selection cell.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement.
if hasattr(vectorizer_tfidf, "get_feature_names_out"):
  v_t = vectorizer_tfidf.get_feature_names_out()
else:
  v_t = vectorizer_tfidf.get_feature_names()
# Build one filtered frame per label value and concatenate once
# (DataFrame.append was removed in pandas 2.0).
selected_frames = []
for cat in np.unique(y):
  chi2, p = feature_selection.chi2(X_train_t, y==cat)
  scores = pd.DataFrame({"feature":v_t, "score":1-p, "y":cat})
  selected_frames.append(scores[scores["score"]>p_value_limit])
dtf_features = pd.concat(selected_frames).sort_values(["y","score"],
                                                      ascending=[True,False])
v_t = dtf_features["feature"].unique().tolist()

vectorizer_tfidf = feature_extraction.text.TfidfVectorizer(vocabulary=v_t)

In [None]:
# Re-encode the corpus with the count vectorizer restricted to the selected
# vocabulary and show the reduced vocabulary size.
X_train_c = vectorizer_count.fit_transform(df['cleaned_text'])
dic_vocabulary_c = vectorizer_count.vocabulary_
len(dic_vocabulary_c)

24717

In [None]:
# Sparsity sample of the reduced count matrix (True = zero entry) for 100
# randomly drawn columns.
# The column range comes from X_train_c itself (the original referenced an
# undefined `X`), and the sparse matrix is sliced before densifying so only
# the sampled columns are materialised.
sample_cols = np.random.randint(0, X_train_c.shape[1], 100)
sns.heatmap(X_train_c[:, sample_cols].toarray()==0, vmin=0, vmax=1, cbar=False).set_title('Sparse Matrix Sample for count')

In [None]:
# Re-encode the corpus with the TF-IDF vectorizer restricted to the selected
# vocabulary.
X_train_t = vectorizer_tfidf.fit_transform(df['cleaned_text'])
dic_vocabulary_t = vectorizer_tfidf.vocabulary_

In [None]:
# Sparsity sample of the reduced TF-IDF matrix (True = zero entry).
# Fixes: the plot is titled "for tfidf" but drew the count matrix, and the
# column range referenced an undefined `X`. The sparse matrix is sliced before
# densifying so only the 100 sampled columns are materialised.
sample_cols = np.random.randint(0, X_train_t.shape[1], 100)
sns.heatmap(X_train_t[:, sample_cols].toarray()==0, vmin=0, vmax=1, cbar=False).set_title('Sparse Matrix Sample for tfidf')

### embedding & classifier selection

In [None]:
def get_prediction(vectorizer, classifier, X_train, X_test, y_train, y_test):
    """Fit vectorizer + classifier as one pipeline and print test metrics.

    The pipeline is trained on (X_train, y_train) and evaluated on
    (X_test, y_test). Accuracy (as a percentage rounded to two decimals),
    the confusion matrix and the classification report are printed;
    nothing is returned.
    """
    fitted = Pipeline(steps=[('vector', vectorizer),
                             ('model', classifier)]).fit(X_train, y_train)
    predictions = fitted.predict(X_test)

    accuracy_pct = round(accuracy_score(y_test, predictions)*100, 2)
    print("Accuarcy: {}".format(accuracy_pct))
    print("Confusion Matrix: \n", confusion_matrix(y_test, predictions))
    print("Classification Report: \n", classification_report(y_test, predictions))

In [None]:
# Hold out 30% of the data and compare every classifier with both encoders.
# NOTE(review): the vocabularies inside vectorizer_count / vectorizer_tfidf were
# chosen with a chi-square test on the full dataset, so the held-out rows have
# already influenced feature selection and these scores may be optimistic.
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text'], df['label'], test_size = 0.3, random_state= 42)
# LogisticRegression gets a higher max_iter because the default of 100 stops
# lbfgs before convergence on this data (see the "ITERATIONS REACHED LIMIT"
# warning). Stochastic estimators are seeded so reruns give the same numbers.
classifiers = [LogisticRegression(max_iter=1000), SGDClassifier(random_state=42), MultinomialNB(), BernoulliNB(),
               LinearSVC(random_state=42), KNeighborsClassifier(n_neighbors=5),
               DecisionTreeClassifier(random_state=42), GradientBoostingClassifier(random_state=42),
               RandomForestClassifier(random_state=42), XGBClassifier(random_state=42)]
for classifier in classifiers:
    print("\n\n", classifier)
    print("***********Usng Count Vectorizer****************")
    get_prediction(vectorizer_count, classifier, X_train, X_test, y_train, y_test)
    print("***********Usng TFIDF Vectorizer****************")
    get_prediction(vectorizer_tfidf, classifier, X_train, X_test, y_train, y_test)



 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
***********Usng Count Vectorizer****************


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuarcy: 97.39
Confusion Matrix: 
 [[7829  173]
 [ 228 7140]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.98      0.98      8002
           1       0.98      0.97      0.97      7368

    accuracy                           0.97     15370
   macro avg       0.97      0.97      0.97     15370
weighted avg       0.97      0.97      0.97     15370

***********Usng TFIDF Vectorizer****************
Accuarcy: 96.3
Confusion Matrix: 
 [[7736  266]
 [ 303 7065]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.97      0.96      8002
           1       0.96      0.96      0.96      7368

    accuracy                           0.96     15370
   macro avg       0.96      0.96      0.96     15370
weighted avg       0.96      0.96      0.96     15370



 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0



Accuarcy: 96.75
Confusion Matrix: 
 [[7785  217]
 [ 282 7086]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      8002
           1       0.97      0.96      0.97      7368

    accuracy                           0.97     15370
   macro avg       0.97      0.97      0.97     15370
weighted avg       0.97      0.97      0.97     15370

***********Usng TFIDF Vectorizer****************
Accuarcy: 97.16
Confusion Matrix: 
 [[7810  192]
 [ 245 7123]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.98      0.97      8002
           1       0.97      0.97      0.97      7368

    accuracy                           0.97     15370
   macro avg       0.97      0.97      0.97     15370
weighted avg       0.97      0.97      0.97     15370



 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jo

### parameter selection

In [None]:
from pprint import pprint
from time import time
import logging

from sklearn.model_selection import GridSearchCV

In [None]:
# Two-step pipeline tuned by the grid search below: count-based text encoding
# ('vect') followed by logistic regression ('clf'). The step names are the
# prefixes used in the parameter grid keys.
# A TfidfVectorizer step named 'tfidf' was tried as an alternative and is disabled.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression()),
])

In [None]:
# Search space for GridSearchCV; keys are '<pipeline step>__<parameter>'.
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (2, 2), (1, 2)),
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__penalty': ('l1', 'l2'),
    # LogisticRegression's default 'lbfgs' solver rejects penalty='l1', so every
    # l1 candidate would fail to fit and be scored NaN. 'liblinear' supports
    # both penalties, making the whole grid valid.
    'clf__solver': ('liblinear',),
    'clf__max_iter': (50,100,200),
    #'clf__n_neighbors': list(range(2,8,2)),
    #'clf__p': (1,2),
}

In [None]:
# Exhaustive search over `parameters` using all available cores.
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters,
                           n_jobs=-1, verbose=1)

# Echo the configuration so the logged output documents what was searched.
print("Performing grid search...")
print("pipeline:", [step_name for step_name, _step in pipeline.steps])
print("parameters:")
pprint(parameters)

# Time the whole search on the full dataset.
t0 = time()
grid_search.fit(df['cleaned_text'], df['label'])
print("done in %0.3fs" % (time() - t0))
print()

# Report the best cross-validated score and the winning value of every
# parameter that was part of the grid.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'clf']
parameters:
{'clf__max_iter': (50, 100, 200),
 'clf__penalty': ('l1', 'l2'),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__max_features': (None, 5000, 10000, 50000),
 'vect__ngram_range': ((1, 1), (2, 2), (1, 2))}
Fitting 5 folds for each of 216 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 62.0min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 151.2min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 283.4min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed: 411.7min finished


done in 24910.327s

Best score: 0.935
Best parameters set:
	clf__max_iter: 200
	clf__penalty: 'l2'
	vect__max_df: 0.75
	vect__max_features: None
	vect__ngram_range: (1, 2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
