In [1]:
# !pip install -r requirements.txt
# !camel_data light
# import nltk
# nltk.download('stopwords')

In [79]:
# built in modules
import re
import random
import numpy as np
import pandas as pd
from sklearn import metrics 

# .py files 
from preprocess import Preprocess
from feature_extraction import BOW
from feature_extraction import TFIDF
from feature_extraction import CBOW
from feature_extraction import SG

from classical_models import SVMmodel
# Can use this library to reload a specific module if the notebook can't see changes in the imported module
import importlib

Stance detection labels meaning is as follows:
1. Positive (1): means that the tweet author encourages and supports vaccination.
2. Negative (-1): means that the tweet author refuses vaccination.
3. Neutral (0): means that the tweet neither supports nor refuses vaccination.

Category labels meaning is as follows:
1. Info_News: Information about vaccination.
2. Celebrities: mentioning celebrities taking vaccinations.
3. Plan: Governmental plan or progress of vaccination.
4. Request: Requests from governments regarding the vaccination process.
5. Rumor: the tweet is a rumor.
6. Advice: Advice related to the virus or the vaccination
7. Restriction: Restrictions due to the virus e.g. traveling.
8. Personal: Personal opinion or story about vaccination.
9. Unrelated: Unrelated to vaccination.
10.Others: Vaccination related but not one of the above.

In [23]:
t = pd.read_csv('./Dataset/train.csv')
d = pd.read_csv('./Dataset/dev.csv')


In [24]:
preprocess = Preprocess()
#
categories = ["info_news", "celebrity", "plan", "requests", "rumors", "advice", "restrictions", "personal", "unrelated", "others"]

Emojis:  True
Lemmatizor:  camel


In [25]:
# we have 9 categories so we have an array of len 9 for each y
def encode_category(y):
    '''
    Input: y a list of string labels for the category of each document 
    Output: a list of encoded 10 sized array for the category of each doc 
            for "other" category , it has an array =[0 0 0 0 0 0 0 0 0 1] 
    '''
    return [categories.index(ele) for ele in y]

### Understanding the Data
- 80% tweets are positive (class 1), the rest are neutral and negative.
- 50% tweets belongs to info_news category.

In [26]:
# analyze dataset
print(t.info())
# d
# print(d.info())

# count for each label
print("####################################")
print("counts for each lablel :")
print(t['category'].value_counts(normalize=True))
# same for d
# print(d['category'].value_counts(normalize=True))

# count for stance labels
print("####################################")
print("counts for each stance label :")
print(t['stance'].value_counts(normalize=True))
# d
print(d['stance'].value_counts(normalize=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6988 entries, 0 to 6987
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      6988 non-null   object
 1   category  6988 non-null   object
 2   stance    6988 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 163.9+ KB
None
####################################
counts for each lablel :
info_news       0.517459
personal        0.146680
celebrity       0.139525
plan            0.086720
unrelated       0.046222
others          0.023898
requests        0.016027
rumors          0.011305
advice          0.009588
restrictions    0.002576
Name: category, dtype: float64
####################################
counts for each stance label :
 1    0.792501
 0    0.144820
-1    0.062679
Name: stance, dtype: float64
 1    0.804
 0    0.126
-1    0.070
Name: stance, dtype: float64


In [27]:
X = t['text']
ys = t['stance']
yc = t['category']

In [28]:
print("####################################")
print("duplicated rows :")
print(t[t.duplicated()])

####################################
duplicated rows :
                                                   text   category  stance
529   اللقاح لازم ينحفظ بدرجة حرارة ٧٠ دون الصفر وال...  info_news       1
740   هناك مصادر لقاح أثبتت نجاحها غير أمريكا ويمكن ...  info_news       1
963   لو كان اللقاح الأمريكي أمن ومفيد لما أصبحت الو...   personal      -1
977   اللقاح الأمريكي لم ينجح في الاختبارات السريرية...  info_news      -1
1684    #نريد_لقاح_آمن حتى نأمن صحة اطفالنا في المستقبل   requests       1
1878  هناك مصادر لقاح أثبتت نجاحها غير أمريكا ويمكن ...  info_news       1
2477  محمد بن زايد: الإمارات قدمت أكثر من مليون و 27...  info_news       1
2644         نريد التخلص من كورونا لكننا #نريد_لقاح_آمن   requests       1
2646  في ظل تسابق دول العالم على تطعيم شعوبها.. تَعَ...  info_news       1
2786  لو كان اللقاح الأمريكي أمن ومفيد لما أصبحت الو...   personal      -1
3231  #نريد_لقاح_آمن حتى ترجع الحياة الى طبيعتها بعد...  info_news       1
3728    #نريد_لقاح_آمن حتى نأمن صحة اطفالنا ف

In [29]:
X = t['text']
ys = t['stance']
yc = t['category']

In [30]:
print(yc[0:10])
yc=encode_category(yc)
print(yc[0:10])

0    celebrity
1    info_news
2    info_news
3    celebrity
4     personal
5    info_news
6    info_news
7     personal
8    unrelated
9    info_news
Name: category, dtype: object
[1, 0, 0, 1, 7, 0, 0, 7, 8, 0]


In [31]:
X_dev=d['text']
ys_dev=d['stance']
yc_dev=d['category']

In [32]:
yc_dev=encode_category(yc_dev)

In [33]:
# Preprocess the test data
X = X.apply(preprocess.do_all)

In [34]:
# Preprocess the dev data
X_dev = X_dev.apply(preprocess.do_all)

In [35]:
r = random.randint(0, len(X))
# r = 900
print(r, '\n')
print(t.text[r], '\n')
print(X[r], '\n')

4298 

للأسف حالات #فيروس_كورونا بدأت تتصاعد في السلطنة<LF>ويجب الالتزام بالعادات الصحية حتى نتجنب أي إغلاق قريب في الأنشطة لاسمح الله<LF>#عمان_تواجه_كورونا #عُمان_نهضة_متجددة #فيروس_كورونا #لقاح_كوفيد_19 #عمان_تاريخ_وحضارة #باحثون_عن_عمل_يستغيثون271 #عاطلون_في_بلد_النفط #صباح_الخير #الحمدلله https://t.co/bSrrXe3Otl 

['أَسَف', 'حالَة', 'بَدَأ', 'تَصاعَد', 'فِي', 'سَلْطَنَة', 'وَجَب', 'ٱِلْتِزام', 'عادَة', 'صِحِّيّ', 'حَتَّى', 'تَجَنَّب', 'أَيّ', 'إِغْلاق', 'قَرِيب', 'فِي', 'نَشاط', 'سَمَح', 'اللَّه', '<LF>', '<LF>', '<NUM>', '<NUM>', '<LINK>', 'فَيْرُوس', 'فَيْرُوس', 'فَيْرُوس', 'كَوَّر', 'كَوَّر', 'كَوَّر', 'عَمّان', 'عَمّان', 'عَمّان', 'واجَه', 'واجَه', 'واجَه', 'كَوَّر', 'كَوَّر', 'كَوَّر', 'عَمّان', 'عَمّان', 'عَمّان', 'نَهْضَة', 'نَهْضَة', 'نَهْضَة', 'مُتَجَدِّد', 'مُتَجَدِّد', 'مُتَجَدِّد', 'فَيْرُوس', 'فَيْرُوس', 'فَيْرُوس', 'كَوَّر', 'كَوَّر', 'كَوَّر', 'لَقاح', 'لَقاح', 'لَقاح', 'كوفيد', 'كوفيد', 'كوفيد', 'عَمّان', 'عَمّان', 'عَمّان', 'تارِيخ', 'تارِيخ', 'تارِيخ', 'حَضارَة', 

In [36]:
r = random.randint(0, len(X_dev))
r = 150
# r = 738
print(r, '\n')
print(d.text[r], '\n')
print(X_dev[r], '\n')

150 

تساؤلات آخر ليل <LF>اذا ما كفانا لقاح الكورونا ...بزيدولو مي ؟؟ 😅<LF>#هبل 

['تَساؤُل', 'آخَر', 'لَيْل', 'إِذا', 'كَفَى', 'لَقاح', 'الكورونا', 'بزيدولو', 'مَيّ', '<', 'smiling', '_', 'face', '_', 'with', '_', 'open', '_', 'mouth', '_', '_', 'cold', '_', 'sweat', '>', '<LF>', '<LF>', 'أَهْبَل', 'أَهْبَل', 'أَهْبَل'] 



In [37]:
# Print Vocabulary size
lst = [word for x in X for word in x]
vocab = set(lst)
print(len(lst))
print(len(vocab))

209605
11771


### Getting the train , test and dev features

In [38]:
train_features_bow,test_features_bow=BOW(train_documents=X,test_documents=X_dev)
#
assert train_features_bow.shape[0] == len(X)
assert train_features_bow.shape[1] == len(vocab)

In [39]:
train_features_tfidf,test_features_tfidf=TFIDF(train_documents=X,test_documents=X_dev)
#
assert train_features_tfidf.shape[0] == len(X)
assert train_features_tfidf.shape[1] == len(vocab)

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer =TfidfVectorizer(analyzer=lambda x: x)
train_features= vectorizer.fit_transform(X)
# return train_features.toarray(), test_features.toarray()

# print 10 words with highest tfidf values for each CATEGORY
feature_names = vectorizer.get_feature_names_out()
for i, category in enumerate(categories):
    lst = [0] *len(feature_names)
    for row in train_features[np.array(yc) == i].toarray():
        lst = [x + y for x, y in zip(lst, row)]
    # top 10 words
    top20 = np.argsort(lst)[::-1][:20]
    # print names and values
    print(category)
    print([(feature_names[j], lst[j]) for j in top20])
    print()

info_news
[('كَوَّر', 261.4992797735222), ('<LF>', 256.44244823851636), ('لَقاح', 208.74808200516057), ('<NUM>', 173.80818903285686), ('<LINK>', 156.12755324107326), ('مِن', 137.00146302072264), ('فِي', 131.46230664440768), ('عاجِل', 99.37734597401408), ('عَلَى', 97.67586237415485), ('صِحَّة', 92.1865397652484), ('فايزر', 79.82700837675176), ('فَيْرُوس', 78.7697622503078), ('جُرْعَة', 75.59346302671747), ('تَطْعِيم', 71.25956737705488), ('عَرَبِيّ', 67.87505718122213), ('أَنَّ', 67.23948489544064), ('أَمْن', 65.6379417933991), ('كوفيد', 64.87517065417268), ('إِلَى', 59.38200948876755), ('أَوَّل', 56.99642908165569)]

celebrity
[('تَلَقَّى', 108.58172601829166), ('كَوَّر', 106.75747663142474), ('مَلِك', 85.0017931588784), ('خادِم', 81.40516820739016), ('حَرَم', 80.68576686550303), ('<LF>', 80.58930520830215), ('شَرِيف', 75.25967006899324), ('لَقاح', 65.7817740083004), ('<LINK>', 59.74439388913578), ('جُرْعَة', 59.601509573019676), ('أَوَّل', 56.22555863819366), ('مِن', 48.11858498092079

In [41]:
train_features_cbow,test_features_cbow=CBOW(train_documents=X,test_documents=X_dev)

In [80]:
train_features_sg,test_features_sg=SG(train_documents=X,test_documents=X_dev)

## Classical model Training 

- Classification Task Results:

|  | SVM | Random Forest | KNN |  Voting system |
| --------------- | --------------- | --------------- | --------------- | --------------- |
| TFIDF |69%  |  |  ||
| BOW | 54%  |   |   | |


- Stance Task Results:

|  | SVM | Random Forest | KNN |  Voting system |
| --------------- | --------------- | --------------- | --------------- | --------------- |
| TFIDF |81%  |  |  ||
| BOW | 80%  |   |   | |

In [58]:
# without sg
train_features = np.concatenate((train_features_bow, train_features_tfidf,train_features_cbow), axis=1)
test_features = np.concatenate((test_features_bow, test_features_tfidf,test_features_cbow), axis=1)

# with sg
train_features1 = np.concatenate((train_features_bow, train_features_tfidf,train_features_cbow,train_features_sg), axis=1)
test_features1 = np.concatenate((test_features_bow, test_features_tfidf,test_features_cbow,test_features_sg), axis=1)

# print(train_features_cbow.shape)
# print(train_features.shape)

In [21]:
# clss = [NBmodel, KNNmodelو SVMmodel]
# clss = [KNNmodel]
clss = [LRmodel]
#
for cls in clss:
    #
    N = 1000
    N = len(train_features)
    yc_pred = cls(Xtrain=train_features[:N], y_train=yc[:N], X_test=test_features)
    # ys_pred = cls(Xtrain=train_features, y_train=ys, X_test=test_features)
    #
    print('========= Category =========')
    print(metrics.classification_report(y_true=yc_dev,y_pred=yc_pred))
    print('========= Stance =========')
    # print(metrics.classification_report(y_true=ys_dev,y_pred=ys_pred))

# classify_accuracy=metrics.accuracy_score(y_true=yc_dev,y_pred=yc_pred)
# stance_accuracy=metrics.accuracy_score(y_true=ys_dev,y_pred=ys_pred)

              precision    recall  f1-score   support

           0       0.69      0.78      0.73       545
           1       0.85      0.81      0.83       145
           2       0.22      0.16      0.19        82
           3       0.33      0.10      0.15        20
           4       0.00      0.00      0.00        15
           5       0.50      0.10      0.17        10
           6       0.00      0.00      0.00         2
           7       0.47      0.51      0.49       128
           8       0.50      0.42      0.45        36
           9       0.12      0.06      0.08        17

    accuracy                           0.64      1000
   macro avg       0.37      0.29      0.31      1000
weighted avg       0.61      0.64      0.62      1000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**SVM**

In [None]:
yc_pred=SVMmodel(Xtrain=train_features_bow,y_train=yc,X_test=test_features_bow)

In [None]:
ys_pred=SVMmodel(Xtrain=train_features_tfidf,y_train=ys,X_test=test_features_tfidf)

In [63]:
yc_cbow_pred=SVMmodel(Xtrain=train_features_cbow,y_train=yc,X_test=test_features_cbow)

In [64]:
ys_cbow_pred=SVMmodel(Xtrain=train_features_cbow,y_train=ys,X_test=test_features_cbow)

In [69]:
classify_accuracy=metrics.accuracy_score(y_true=yc_dev,y_pred=yc_cbow_pred)
stance_accuracy=metrics.accuracy_score(y_true=ys_dev,y_pred=ys_cbow_pred)


print("The classification accuracy after training on svm on all features : ",classify_accuracy)
print("The stance accuracy after training on svm on all features : ",stance_accuracy)

# F1 score
print('========= Category =========')
print(metrics.classification_report(y_true=yc_dev,y_pred=yc_cbow_pred))
print('========= Stance =========')
print(metrics.classification_report(y_true=ys_dev,y_pred=ys_cbow_pred))

The classification accuracy after training on svm on all features :  0.545
The stance accuracy after training on svm on all features :  0.804
              precision    recall  f1-score   support

           0       0.55      1.00      0.71       545
           1       0.00      0.00      0.00       145
           2       0.00      0.00      0.00        82
           3       0.00      0.00      0.00        20
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00       128
           8       0.00      0.00      0.00        36
           9       0.00      0.00      0.00        17

    accuracy                           0.55      1000
   macro avg       0.05      0.10      0.07      1000
weighted avg       0.30      0.55      0.38      1000

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [81]:
yc_sg_pred=SVMmodel(Xtrain=train_features_sg,y_train=yc,X_test=test_features_sg)

In [82]:
ys_sg_pred=SVMmodel(Xtrain=train_features_sg,y_train=ys,X_test=test_features_sg)

In [83]:
classify_accuracy=metrics.accuracy_score(y_true=yc_dev,y_pred=yc_sg_pred)
stance_accuracy=metrics.accuracy_score(y_true=ys_dev,y_pred=ys_sg_pred)


print("The classification accuracy after training on svm on all features : ",classify_accuracy)
print("The stance accuracy after training on svm on all features : ",stance_accuracy)

# F1 score
print('========= Category =========')
print(metrics.classification_report(y_true=yc_dev,y_pred=yc_sg_pred))
print('========= Stance =========')
print(metrics.classification_report(y_true=ys_dev,y_pred=ys_sg_pred))

The classification accuracy after training on svm on all features :  0.545
The stance accuracy after training on svm on all features :  0.804
              precision    recall  f1-score   support

           0       0.55      1.00      0.71       545
           1       0.00      0.00      0.00       145
           2       0.00      0.00      0.00        82
           3       0.00      0.00      0.00        20
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00       128
           8       0.00      0.00      0.00        36
           9       0.00      0.00      0.00        17

    accuracy                           0.55      1000
   macro avg       0.05      0.10      0.07      1000
weighted avg       0.30      0.55      0.38      1000

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [53]:
# features: bow, tfidf, cbow
yc_all_pred=SVMmodel(Xtrain=train_features,y_train=yc,X_test=test_features)

In [55]:
# features: bow, tfidf, cbow
ys_all_pred=SVMmodel(Xtrain=train_features,y_train=ys,X_test=test_features)

In [56]:
# classify_accuracy=metrics.accuracy_score(y_true=yc_dev,y_pred=yc_pred)
# stance_accuracy=metrics.accuracy_score(y_true=ys_dev,y_pred=ys_pred)

# classify_accuracy=metrics.accuracy_score(y_true=yc_dev,y_pred=yc_cbow_pred)
# stance_accuracy=metrics.accuracy_score(y_true=ys_dev,y_pred=ys_cbow_pred)

classify_accuracy=metrics.accuracy_score(y_true=yc_dev,y_pred=yc_all_pred)
stance_accuracy=metrics.accuracy_score(y_true=ys_dev,y_pred=ys_all_pred)


print("The classification accuracy after training on svm on all features : ",classify_accuracy)
print("The stance accuracy after training on svm on all features : ",stance_accuracy)

The classification accuracy after training on svm on all features :  0.648
The stance accuracy after training on svm on all features :  0.819


In [60]:
# F1 score
print('========= Category =========')
print(metrics.classification_report(y_true=yc_dev,y_pred=yc_all_pred))
print('========= Stance =========')
print(metrics.classification_report(y_true=ys_dev,y_pred=ys_all_pred))

              precision    recall  f1-score   support

           0       0.62      0.97      0.76       545
           1       0.90      0.43      0.58       145
           2       0.00      0.00      0.00        82
           3       0.00      0.00      0.00        20
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         2
           7       0.69      0.38      0.49       128
           8       0.73      0.31      0.43        36
           9       0.00      0.00      0.00        17

    accuracy                           0.65      1000
   macro avg       0.29      0.21      0.23      1000
weighted avg       0.58      0.65      0.58      1000

              precision    recall  f1-score   support

          -1       0.67      0.09      0.15        70
           0       0.53      0.22      0.31       126
           1       0.84      0.98      0.90       804

    accuracy           

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [87]:
# features: bow, tfidf, cbow, sg
yc_all_pred1=SVMmodel(Xtrain=train_features1,y_train=yc,X_test=test_features1)

In [84]:
# features: bow, tfidf, cbow, sg
ys_all_pred1=SVMmodel(Xtrain=train_features1,y_train=ys,X_test=test_features1)

In [88]:

classify_accuracy1=metrics.accuracy_score(y_true=yc_dev,y_pred=yc_all_pred1)
stance_accuracy1=metrics.accuracy_score(y_true=ys_dev,y_pred=ys_all_pred1)


print("The classification accuracy after training on svm on all features : ",classify_accuracy1)
print("The stance accuracy after training on svm on all features : ",stance_accuracy1)

The classification accuracy after training on svm on all features :  0.63
The stance accuracy after training on svm on all features :  0.815


In [89]:
# F1 score
print('========= Category =========')
print(metrics.classification_report(y_true=yc_dev,y_pred=yc_all_pred))
print('========= Stance =========')
print(metrics.classification_report(y_true=ys_dev,y_pred=ys_all_pred))

              precision    recall  f1-score   support

           0       0.62      0.97      0.76       545
           1       0.90      0.43      0.58       145
           2       0.00      0.00      0.00        82
           3       0.00      0.00      0.00        20
           4       0.00      0.00      0.00        15
           5       0.00      0.00      0.00        10
           6       0.00      0.00      0.00         2
           7       0.69      0.38      0.49       128
           8       0.73      0.31      0.43        36
           9       0.00      0.00      0.00        17

    accuracy                           0.65      1000
   macro avg       0.29      0.21      0.23      1000
weighted avg       0.58      0.65      0.58      1000

              precision    recall  f1-score   support

          -1       0.67      0.09      0.15        70
           0       0.53      0.22      0.31       126
           1       0.84      0.98      0.90       804

    accuracy           

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [86]:
# F1 score
print('========= Category =========')
print(metrics.classification_report(y_true=yc_dev,y_pred=yc_all_pred))
print('========= Stance =========')
print(metrics.classification_report(y_true=ys_dev,y_pred=ys_all_pred))

              precision    recall  f1-score   support

          -1       0.67      0.09      0.15        70
           0       0.53      0.22      0.31       126
           1       0.84      0.98      0.90       804

    accuracy                           0.82      1000
   macro avg       0.68      0.43      0.46      1000
weighted avg       0.79      0.82      0.77      1000



In [None]:
from torch import nn
# def initLSTM():
vocab_size = len(vocab)
embedding_dim = 100 #need to be tuned
hidden_size = 50 #need to be tuned

embedding = nn.Embedding(vocab_size, embedding_dim)

lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)

n_classes_category = 9
n_classes_stance = 3

linear_category = nn.Linear(hidden_size, n_classes_category)
linear_stance = nn.Linear(hidden_size, n_classes_stance)


In [None]:
def forward(self, sentences):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """

    final_output = None
    ######################### TODO: implement the forward pass ####################################
    # (1) Pass the sentences through the embedding layer
    # (2) Pass the output of the embedding layer through the LSTM layer
    # (3) Pass the output of the LSTM layer through the linear layer
    # (4) Apply softmax to the output of the linear layer
    # (5) Return the output of the softmax layer
    sentences = embedding(sentences)
    sentences, _ = lstm(sentences)
    sentences_category = linear_category(sentences)
    sentences_stance = linear_stance(sentences)
    final_output_category = sentences_category
    final_output_stance = sentences_stance
    ###############################################################################################
    return final_output_category, final_output_stance

In [None]:
# from keras.models import Sequential
# MAX_SEQ_LEN = 1000

# def model_1():
#     model = Sequential()
#     model.add(Embedding(input_dim = (len(tokenizer.word_counts) + 1), output_dim = 128, input_length = MAX_SEQ_LEN))
#     model.add(LSTM(128))
#     model.add(Dense(64, activation='relu'))
#     model.add(Dense(3, activation='softmax'))
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#     return model

# m1 = train(model_1, 
#            train_text_vec,
#            y_train,
#            test_text_vec,
#            y_test,
#            checkpoint_path='model_1.h5',
#            class_weights=cws
#           )