In [1]:
import pandas as pd
import numpy as np

import ktrain
from ktrain import text


In [2]:
data_train = pd.read_csv('train.csv',index_col=False, encoding='iso-8859-1', 
                                warn_bad_lines=True, error_bad_lines=False)
data_test = pd.read_csv('test.csv',index_col=False, encoding='iso-8859-1', 
                                warn_bad_lines=True, error_bad_lines=False)

In [3]:
frames = [data_train, data_test]

df = pd.concat(frames)

In [4]:
df.head()

Unnamed: 0,label,content
0,0,hide new secretions from the parental units
1,0,"contains no wit , only labored gags"
2,1,that loves its characters and communicates som...
3,0,remains utterly satisfied to remain the same t...
4,0,on the worst revenge-of-the-nerds clich??s the...


In [5]:
df["label"].value_counts()

1    38922
0    31120
Name: label, dtype: int64

In [6]:
class_names = ['positive', 'negative']

In [7]:
encoding = {
    
    'positive': 1,
    'negative': 0
}

In [8]:
reviews = df['content'].values.tolist()
sentiments = df['label'].tolist()

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(reviews, sentiments, test_size=.2)

In [10]:
(x_train,  y_train), (x_test, y_test), preproc = text.texts_from_array(x_train=X_train, y_train=y_train,
                                                                       x_test=X_test, y_test=y_test,
                                                                       class_names=class_names,
                                                                       preprocess_mode='bert',
                                                                       maxlen=350, 
                                                                       max_features=35000)

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


task: text classification


In [11]:
model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)

Is Multi-Label? False
maxlen is 350
done.


In [12]:
learner = ktrain.get_learner(model, train_data=(x_train, y_train), 
                             val_data=(x_test, y_test),
                             batch_size=6)

In [13]:
learner.fit_onecycle(2e-5, 3)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1ca8003b7f0>

In [14]:
learner.validate(val_data=(x_test, y_test), class_names=class_names)

              precision    recall  f1-score   support

    positive       0.94      0.94      0.94      6258
    negative       0.95      0.95      0.95      7751

    accuracy                           0.95     14009
   macro avg       0.95      0.95      0.95     14009
weighted avg       0.95      0.95      0.95     14009



array([[5904,  354],
       [ 362, 7389]], dtype=int64)