In [1]:
import pandas as pd
import numpy as np

import ktrain
from ktrain import text

In [2]:
df1=pd.read_csv("IMDB dataset.csv")
df2=pd.read_csv("IMDB dataset.csv")

In [3]:
df1.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df1.groupby("sentiment").describe()

Unnamed: 0_level_0,review,review,review,review
Unnamed: 0_level_1,count,unique,top,freq
sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
negative,25000,24698,This show comes up with interesting locations ...,3
positive,25000,24884,Loved today's show!!! It was a variety and not...,5


In [5]:
# taking negative sentiment and deleting positive
df1 = df1[~df1.sentiment.str.contains("positive")]

In [6]:
df1.head()

Unnamed: 0,review,sentiment
3,Basically there's a family where a little boy ...,negative
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
10,Phil the Alien is one of those quirky films wh...,negative
11,I saw this movie when I was about 12 when it c...,negative


In [7]:
# randomly picking 2500 rows
df1 = df1.sample(n=2500, replace=True)

In [8]:
df1["sentiment"].value_counts()

negative    2500
Name: sentiment, dtype: int64

In [9]:
#taking positive senitment and deleting negative
df2 = df2[~df2.sentiment.str.contains("negative")]

In [10]:
df2.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive


In [11]:
# randomly picking 2500 rows
df2 = df2.sample(n=2500, replace=True)

In [12]:
df2["sentiment"].value_counts()

positive    2500
Name: sentiment, dtype: int64

In [13]:
#concating positive and negative sentiment
frames = [df1, df2]

df = pd.concat(frames)


In [14]:
df.head()

Unnamed: 0,review,sentiment
45105,"What a mess--and I'm not referring to the ""des...",negative
27364,"A Brazilian cable television is presenting ""Ya...",negative
177,"Though I'd heard that ""Cama de Gato"" was the w...",negative
9043,Firstly let me get this of my chest I hate Oct...,negative
2217,I saw this film at SXSW with the director in a...,negative


In [15]:
df["sentiment"].value_counts()

negative    2500
positive    2500
Name: sentiment, dtype: int64

In [16]:
class_names = ['positive', 'negative']

In [17]:
# replacing positive sentiment with 1 and negative with 0
encoding = {
    
    'positive': 1,
    'negative': 0
}

In [18]:
reviews = df['review'].values.tolist()
sentiments = df['sentiment'].tolist()

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(reviews, sentiments, test_size=.2)

In [20]:
y_train = [encoding[x] for x in y_train]
y_test = [encoding[x] for x in y_test]

In [21]:
(x_train,  y_train), (x_test, y_test), preproc = text.texts_from_array(x_train=X_train, y_train=y_train,
                                                                       x_test=X_test, y_test=y_test,
                                                                       class_names=class_names,
                                                                       preprocess_mode='bert',
                                                                       maxlen=350, 
                                                                       max_features=35000)

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


task: text classification


In [22]:
model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)

Is Multi-Label? False
maxlen is 350
done.


In [23]:
learner = ktrain.get_learner(model, train_data=(x_train, y_train), 
                             val_data=(x_test, y_test),
                             batch_size=6)

In [24]:
learner.fit_onecycle(2e-5, 3)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x2ca00044280>

In [25]:
learner.validate(val_data=(x_test, y_test), class_names=class_names)

              precision    recall  f1-score   support

    positive       0.92      0.90      0.91       516
    negative       0.89      0.92      0.91       484

    accuracy                           0.91      1000
   macro avg       0.91      0.91      0.91      1000
weighted avg       0.91      0.91      0.91      1000



array([[463,  53],
       [ 39, 445]], dtype=int64)

In [26]:
predictor = ktrain.get_predictor(learner.model, preproc)
predictor.get_classes()

['positive', 'negative']

In [33]:
learner.validate(val_data=(x_test, y_test), class_names=class_names)

              precision    recall  f1-score   support

    positive       0.98      0.98      0.98       251
    negative       0.98      0.98      0.98       249

    accuracy                           0.98       500
   macro avg       0.98      0.98      0.98       500
weighted avg       0.98      0.98      0.98       500



array([[246,   5],
       [  5, 244]], dtype=int64)

In [34]:
learner.fit_onecycle(2e-6, 3)



begin training using onecycle policy with max lr of 2e-06...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1f080bd69d0>

In [35]:
learner.validate(val_data=(x_test, y_test), class_names=class_names)

              precision    recall  f1-score   support

    positive       0.97      0.98      0.98       251
    negative       0.98      0.97      0.98       249

    accuracy                           0.98       500
   macro avg       0.98      0.98      0.98       500
weighted avg       0.98      0.98      0.98       500



array([[246,   5],
       [  7, 242]], dtype=int64)

In [36]:
predictor = ktrain.get_predictor(learner.model, preproc)
predictor.get_classes()

['positive', 'negative']

In [39]:
import time 

message = 'I am depressed'

start_time = time.time() 
prediction = predictor.predict(message)

print('predicted: {} ({:.2f})'.format(prediction, (time.time() - start_time)))

predicted: negative (0.34)


In [40]:
predictor.save("models/bert_model")