In [39]:
import pandas as pd
import numpy as np

import ktrain
from ktrain import text


In [40]:
data_train = pd.read_csv('train.csv',index_col=False, encoding='iso-8859-1', 
                                warn_bad_lines=True, error_bad_lines=False)
data_test = pd.read_csv('test.csv',index_col=False, encoding='iso-8859-1', 
                                warn_bad_lines=True, error_bad_lines=False)

In [41]:
frames = [data_train, data_test]

df1 = pd.concat(frames)
df2 = pd.concat(frames)

In [42]:
df1.head()

Unnamed: 0,label,content
0,0,hide new secretions from the parental units
1,0,"contains no wit , only labored gags"
2,1,that loves its characters and communicates som...
3,0,remains utterly satisfied to remain the same t...
4,0,on the worst revenge-of-the-nerds clich??s the...


In [43]:
df1.label[df1.label == 1]="positive"
df1.label[df1.label == 0]="negative"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.label[df1.label == 1]="positive"


In [44]:
df2.label[df2.label == 1]="positive"
df2.label[df2.label == 0]="negative"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.label[df2.label == 1]="positive"


In [45]:
df1.head()

Unnamed: 0,label,content
0,negative,hide new secretions from the parental units
1,negative,"contains no wit , only labored gags"
2,positive,that loves its characters and communicates som...
3,negative,remains utterly satisfied to remain the same t...
4,negative,on the worst revenge-of-the-nerds clich??s the...


In [46]:
# taking negative sentiment and deleting positive
df1 = df1[~df1.label.str.contains("positive")]

In [47]:
df1.head()

Unnamed: 0,label,content
0,negative,hide new secretions from the parental units
1,negative,"contains no wit , only labored gags"
3,negative,remains utterly satisfied to remain the same t...
4,negative,on the worst revenge-of-the-nerds clich??s the...
5,negative,that 's far too tragic to merit such superfici...


In [48]:
# randomly picking 1250 rows
df1 = df1.sample(n=1250, replace=True)

In [49]:
df1["label"].value_counts()

negative    1250
Name: label, dtype: int64

In [50]:
#taking positive senitment and deleting negative
df2 = df2[~df2.label.str.contains("negative")]

In [51]:
df2.head()

Unnamed: 0,label,content
2,positive,that loves its characters and communicates som...
6,positive,demonstrates that the director of such hollywo...
7,positive,of saucy
9,positive,are more deeply thought through than in most `...
15,positive,the greatest musicians


In [52]:
# randomly picking 1250 rows
df2 = df2.sample(n=1250, replace=True)

In [53]:
df2["label"].value_counts()

positive    1250
Name: label, dtype: int64

In [54]:
#concating positive and negative sentiment
frames = [df1, df2]

df = pd.concat(frames)

In [55]:
df.head()

Unnamed: 0,label,content
1815,negative,disappointing in comparison to other recent wa...
42907,negative,is the recording industry in the current clima...
61077,negative,ms. shreve 's novel proved too difficult a tex...
43881,negative,'s nothing remotely topical or sexy here .
9447,negative,veers like a drunken driver


In [56]:
df["label"].value_counts()

negative    1250
positive    1250
Name: label, dtype: int64

In [57]:
class_names = ['positive', 'negative']

In [58]:
# replacing positive sentiment with 1 and negative with 0
encoding = {
    
    'positive': 1,
    'negative': 0
}

In [59]:
content = df['content'].values.tolist()
label = df['label'].tolist()

In [60]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(content, label, test_size=.2)

In [61]:
y_train = [encoding[x] for x in y_train]
y_test = [encoding[x] for x in y_test]

In [62]:
(x_train,  y_train), (x_test, y_test), preproc = text.texts_from_array(x_train=X_train, y_train=y_train,
                                                                       x_test=X_test, y_test=y_test,
                                                                       class_names=class_names,
                                                                       preprocess_mode='bert',
                                                                       maxlen=350, 
                                                                       max_features=35000)

downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


task: text classification


In [63]:
model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)

Is Multi-Label? False
maxlen is 350
done.


In [64]:
learner3 = ktrain.get_learner(model, train_data=(x_train, y_train), 
                             val_data=(x_test, y_test),
                             batch_size=6)

In [65]:
learner3.fit(2e-5, 4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x2618b7c73d0>

In [66]:
learner3.validate(val_data=(x_test, y_test), class_names=class_names)

              precision    recall  f1-score   support

    positive       0.92      0.86      0.89       253
    negative       0.86      0.93      0.89       247

    accuracy                           0.89       500
   macro avg       0.89      0.89      0.89       500
weighted avg       0.89      0.89      0.89       500



array([[217,  36],
       [ 18, 229]], dtype=int64)