In [1]:
import pandas as pd
import numpy as np

import ktrain
from ktrain import text


In [2]:
data_train = pd.read_csv('train.csv',index_col=False, encoding='iso-8859-1', 
                                warn_bad_lines=True, error_bad_lines=False)
data_test = pd.read_csv('test.csv',index_col=False, encoding='iso-8859-1', 
                                warn_bad_lines=True, error_bad_lines=False)

In [3]:
frames = [data_train, data_test]

df1 = pd.concat(frames)
df2 = pd.concat(frames)

In [4]:
df1.head()

Unnamed: 0,label,content
0,0,hide new secretions from the parental units
1,0,"contains no wit , only labored gags"
2,1,that loves its characters and communicates som...
3,0,remains utterly satisfied to remain the same t...
4,0,on the worst revenge-of-the-nerds clich??s the...


In [5]:
df1.label[df1.label == 1]="positive"
df1.label[df1.label == 0]="negative"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.label[df1.label == 1]="positive"


In [6]:
df2.label[df2.label == 1]="positive"
df2.label[df2.label == 0]="negative"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.label[df2.label == 1]="positive"


In [7]:
df1.head()

Unnamed: 0,label,content
0,negative,hide new secretions from the parental units
1,negative,"contains no wit , only labored gags"
2,positive,that loves its characters and communicates som...
3,negative,remains utterly satisfied to remain the same t...
4,negative,on the worst revenge-of-the-nerds clich??s the...


In [8]:
# taking negative sentiment and deleting positive
df1 = df1[~df1.label.str.contains("positive")]

In [9]:
df1.head()

Unnamed: 0,label,content
0,negative,hide new secretions from the parental units
1,negative,"contains no wit , only labored gags"
3,negative,remains utterly satisfied to remain the same t...
4,negative,on the worst revenge-of-the-nerds clich??s the...
5,negative,that 's far too tragic to merit such superfici...


In [10]:
# randomly picking 5000 rows
df1 = df1.sample(n=5000, replace=True)

In [11]:
df1["label"].value_counts()

negative    5000
Name: label, dtype: int64

In [12]:
#taking positive senitment and deleting negative
df2 = df2[~df2.label.str.contains("negative")]

In [13]:
df2.head()

Unnamed: 0,label,content
2,positive,that loves its characters and communicates som...
6,positive,demonstrates that the director of such hollywo...
7,positive,of saucy
9,positive,are more deeply thought through than in most `...
15,positive,the greatest musicians


In [14]:
# randomly picking 5000 rows
df2 = df2.sample(n=5000, replace=True)

In [15]:
df2["label"].value_counts()

positive    5000
Name: label, dtype: int64

In [16]:
#concating positive and negative sentiment
frames = [df1, df2]

df = pd.concat(frames)

In [17]:
df.head()

Unnamed: 0,label,content
65307,negative,", it 's contrived and predictable"
29480,negative,sure to ultimately disappoint the action fans ...
39814,negative,a hole in your head
20313,negative,failings
18187,negative,such a stultifying


In [18]:
df["label"].value_counts()

negative    5000
positive    5000
Name: label, dtype: int64

In [19]:
class_names = ['positive', 'negative']

In [20]:
# replacing positive sentiment with 1 and negative with 0
encoding = {
    
    'positive': 1,
    'negative': 0
}

In [21]:
content = df['content'].values.tolist()
label = df['label'].tolist()

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(content, label, test_size=.2)

In [23]:
y_train = [encoding[x] for x in y_train]
y_test = [encoding[x] for x in y_test]

In [24]:
(x_train,  y_train), (x_test, y_test), preproc = text.texts_from_array(x_train=X_train, y_train=y_train,
                                                                       x_test=X_test, y_test=y_test,
                                                                       class_names=class_names,
                                                                       preprocess_mode='bert',
                                                                       maxlen=350, 
                                                                       max_features=35000)

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


task: text classification


In [25]:
model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)

Is Multi-Label? False
maxlen is 350
done.


In [26]:
learner1 = ktrain.get_learner(model, train_data=(x_train, y_train), 
                             val_data=(x_test, y_test),
                             batch_size=6)

In [27]:
learner1.fit_onecycle(2e-5, 3)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x24b08cd6520>

In [None]:
learner1.validate(val_data=(x_test, y_test), class_names=class_names)