# BERT

In [7]:
# Importing the necessary modules
import csv
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the SMS Spam Collection as "<label>\t<message>" records; the file has no
# header row, so the column names are supplied here.
# quoting=csv.QUOTE_NONE makes the parser treat double quotes as ordinary characters.
# The file is plain tab-separated text rather than quoted CSV, so with the default
# quote handling a message that begins with an unmatched '"' would absorb the
# following lines into a single field and silently drop rows.
df_bert = pd.read_csv(
    '../data/datasets/SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)
df_bert.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Turn the string class labels into integer codes, overwriting the 'label' column.
# LabelEncoder numbers the classes in sorted order, so 'ham' becomes 0 and 'spam' becomes 1.
labelencoder = LabelEncoder()
labelencoder.fit(df_bert['label'])
df_bert['label'] = labelencoder.transform(df_bert['label'])
df_bert.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# Creating the four-column dataframe used for the BERT .tsv files:
#   id    - running row number
#   label - integer class label from the 'label' column
#   alpha - constant filler column ('a' on every row)
#   text  - the message with embedded tabs and line breaks replaced by spaces
# Tabs and carriage returns are blanked out as well as newlines: the frame is
# written out as tab-separated text further down, and any of these characters
# inside a message would break the one-record-per-line, four-field layout for
# readers that do not honor quoting.
n_rows = len(df_bert)
df_bert = pd.DataFrame({
    'id': range(n_rows),
    'label': df_bert['label'],
    'alpha': ['a'] * n_rows,
    'text': df_bert['message'].replace(r'[\t\r\n]', ' ', regex=True),
})
df_bert.head()

Unnamed: 0,id,label,alpha,text
0,0,0,a,"Go until jurong point, crazy.. Available only ..."
1,1,0,a,Ok lar... Joking wif u oni...
2,2,1,a,Free entry in 2 a wkly comp to win FA Cup fina...
3,3,0,a,U dun say so early hor... U c already then say...
4,4,0,a,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
# Splitting data into train, test and dev sets.
# A fixed seed makes the split reproducible across kernel restarts, and stratifying
# on 'label' keeps the class ratio the same in every split (this matters most for
# the very small dev set).
SPLIT_SEED = 42
df_bert_train, df_bert_test = train_test_split(
    df_bert,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=df_bert['label'],
)
# Carve 1% of the remaining training rows off as the dev set.
df_bert_train, df_bert_dev = train_test_split(
    df_bert_train,
    test_size=0.01,
    random_state=SPLIT_SEED,
    stratify=df_bert_train['label'],
)
# The three splits must partition the full frame.
assert len(df_bert_train) + len(df_bert_dev) + len(df_bert_test) == len(df_bert)
df_bert_train.head(), df_bert_test.head(), df_bert_dev.head()

(        id  label alpha                                               text
 2000  2000      0     a      LMAO where's your fish memory when I need it?
 2543  2543      0     a                        Aiyo please ü got time meh.
 3639  3639      1     a  Customer service announcement. We recently tri...
 743    743      0     a  Just got up. have to be out of the room very s...
 4694  4694      0     a  Tessy..pls do me a favor. Pls convey my birthd...,
         id  label alpha                                               text
 2263  2263      0     a               It should take about  &lt;#&gt;  min
 165    165      1     a  BangBabes Ur order is on the way. U SHOULD rec...
 3294  3294      0     a  A little. Meds say take once every 8 hours. It...
 3656  3656      0     a                    Senthil group company Apnt 5pm.
 1036  1036      0     a  Hello baby, did you get back to your mom's ? A...,
         id  label alpha                                               text
 5455  545

In [None]:
# Saving dataframes to .tsv format as required by BERT: tab-separated, with no
# header row and no index column.
# NOTE(review): test.tsv is written in the same layout as train/dev (labels included,
# no header) - confirm the downstream BERT data processor expects that for the test file.
output_dir = Path('data')
# to_csv does not create missing directories, so make sure the target exists first.
output_dir.mkdir(parents=True, exist_ok=True)
for split_name, split_df in (
    ('train', df_bert_train),
    ('dev', df_bert_dev),
    ('test', df_bert_test),
):
    split_df.to_csv(output_dir / f'{split_name}.tsv', sep='\t', index=False, header=False)