# Text classification using hugging face

In [1]:
!pip install --upgrade pip -q
!pip install -q ktrain

[K     |████████████████████████████████| 2.1 MB 23.8 MB/s 
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.3/25.3 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.3/22.3 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m64.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m263.7/263.7 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m92.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings, gc
warnings.filterwarnings("ignore")

# Tensorflow
import tensorflow as tf

# ktrain
import ktrain
from ktrain import text

# sklearn
from sklearn.model_selection import train_test_split

In [10]:
%tensorflow_version 2.x
import timeit

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise SystemError('GPU device not found')

def cpu():
  with tf.device('/cpu:0'):
    random_image_cpu = tf.random.normal((100, 100, 100, 3))
    net_cpu = tf.keras.layers.Conv2D(32, 7)(random_image_cpu)
    return tf.math.reduce_sum(net_cpu)

def gpu():
  with tf.device('/device:GPU:0'):
    random_image_gpu = tf.random.normal((100, 100, 100, 3))
    net_gpu = tf.keras.layers.Conv2D(32, 7)(random_image_gpu)
    return tf.math.reduce_sum(net_gpu)
  
# We run each op once to warm up; see: https://stackoverflow.com/a/45067900
cpu()
gpu()

# Run the op several times.
print('Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images '
      '(batch x height x width x channel). Sum of ten runs.')
print('CPU (s):')
cpu_time = timeit.timeit('cpu()', number=10, setup="from __main__ import cpu")
print(cpu_time)
print('GPU (s):')
gpu_time = timeit.timeit('gpu()', number=10, setup="from __main__ import gpu")
print(gpu_time)
print('GPU speedup over CPU: {}x'.format(int(cpu_time/gpu_time)))

Time (s) to convolve 32x7x7x3 filter over random 100x100x100x3 images (batch x height x width x channel). Sum of ten runs.
CPU (s):
2.9256800500000963
GPU (s):
0.041626244999974915
GPU speedup over CPU: 70x


In [3]:
df = pd.read_csv('finaldata_party.csv')
df.head()


Unnamed: 0.1,Unnamed: 0,speaker,text,type,election_year,date,candidate,result,party
0,0,John Kennedy,I uh – said that Ive served this country for f...,Pres,1960.0,1960-10-21,1.0,1,D
1,1,John Kennedy,"Mr. Howe, Mr. Vice President. First uh – let m...",Pres,1960.0,1960-10-21,1.0,1,D
2,2,Richard Nixon,"Mr. Howe, Senator Kennedy, my fellow Americans...",Pres,1960.0,1960-10-21,1.0,0,R
3,3,John Kennedy,"Good evening, Mr. Howe.",Pres,1960.0,1960-10-21,1.0,1,D
4,4,Richard Nixon,"Good evening, Mr. Howe.",Pres,1960.0,1960-10-21,1.0,0,R


In [5]:
df.loc[df['party']=='D', 'party']=1
df.loc[df['party']=='R', 'party']=0
df = df[(df['party']==1)|(df['party']==0)]
df_tr = df.loc[df['election_year']>2007]
df_tr.head()
#Democrats =1, Reps = 0

Unnamed: 0.1,Unnamed: 0,speaker,text,type,election_year,date,candidate,result,party
2123,2123,Barack Obama,"Now, if we get our tax policies right so that ...",Pres,2008.0,2008-10-07,1.0,1,1
2124,2124,John McCain,"Well, let me just follow up, my friends. If we...",Pres,2008.0,2008-10-07,1.0,0,0
2125,2125,Barack Obama,And so I do believe that we have to consider i...,Pres,2008.0,2008-10-07,1.0,1,1
2126,2126,Barack Obama,"Well, we may not always have national security...",Pres,2008.0,2008-10-07,1.0,1,1
2127,2127,Barack Obama,"Well, you know, Senator McCain, in the last de...",Pres,2008.0,2008-10-07,1.0,1,1


In [6]:
target = ['party']
data = ['text']

X = df_tr[data]
y = df_tr[target]

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, random_state=42)

In [15]:
# Common Parameters
max_len = 500
batch_size = 10
learning_rate = 5e-3
epochs = 1


# With Roberta-base

In [16]:
# Transformer Model
model_ = 'roberta-base'
t_mod = text.Transformer(model_, maxlen=max_len, classes = [0,1])


'''Converting split data to list [so it can processed]'''
#train
X_tr = X_train['text'].tolist()
y_tr = y_train['party'].tolist()

#test
X_ts = X_test['text'].tolist()
y_ts = y_test['party'].tolist()


# Pre-processing training & test data
train = t_mod.preprocess_train(X_tr,y_tr)
test = t_mod.preprocess_train(X_ts,y_ts)

# Model Classifier
model = t_mod.get_classifier()

learner = ktrain.get_learner(model, train_data=train, val_data=test, batch_size=batch_size)

preprocessing train...
language: en
train sequence lengths:
	mean : 73
	95percentile : 258
	99percentile : 384


Is Multi-Label? False
preprocessing train...
language: en
train sequence lengths:
	mean : 75
	95percentile : 262
	99percentile : 386


Is Multi-Label? False


In [17]:
learner.fit(learning_rate, epochs)



<keras.callbacks.History at 0x7f369b34c890>

In [18]:
x = learner.validate(class_names=t_mod.get_classes())

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       628
           1       0.65      1.00      0.78      1145

    accuracy                           0.65      1773
   macro avg       0.32      0.50      0.39      1773
weighted avg       0.42      0.65      0.51      1773



# Bert base-uncased

In [None]:
model_b = 'bert-base-uncased'
t_modb = text.Transformer(model_b, maxlen=500, classes = [0,1])


'''Converting split data to list [so it can processed]'''
#train
X_tr = X_train['text'].tolist()
y_tr = y_train['party'].tolist()

#test
X_ts = X_test['text'].tolist()
y_ts = y_test['party'].tolist()


# Pre-processing training & test data
trainb = t_modb.preprocess_train(X_tr,y_tr)
testb = t_modb.preprocess_train(X_ts,y_ts)

# Model Classifier
modelb = t_modb.get_classifier()

learnerb = ktrain.get_learner(modelb, train_data=trainb, val_data=testb, batch_size=6)

learnerb.fit(learning_rate, epochs)

x = learnerb.validate(class_names=t_modb.get_classes())


NameError: ignored

# Model = Distilbert

In [None]:
model_d = 'distilbert-base-uncased'
t_modd = text.Transformer(model_d, maxlen=500, classes = [0,1])


'''Converting split data to list [so it can processed]'''
#train
X_tr = X_train['text'].tolist()
y_tr = y_train['party'].tolist()

#test
X_ts = X_test['text'].tolist()
y_ts = y_test['party'].tolist()


# Pre-processing training & test data
traind = t_modd.preprocess_train(X_tr,y_tr)
testd = t_modd.preprocess_train(X_ts,y_ts)

# Model Classifier
modeld = t_modd.get_classifier()

learnerd = ktrain.get_learner(modeld, train_data=traind, val_data=testd, batch_size=6)

learnerd.fit(learning_rate, epochs)

x = learnerd.validate(class_names=t_modd.get_classes())

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,speaker,text,type,election_year,date,candidate,result,party
0,0,John Kennedy,I uh – said that Ive served this country for f...,Pres,1960.0,1960-10-21,1.0,1,1
1,1,John Kennedy,"Mr. Howe, Mr. Vice President. First uh – let m...",Pres,1960.0,1960-10-21,1.0,1,1
2,2,Richard Nixon,"Mr. Howe, Senator Kennedy, my fellow Americans...",Pres,1960.0,1960-10-21,1.0,0,0
3,3,John Kennedy,"Good evening, Mr. Howe.",Pres,1960.0,1960-10-21,1.0,1,1
4,4,Richard Nixon,"Good evening, Mr. Howe.",Pres,1960.0,1960-10-21,1.0,0,0
