# Import Libraries

In [1]:
import numpy as np
import os
import pandas as pd
import re
import unidecode
from sklearn.model_selection import train_test_split
import ktrain
from ktrain import text

# Load Training and Testing Dataset

In [2]:
# Read the labelled training tweets and the unlabelled test tweets from the
# current working directory.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [3]:
# Preview the training frame as loaded, before any cleaning.
train_data

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...
...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile...
7916,7917,0,We would like to wish you an amazing day! Make...
7917,7918,0,Helping my lovely 90 year old neighbor with he...
7918,7919,0,Finally got my #smart #pocket #wifi stay conne...


In [4]:
# Preview the test frame as loaded, before any cleaning.
test_data

Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me d...
1,7922,currently shitting my fucking pants. #apple #i...
2,7923,"I'd like to puts some CD-ROMS on my iPad, is t..."
3,7924,My ipod is officially dead. I lost all my pict...
4,7925,Been fighting iTunes all night! I only want th...
...,...,...
1948,9869,"#SamsungGalaxyNote7 Explodes, Burns 6-Year-Old..."
1949,9870,Now Available - Hoodie. Check it out here - ht...
1950,9871,There goes a crack right across the screen. If...
1951,9872,@codeofinterest as i said #Adobe big time we m...


# Applying Preprocessing on The Dataset

In [5]:
# Regular expression patterns for URLs, punctuation and digits.
url_pattern = re.compile(r'https?://\S+')
punctuation_pattern = re.compile(r'[^\w\s]')
digit_pattern = re.compile(r'\d')


def clean_tweets(tweets):
    """Return a cleaned copy of a Series of tweet strings.

    Steps, in order:
      1. remove URLs (``http://`` / ``https://`` up to the next whitespace);
      2. remove every character that is neither a word character nor
         whitespace (punctuation, ``#``, ``@`` ...);
      3. replace each digit with a single space;
      4. collapse runs of whitespace into one space;
      5. strip leading/trailing whitespace;
      6. lowercase.

    Parameters
    ----------
    tweets : pandas.Series
        Series of raw tweet texts.

    Returns
    -------
    pandas.Series
        New Series with the same index; the input is not modified.
    """
    cleaned = tweets.replace(
        to_replace=[url_pattern, punctuation_pattern, digit_pattern],
        value=['', '', ' '],
        regex=True,
    )
    # Raw string: '\s' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    cleaned = cleaned.str.replace(r'\s+', ' ', regex=True)
    return cleaned.str.strip().str.lower()


# Apply the identical cleaning pipeline to both datasets.
train_data['tweet'] = clean_tweets(train_data['tweet'])
test_data['tweet'] = clean_tweets(test_data['tweet'])

In [6]:
# Inspect the training tweets after cleaning.
train_data

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test android apps beauti...
1,2,0,finally a transparant silicon case thanks to m...
2,3,0,we love this would you go talk makememories un...
3,4,0,im wired i know im george i was made that way ...
4,5,1,what amazing service apple wont even talk to m...
...,...,...,...
7915,7916,0,live out loud lol liveoutloud selfie smile son...
7916,7917,0,we would like to wish you an amazing day make ...
7917,7918,0,helping my lovely year old neighbor with her i...
7918,7919,0,finally got my smart pocket wifi stay connecte...


In [7]:
# Inspect the test tweets after cleaning.
test_data

Unnamed: 0,id,tweet
0,7921,i hate the new iphone upgrade wont let me down...
1,7922,currently shitting my fucking pants apple imac...
2,7923,id like to puts some cdroms on my ipad is that...
3,7924,my ipod is officially dead i lost all my pictu...
4,7925,been fighting itunes all night i only want the...
...,...,...
1948,9869,samsunggalaxynote explodes burns yearold thank...
1949,9870,now available hoodie check it out here iphone ...
1950,9871,there goes a crack right across the screen if ...
1951,9872,codeofinterest as i said adobe big time we may...


# Getting Positive Words

In [8]:
# Rows labelled 0 make up the "positive" subset; their tweets are concatenated
# into one space-separated string.
positive = train_data.loc[train_data['label'].eq(0)]
all_positive_words = ' '.join(positive['tweet'])

# Getting Negative Words

In [9]:
# Rows labelled 1 make up the "negative" subset; their tweets are concatenated
# into one space-separated string.
negative = train_data.loc[train_data['label'].eq(1)]
all_negative_words = ' '.join(negative['tweet'])

# Split training data into train and validation set

In [10]:
# Hold out 10% of the labelled tweets for validation. The rows are shuffled
# with a fixed seed so the split is repeatable.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_data['tweet'],
    train_data['label'],
    test_size=0.1,
    shuffle=True,
    random_state=140,
)

In [11]:
# Number of tweets in the training split.
x_train.shape

(7128,)

In [12]:
# Number of tweets in the validation split.
x_valid.shape

(792,)

# Load the model 'distilbert-base-uncased'

In [13]:
# ktrain preprocessing wrapper for the pretrained 'distilbert-base-uncased'
# checkpoint. maxlen=40 is the maximum sequence length handed to ktrain, and
# class_names lists the two label values.
trans = text.Transformer('distilbert-base-uncased', maxlen=40, class_names=[0, 1])

# Convert the splits to NumPy arrays for ktrain. np.asarray is used instead of
# Series.to_numpy() so this cell stays re-runnable: the names are rebound to
# ndarrays here, and an ndarray has no .to_numpy() method, so a second run of
# the original cell raised AttributeError.
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
x_valid = np.asarray(x_valid)
y_valid = np.asarray(y_valid)

# Preprocess the training and validation splits with the transformer wrapper.
train = trans.preprocess_train(x_train, y_train)
val = trans.preprocess_test(x_valid, y_valid)

# Build the classifier and wrap it in a learner that trains in batches of 8.
model = trans.get_classifier()
learner = ktrain.get_learner(model, train_data=train, val_data=val, batch_size=8)

preprocessing train...
language: en
train sequence lengths:
	mean : 16
	95percentile : 27
	99percentile : 35


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 15
	95percentile : 25
	99percentile : 34


# Training Model

In [14]:
# Fine-tune with the one-cycle policy: maximum learning rate 1e-5, 3 epochs.
learner.fit_onecycle(1e-5, 3)



begin training using onecycle policy with max lr of 1e-05...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f1699719060>

# Testing The Model

In [15]:
# Bundle the fine-tuned model with its preprocessor so raw text can be scored.
predictor = ktrain.get_predictor(learner.model, preproc=trans)

# Score every test tweet in a single batched call. Calling predict() once per
# tweet runs a separate model invocation for each row, which is needlessly slow.
# The predictions come back in the same order as the input list, so the
# resulting Series lines up positionally with test_data.
test_result = pd.Series(predictor.predict(test_data['tweet'].tolist()))

# Store The Test Results

In [16]:
# Pair each test tweet id with its predicted label, then show the table.
test_result_csv = pd.DataFrame(
    {
        'id': test_data['id'],
        'label': test_result,
    }
)
test_result_csv

Unnamed: 0,id,label
0,7921,1
1,7922,1
2,7923,1
3,7924,1
4,7925,1
...,...,...
1948,9869,1
1949,9870,0
1950,9871,1
1951,9872,0


In [17]:
# Count how many test tweets were assigned to each predicted label.
test_result_csv['label'].value_counts()

0    1377
1     576
Name: label, dtype: int64

# Save The Results in CSV File

In [18]:
# Write the id/label predictions to disk without the DataFrame index column.
test_result_csv.to_csv("Test_Result.csv", index=False)

# Display Metrics

In [19]:
# Evaluate on the validation data attached to the learner. This reports
# per-class precision/recall/F1 and yields the confusion matrix.
learner.validate(class_names=trans.get_classes())

              precision    recall  f1-score   support

           0       0.97      0.93      0.95       607
           1       0.80      0.91      0.85       185

    accuracy                           0.93       792
   macro avg       0.89      0.92      0.90       792
weighted avg       0.93      0.93      0.93       792



array([[565,  42],
       [ 17, 168]])