# Load Libraries

In [1]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [3]:
import spacy

In [34]:
train = pd.read_excel('Data_Train.xlsx')
test = pd.read_excel("Data_Test.xlsx")

# Split into train and val set

In [39]:
from sklearn.model_selection import train_test_split

train1,test1 = train_test_split(train,test_size=0.2)

# Build model

In [40]:
#initialize blank nlp
nlp = spacy.blank('en')

In [41]:
# Create the TextCategorizer with exclusive classes and "bow" architecture
textcat = nlp.create_pipe(
              "textcat",
              config={
                "exclusive_classes": True,
                "architecture": "bow"})

# Add the TextCategorizer to the empty model
nlp.add_pipe(textcat)

In [42]:
# Add labels to text classifier
textcat.add_label("0")
textcat.add_label("1")
textcat.add_label("2")
textcat.add_label("3")

1

In [43]:
train_texts = train1['STORY'].values
train_labels = [{'cats': {'0' : label == 0,
                          '1': label == 1,
                          '2': label == 2,
                         '3': label == 3,}} 
                for label in train1['SECTION']]


In [44]:
train_data = list(zip(train_texts, train_labels))
train_data[:3]

[('Another medical equipment company, Omron, has developed a blood pressure tracker that can be worn on the wrist. According to a Research and Markets report, the wearable medical devices market is expected to reach $14.41 billion by 2022 from $6.22 billion in 2017.\n\n\nGaming smartphones: The gaming industry generated revenue of $15.5 billion in 2018 and smartphone gaming accounted for 30% of this, claims market research firm Statista. To tap into this growing market, phone companies are developing smartphones meant specifically for gaming. For instance, the Razer Phone 2 has a 120Hz display, which means the frames will update faster compared to regular smartphone’s display, resulting in a more fluid gaming experience.\n\n\nGaming phones also come with accessories meant to enhance gaming, but what’s missing so far are games that are meant for these. In 2019, we will hopefully start seeing developers take interest and make games that can take advantage of the features gaming phones br

In [45]:
#train for one epoch
from spacy.util import minibatch

spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

# Create the batch generator with batch size = 8
batches = minibatch(train_data, size=8)
# Iterate through minibatches
for batch in batches:
    # Each batch is a list of (text, label) but we need to
    # send separate lists for texts and labels to update().
    # This is a quick way to split a list of tuples into lists
    texts, labels = zip(*batch)
    nlp.update(texts, labels, sgd=optimizer)

In [108]:
#train for more epochs
import random

random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        # Each batch is a list of (text, label) but we need to
        # send separate lists for texts and labels to update().
        # This is a quick way to split a list of tuples into lists
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses)

{'textcat': 0.0727414868509353}
{'textcat': 0.1082060051094964}
{'textcat': 0.1424010076583161}
{'textcat': 0.17483801104697957}
{'textcat': 0.2088904692358099}
{'textcat': 0.24111218872345974}
{'textcat': 0.2742818808966178}
{'textcat': 0.30777039759574626}
{'textcat': 0.3402137664455353}
{'textcat': 0.37480958039822937}
{'textcat': 0.4070982793819986}
{'textcat': 0.44029202071194407}


# Make prediction for validation

In [109]:
test2 = list(test1.STORY.values)

In [110]:
docs = [nlp.tokenizer(text) for text in test2]

In [111]:
# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)

print(scores)

[[1.39297059e-04 9.99787033e-01 1.40561305e-08 7.36587317e-05]
 [1.24183467e-17 1.53164447e-15 1.00000000e+00 4.16316093e-20]
 [5.69504984e-02 9.11339641e-01 9.35494882e-07 3.17089558e-02]
 ...
 [9.41736857e-07 9.99993205e-01 3.15500948e-15 5.82535586e-06]
 [1.33231097e-07 1.20212080e-08 9.99999881e-01 2.14986851e-09]
 [8.93609298e-10 1.00000000e+00 1.06844633e-12 6.00573688e-11]]


In [88]:
import numpy as np

In [112]:
test1.shape

(1526, 2)

In [113]:
# From the scores, find the label with the highest score/probability
predicted_labels = scores.argmax(axis=1)
pred = [textcat.labels[label] for label in predicted_labels]

In [114]:
pred = [int(te) for te in pred]

In [115]:
y = list(test1.SECTION.values)

In [116]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y,pred)

In [64]:
cm

array([[310,   7,   6,   0],
       [  7, 560,   3,   4],
       [  1,   0, 382,   0],
       [  4,   7,   0, 235]], dtype=int64)

In [93]:
cm1 = confusion_matrix(y,pred)

In [94]:
cm1

array([[309,   4,   8,   2],
       [  6, 563,   0,   5],
       [  0,   1, 382,   0],
       [  1,   6,   0, 239]], dtype=int64)

In [95]:
from sklearn.metrics import accuracy_score

accuracy_score(y,pred)

0.9783748361730014

In [70]:
sub = pd.read_excel("Sample_submission.xlsx")

In [71]:
sub.head()

Unnamed: 0,SECTION
0,3
1,3
2,3
3,3
4,3


In [96]:
test3 = list(test.STORY.values)

In [97]:
doc1 = [nlp.tokenizer(text) for text in test3]

In [98]:
score, _ = textcat.predict(doc1)

print(score)

[[2.3126749e-24 1.0000000e+00 1.4219341e-27 6.8042236e-26]
 [4.9611945e-03 5.8653387e-03 9.8878968e-01 3.8375531e-04]
 [3.8559435e-25 1.0000000e+00 7.0478273e-25 5.7839805e-31]
 ...
 [9.0674794e-12 1.0000000e+00 4.4576186e-13 2.1177049e-13]
 [1.1190441e-01 5.2530664e-01 1.1336950e-10 3.6278889e-01]
 [6.5779767e-04 9.9921095e-01 2.7419554e-07 1.3103115e-04]]


In [99]:
predicted_label = score.argmax(axis=1)
pred1 = [textcat.labels[label] for label in predicted_label]

In [100]:
pred1 = [int(pr) for pr in pred1]

In [105]:
submission = pd.DataFrame(pred1)

In [106]:
submission = submission.rename(columns={0:'SECTION'})

In [107]:
submission.to_excel('submission1.xlsx',index=False)