In [1]:
# Importing required modules for Spacy text classification
import pandas as pd
import numpy as np # linear algebra
import random
import spacy
from spacy.util import minibatch, compounding

In [2]:
#Getting the data from source excel
train_df = pd.read_excel('Thinkcol_case_study.xlsx',sheet_name='train')
test_df = pd.read_excel('Thinkcol_case_study.xlsx',sheet_name='predict')

In [26]:
### Exploring the data

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1743 entries, 0 to 1742
Data columns (total 4 columns):
ID          1743 non-null int64
Mention     1743 non-null object
Target      1743 non-null int64
Category    1743 non-null object
dtypes: int64(2), object(2)
memory usage: 54.5+ KB


In [4]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48415 entries, 0 to 48414
Data columns (total 4 columns):
id            48415 non-null int64
Message ID    48415 non-null int64
Date          48415 non-null datetime64[ns]
Mention       48385 non-null object
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 1.5+ MB


In [5]:
# Deleting the data from test where reviews are Nan
test_df.dropna(axis = 0, subset=['Mention'],inplace =True)

In [6]:
train_df['Target'].value_counts() #imbalanced dataset

0    1666
1      77
Name: Target, dtype: int64

In [7]:
#Format the dataset for use in spacy
train_df['dataset'] = train_df.apply(lambda row: (row['Mention'],row['Target']), axis=1)
train = train_df['dataset'].tolist()

In [8]:
train[0]

('moto G5 plus azul safira e com apenas 1 semana de uso o mesmo apresentou defeito o CHIP 2 não tem sinal e pela pesquisa que fiz este defeito é característico do moto g então entrei em contato com a Motorola para resolução do meu problema através do chat a mesma passou alguns procedimentos que não deram certo e então solicitaram que eu enviasse o celular para a assistência técnica que fica 400km da',
 1)

In [10]:
#helper functions to split train data into train and validation data
def load_data(limit=0, split=0.8):
    # Partition off part of the train data for evaluation
    train_data = train
    random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{1: bool(y)} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

In [11]:
# Helper function for model performance evaluation metrics
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

In [15]:
# Loading the model for portugese language, traing and validation the model using train data 
def main(model=None,n_iter=20,n_texts=2000):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    textcat.add_label(1)

    # load the train dataset and split into train and validation set
    print("Loading train data...")
    (train_texts, train_cats), (val_texts, val_cats) = load_data(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)"
          .format(n_texts, len(train_texts), len(val_texts)))
    train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                #print(annotations)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the validation data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, val_texts, val_cats)
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))
    # Saving the trained model        
    output_dir = "./thinkcol_spacy_clf"
    nlp.to_disk(output_dir)

In [16]:
# Calling the main method
if __name__ == '__main__':
    main('pt_core_news_sm')

Loaded model 'pt_core_news_sm'
Loading train data...
Using 2000 examples (1394 training, 349 evaluation)
Training the model...
LOSS 	  P  	  R  	  F  
15.241	0.500	0.000	0.000
13.538	0.429	0.214	0.286
12.195	1.000	0.214	0.353
6.773	1.000	0.571	0.727
4.185	1.000	0.643	0.783
2.803	0.909	0.714	0.800
1.772	1.000	0.643	0.783
1.640	1.000	0.643	0.783
0.855	0.909	0.714	0.800
1.074	1.000	0.714	0.833
1.240	1.000	0.714	0.833
0.790	1.000	0.643	0.783
1.141	1.000	0.643	0.783
0.945	1.000	0.643	0.783
0.945	1.000	0.643	0.783
1.134	1.000	0.643	0.783
0.741	1.000	0.643	0.783
0.884	1.000	0.643	0.783
0.681	1.000	0.643	0.783
0.543	1.000	0.714	0.833


In [25]:
#Loading the saved model
output_dir = "./thinkcol_spacy_clf"
testnlp = spacy.load(output_dir)
#Format the dataset for use in spacy
test_df['dataset'] = test_df.apply(lambda row: (row['Mention']), axis=1)
test = test_df['dataset'].tolist()
# Predicting the target varible for test data
test_data = 'moto G5 plus azul safira e com apenas 1 semana de uso o mesmo apresentou defeito o CHIP 2 não tem sinal e pela pesquisa que fiz este defeito é característico do moto g então entrei em contato com a Motorola para resolução do meu problema através do chat a mesma passou alguns procedimentos que não deram certo e então solicitaram que eu enviasse o celular para a assistência técnica que fica 400km da'
testdoc = testnlp(test_data)
testdoc.cats

{1: 0.9966436624526978}