In [182]:
import pandas as pd
import spacy
from spacy import displacy
from spacy.matcher import Matcher
import random
import re

In [200]:
# Read the labelled tweets CSV into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='data_labeling/labelled_data_300.csv')

In [206]:
# Give the columns the names used by the rest of the notebook.
df = df.rename(columns={'text': 'tweet', 'relevent_or_not': 'label'})

In [208]:
# Encode the label as an integer flag: 1 where the value is 'Relevant', else 0.
df['label'] = (df['label'] == 'Relevant') * 1

In [210]:
# Anchored at the start of the string so that only a leading "<digits>," id
# prefix is matched; commas and numbers inside the tweet body are left alone.
pattern = '^[0-9]*,'
def find_patt(row):
    """Remove the id from the start of the string.

    Parameters
    ----------
    row : str
        Tweet text, optionally prefixed with a numeric id and a comma
        (e.g. ``"12345,some tweet"``).

    Returns
    -------
    str
        The text without the leading ``<digits>,`` prefix. Text that has no
        such prefix is returned unchanged. Non-string values (for example a
        missing value read from the CSV) are returned as they are.
    """
    if not isinstance(row, str):
        # Nothing to strip; passing this to re.sub would raise TypeError.
        return row
    new = re.sub(pattern, '', row, count=1)
    return new
# Clean every tweet by running it through find_patt.
df['tweet'] = df['tweet'].map(find_patt)

In [211]:
# Load the spaCy model that the text classifier will be added to.
nlp = spacy.load(name="en_core_web_md")

In [212]:
# Add a text classifier as the last pipeline component. If the pipeline
# already has a "textcat" component (e.g. this cell is run a second time),
# reuse it instead of adding a duplicate, which add_pipe rejects.
if "textcat" not in nlp.pipe_names:
    text_cat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "simple_cnn"},
    )
    nlp.add_pipe(text_cat, last=True)
else:
    text_cat = nlp.get_pipe("textcat")
# Show the resulting component order.
nlp.pipe_names

['tagger', 'parser', 'ner', 'textcat']

In [213]:
# Register the two categories on the text classifier. These are the same
# names used as keys in the category dicts built by load_data().
text_cat.add_label("Relevant")
text_cat.add_label("Not_Relevant")

1

In [215]:
# Pair each tweet with its label as a (tweet, label) tuple, one per row.
df['tuples'] = list(zip(df['tweet'], df['label']))

In [216]:
# Plain Python list of the (tweet, label) tuples.
train = list(df['tuples'])

In [217]:
# Preview the first five (tweet, label) tuples.
train[:5]

[('RT JRNYcrypto THE CRYPTO GIVEAWAY CONTEST IS NOW LIVE 25K IN CRYPTO PRIZES Prizes 10 winners of 1000 of ETH 15 winners',
  0),
 ('frogghie holystreem QuantStratTradR PPathole Elon Musk has been causing a lot of pain to Tesla s investors For how long Elon Musk will continue on inflicting pain to Tesla s investors Elon get rid of Bitcoin today And investors around the world will do the cheer up dance Jerusalem for you and Tesla',
  1),
 ('Bitcoin Black v Bitcoin Get FREE coins 36 value https t co mNut0IabRp', 0),
 ('jasonkendal5 elonmusk Thanks for your support I have a special gift for you Visit www elon win com remove slashes doge btc eth bitcoin ethereum dogecoin',
  0),
 ('RT TonySparkOG Check out my Cryptocurrency guides playlist on YouTube to get some great value for free Guides https t co sArKXBTFlx',
  0)]

In [218]:
def load_data(limit=0, split=0.8, data=None):
    """Shuffle labelled examples and split them into train and evaluation sets.

    Parameters
    ----------
    limit : int, optional
        Maximum number of examples to keep after shuffling. ``0`` (the
        default) or a negative value keeps every example.
    split : float, optional
        Fraction of the kept examples that goes into the training partition.
    data : sequence of (text, label) pairs, optional
        Examples to use. When omitted, the module-level ``train`` list is
        used.

    Returns
    -------
    tuple
        ``((train_texts, train_cats), (dev_texts, dev_cats))``. The texts are
        tuples of strings; the cats are lists of
        ``{"Relevant": bool, "Not_Relevant": bool}`` dicts, one per text.
        Empty input yields ``((), []), ((), [])``.
    """
    # Copy first: random.shuffle works in place and must not reorder the
    # caller's list (previously the global `train` was shuffled via an alias).
    examples = list(train if data is None else data)
    # Shuffle the data
    random.shuffle(examples)
    if limit and limit > 0:
        examples = examples[-limit:]
    if not examples:
        # zip(*[]) cannot be unpacked into two names, so handle this up front.
        return ((), []), ((), [])

    texts, labels = zip(*examples)
    # One category dict per example; exactly one of the two flags is True.
    cats = [{"Relevant": bool(y), "Not_Relevant": not bool(y)} for y in labels]

    # Splitting the training and evaluation data
    cut = int(len(examples) * split)
    return (texts[:cut], cats[:cut]), (texts[cut:], cats[cut:])

In [219]:

# Shuffle and split the labelled tweets into training and evaluation parts.
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=len(df))

# Build the (text, annotations) pairs used for training, with each category
# dict wrapped under the 'cats' key.
train_data = [
    (text, {'cats': cats})
    for text, cats in zip(train_texts, train_cats)
]
train_data[:10]

[('RT shortthebanks Want a chance to win 50 in Bitcoin Just retweet and tag someone to enter 1 Winner will be picked in 7 days solar',
  {'cats': {'Relevant': False, 'Not_Relevant': True}}),
 ('You have heard about Bitcoin and TSLA but missed your chance anon Mmm do not worry sir I got you covered White paper will be released next week I got my team ready Only need a small amount of 100M and the token launch shall be successful https t co l7ffhENdQe',
  {'cats': {'Relevant': True, 'Not_Relevant': False}}),
 ('If you re considering crypto now do check out the latokens app on PlayStore amp AppStore Clean interface and pretty high rated You can also earn free crypto in the ZIL giveaway https t co w4iNMET940 LatokenGiveaway LatokenApp Bitcoin',
  {'cats': {'Relevant': False, 'Not_Relevant': True}}),
 ('RT FOUR20Finance NEW Airdrop running for FOUR20Finance FIRST 500 people to join https t co 6RAsTHS7sR will receive our new BinanceS',
  {'cats': {'Relevant': False, 'Not_Relevant': True}}),


In [220]:
from spacy.util import minibatch, compounding

# Number of passes over the training data. It was not defined anywhere in the
# notebook, so a fresh kernel failed with NameError; the recorded output of
# this cell shows ten iterations (0-9).
n_iter = 10

# Disabling other components
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()

    # Performing training
    for i in range(n_iter):
        # Losses are accumulated per iteration and reported after each pass.
        losses = {}
        # Batch size is produced by compounding(4., 32., 1.001).
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)
        print(f"Losses at iteration {i}", losses)


Losses at iteration 0 {'textcat': 1.2384182746582155}
Losses at iteration 1 {'textcat': 0.4856345720027093}
Losses at iteration 2 {'textcat': 0.23281130998245292}
Losses at iteration 3 {'textcat': 0.12064107727380835}
Losses at iteration 4 {'textcat': 0.1398355695296274}
Losses at iteration 5 {'textcat': 0.12926229437135678}
Losses at iteration 6 {'textcat': 0.12983068492017796}
Losses at iteration 7 {'textcat': 0.10173017108903035}
Losses at iteration 8 {'textcat': 0.10330081909559327}
Losses at iteration 9 {'textcat': 0.13089422827849906}


In [227]:
# Sample tweet to classify. It is assigned here so the cell does not depend
# on a variable defined further down the notebook.
# Alternative sample: "Central Bank of Kenya to Reportedly Use Bitcoin as Reserve Currency"
test_text = 'RT Money earn Fi KYFI Airdrop 1 KYFI 300 USD For Completing the tasks Get 0 1 KYFI 30 For Each Valid Refer Get 0 05 KYFI'
doc=nlp(test_text)
# Per-category scores assigned by the text classifier.
doc.cats

{'Relevant': 2.699279377793573e-07, 'Not_Relevant': 0.9999997615814209}

In [226]:
# Sample tweet used for a manual prediction check.
test_text = (
    'RT Money earn Fi KYFI Airdrop 1 KYFI 300 USD For Completing the tasks Get 0 1 KYFI 30 For Each Valid Refer Get 0 05 KYFI'
)