In [339]:
import spacy 
import json 
import os
import pandas as pd
import sparknlp
from pyspark.ml import Pipeline

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql import SparkSession

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from tqdm import tqdm
import random 
from spacy.util import minibatch, compounding
from spacy.training import Example
from spacy.tokens import DocBin

print('Packages installed successfully')

Packages installed successfully


## Opening text data

In [4]:
os.chdir(r'C:\Users\sivas\ML_reddit_nlp\data_cleaned')

In [7]:
f = open('text_data.txt', 'r')
text = f.read()
text[0:2]

'US'

In [17]:
json_file_path = "training_data (2).json"

with open(json_file_path, 'r') as j:
     TRAINING_DATA = json.loads(j.read())

In [18]:
# Displaying contents
TRAINING_DATA

{'classes': ['DERIVATIVE',
  'POSITION',
  'TREND',
  'INDEX',
  'STOCK',
  'CEO',
  'REGION',
  'ORGANISATION '],
 'annotations': [['US Futures pointing lower...PANIC!!! Well shucks, I put a lot of longs of yesterday. RIP monie.China getting a bit rekt too at the open.US futures pointing down over 1.3%, and China, about 2 %.',
   {'entities': [[3, 10, 'DERIVATIVE'],
     [65, 70, 'POSITION'],
     [139, 146, 'DERIVATIVE'],
     [156, 160, 'TREND']]}],
  ["Indisputable proof that the shemitah is upon us.A while back, I read this [interesting piece] (by a guy who has a simple method for timing the market. The method is this: when the S & P 500 closes below its 300-day moving average, you sell. When it closes above its 300-day moving average, you buy. The article linked above shows some pretty impressive returns if you followed his method.But, the graphs in his article end in 2012.So, I created this graph of S & P500 closing price vs its 300-day moving average, to the present. right. It 

In [38]:
for annotation in TRAINING_DATA['annotations']:
    text, entity = annotation[0], annotation[1]
    print(text, entity, '\n')

Daily advice thread. All questions about your personal situation should be asked hereIf your question is " I have $10,000, what do I do? " or anything similar. There is no single answer to this question, but we will also need A LOT MORE information if we are to give some sort of answer* How old are you? * Are you employed/making income? How much? * What are your objectives with this money? (buy a house? Retirement savings?) * What is your risk tolerance? (Do you mind risking it at blackjack or do you need to know its 100% safe?) * What are you current holdings? (Do you already have exposure to specific funds and sectors?) * Any other assets? House paid off? Cars? Expensive girlfriend? (not really an asset) * What is your time horizon? Do you need this money next month? Next 20yrs? * Any big debts? * Any other relevant financial information will be useful to give you a proper answer.Be aware that these answers are just opinions of Redditors and should be used as a starting point for you

In [326]:
x = 34
example = TRAINING_DATA['annotations'][x][1]
print(example)

{'entities': [[2, 6, 'STOCK'], [31, 37, 'INDEX'], [866, 870, 'INDEX']]}


In [248]:
'''
x = 34
2, 6, 'STOCK'
31, 37, 'INDEX'
866, 870, 'INDEX'

'''

'\n\n'

In [312]:
TRAINING_DATA['annotations'][x][0]

"$ GILD shares are cheapGilead (NASDAQ: [GILD] (ended [2014] (with an impressive increase in its top and bottom lines. Product sales doubled, compared to the prior fiscal year, and non-GAAP earnings per share more than quadrupled. Gilead's fiscal 2015 is carrying the same momentum. The second-quarter earnings report revealed that product sales have increased significantly from the prior-year period, aided by the performance of Gilead's Hepatitis C drug Harvoni. The successful quarter led Gilead to raise its guidance for the fiscal year of 2015.Any investor that is aware of Gilead knows that one of its most successful products is the company's hepatitis C treatment Harvoni; hepatitis C medicines from Gilead account for ~60% of its revenue. Though we think the convenience and efficacy of Harvoni is simply hard-to-beat as it relates to a hep-C cure, AbbVie (NYSE: [ABBV] (offers a substitute for Gilead's treatment in its Viekira Pak. Merck (NYSE: [MRK] (and others are also active in the he

In [322]:
TRAINING_DATA['annotations'][x][0][870 : 874]

': [A'

In [323]:
TRAINING_DATA['annotations'][x][1] = {'entities': [[2, 6, 'STOCK'], [31, 37, 'INDEX'], [866, 870, 'INDEX']]}

In [324]:
TRAINING_DATA['annotations'][x]

["$ GILD shares are cheapGilead (NASDAQ: [GILD] (ended [2014] (with an impressive increase in its top and bottom lines. Product sales doubled, compared to the prior fiscal year, and non-GAAP earnings per share more than quadrupled. Gilead's fiscal 2015 is carrying the same momentum. The second-quarter earnings report revealed that product sales have increased significantly from the prior-year period, aided by the performance of Gilead's Hepatitis C drug Harvoni. The successful quarter led Gilead to raise its guidance for the fiscal year of 2015.Any investor that is aware of Gilead knows that one of its most successful products is the company's hepatitis C treatment Harvoni; hepatitis C medicines from Gilead account for ~60% of its revenue. Though we think the convenience and efficacy of Harvoni is simply hard-to-beat as it relates to a hep-C cure, AbbVie (NYSE: [ABBV] (offers a substitute for Gilead's treatment in its Viekira Pak. Merck (NYSE: [MRK] (and others are also active in the h

## Setting up the model

In [20]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.1.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [21]:
model = spacy.load('en_core_web_sm')

In [22]:
# Checking the pipes available in the model 
model.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [23]:
n_iter = 100
ner = model.get_pipe('ner')

In [26]:
# Adding in the custom ner labels to the model
for label in TRAINING_DATA['classes']:
    ner.add_label('label')

## Converting the training data to a .spacy file for training the model

In [341]:
db = DocBin() # Creating a DocBin object

# Iterate through all the annotations
for text, annotation in tqdm(TRAINING_DATA['annotations']):
    doc = model.make_doc(text)
    ents = []
    
    ner_list = annotation['entities']
    if len(ner_list) > 0:
        for start, end, label in ner_list:
            span = doc.char_span(start, end, label=label, alignment_mode='contract')
            if span is None:
                print('Skipping entity')
            else:
                ents.append(span)
    doc.ents = ents
    db.add(doc)
    
    db.to_disk('new_training_data.spacy')

 14%|█▍        | 5/35 [00:00<00:01, 29.87it/s]

Skipping entity


 49%|████▊     | 17/35 [00:00<00:00, 28.59it/s]

Skipping entity
Skipping entity


 97%|█████████▋| 34/35 [00:01<00:00, 19.77it/s]

Skipping entity
Skipping entity


100%|██████████| 35/35 [00:01<00:00, 23.24it/s]


## Creating the config file for model training

In [342]:
# .cfg file is filled in the command line

## Creating a TESTING_DATA file

In [344]:
df = pd.read_csv('reddit_data_19102021.csv')
reddit_text_data = df['text'][100:200]
reddit_text_data.head()

100    Daily advice thread. All questions about your ...
101    GS lowers oil expectations, scenario could see...
102    ELI5: Why REIT seems to be a cornerstone of a ...
103    Silver Lake Partners buys 20% of AMD  RIP SPAC...
104    GPRO, please stop!GoPro hits record low since ...
Name: text, dtype: object

In [345]:
with open('test_data.txt', 'w',encoding="utf-8") as outfile:
    reddit_text_data.to_string(outfile)

In [348]:
test = open('test_data.txt', 'r')
test_text = test.read()
test_text

'100    Daily advice thread. All questions about your ...\n101    GS lowers oil expectations, scenario could see...\n102    ELI5: Why REIT seems to be a cornerstone of a ...\n103    Silver Lake Partners buys 20% of AMD  RIP SPAC...\n104    GPRO, please stop!GoPro hits record low since ...\n105    Can you feel the YOLO? Can you FEEL it!?Well i...\n106    $MTN - Anyone considering Vail Resorts?They\'re...\n107    AVRN at it again, Pump and Dump scheme using W...\n108    Advice on Inverse ETFs and Canadian stocks to ...\n109    Wynn Resorts (WYNN)In the midst of the bad new...\n110    AMD up 8% in the last 15 minutes  Edit: looks ...\n111    I told you Mother Fuckers to get in AMD calls ...\n112    What indicators do you use?Everyone knows fund...\n113    I think i found the biggest joke of a IPO EVER...\n114    suggestions for ira investments at schwabI\'m l...\n115    [AQXP] Baker Brothers now own 36% of the compa...\n116    All ya\'ll bears \'bout to get fuckedFace it, Fe...\n117    He

## Creating .spacy file for test data

In [350]:
db2 = DocBin() # Creating a docbin object

# Iterate through all the annotations
for text in tqdm(test_text):
    doc = model.make_doc(text)
    ents = []
    
    doc.ents = ents
    db.add(doc)
    
    db.to_disk('testing_data.spacy')

100%|██████████| 5715/5715 [07:47<00:00, 12.22it/s]


## Training the model on custom labels

In [338]:
'''
# Disable all other pipes to save on computation time
other_pipes = [pipe for pipe in model.pipe_names if pipe != 'ner']  
with model.disable_pipes(*other_pipes): 
    optimizer = model.create_optimizer()
    for iteration in range(n_iter):
        random.shuffle(TRAINING_DATA['annotations'])
        losses = {}
        for raw_text, annotation in TRAINING_DATA['annotations']:
            doc = model.make_doc(raw_text)
            example = Example.from_dict(doc, annotation)
            model.update([example], sgd=optimizer)
        print(losses)
'''
# Running this command in the command line 
python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./testing_data.spacy

{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}


In [351]:
# Code to fix dll error
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"