### Get the packages

In [62]:
import pandas as pd
import copy
import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob
from datetime import datetime
from xgboost import XGBClassifier
import xgboost as xgb

In [63]:
def analize_sentiment(tweet):
    analysis = TextBlob((str(tweet)))
    return analysis.polarity

### Get the path for the CSV and put it in here 

In [64]:
news = pd.read_csv(r'C:/Users/dcard/Cap-Repo/DeepStockNLP/Data/18-20-csv/merge-18-20-input.csv')

### Get the train news and test news datasets

In [65]:
def get_train_news(day, month, year, data):
    index = 0
    dataset = []
    d1 = datetime(year, month, day).date() 
    for date in data['Date']:
        d = datetime.strptime(date, '%Y-%m-%d').date()
        if d1 > d:
            dataset.append(data.iloc[index])
        index = index +1
        df = pd.DataFrame(dataset)
    return df

In [66]:
def get_test_news(day, month, year,data):
    index = 0
    dataset = []
    d1 = datetime(year, month, day).date() 
    for date in data['Date']:
        d = datetime.strptime(date, '%Y-%m-%d').date()
        if d1 < d:
            dataset.append(data.iloc[index])
        index = index +1
        df = pd.DataFrame(dataset)
    return df

In [67]:
num_column = (len(news.columns))
# day, month, year
train_news = get_train_news(15, 1, 2020, news)
test_news = get_test_news(14, 1, 2020, news)

In [68]:
train_news_list = []
for row in range (0, len(train_news.index)):
    train_news_list.append(' '.join(str(k) for k in train_news.iloc[row,12:num_column]))

In [69]:
vectorize = CountVectorizer(min_df=0.01, max_df=0.8)
news_vector = vectorize.fit_transform(train_news_list)

In [70]:
print("THE TABLE OF FREQUENCY WORD DISTRIBUTION", news_vector.shape)

THE TABLE OF FREQUENCY WORD DISTRIBUTION (512, 298)


### Base Model: Logisitic Regression

In [71]:
lr = LogisticRegression()
model = lr.fit(news_vector, train_news["Label"])

In [72]:
test_news_list = []
for row in range(0, len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:num_column]))

In [73]:
test_vector = vectorize.transform(test_news_list)

In [74]:
predictions = model.predict(test_vector)
pd.crosstab(test_news["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,36,71
1,35,101


In [75]:
accuracy1=accuracy_score(test_news["Label"], predictions)
print("the baseline model accuracy", accuracy1)

the baseline model accuracy 0.5637860082304527


In [76]:
words = vectorize.get_feature_names()
coefficients = model.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Word' : words, 'Coefficient' : coefficients})
coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0,1])
print("Top ten words according to the baseline model", coeffdf.head(10))
print("Last ten words according to the baseline model", coeffdf.tail(10))

Top ten words according to the baseline model             Word  Coefficient
60        change     1.667834
77       digital     0.983596
142    investors     0.967956
292        years     0.927124
256       things     0.913857
191  partnership     0.873318
137        intel     0.866806
288         will     0.854402
203      project     0.813243
205           q1     0.732754
Last ten words according to the baseline model          Word  Coefficient
76   dealbook    -0.783779
238      some    -0.801978
222      says    -0.814151
286       who    -0.814469
235     sinks    -0.826197
239     space    -0.837195
168      more    -0.944323
155       llc    -0.965567
264     trade    -1.061723
145       its    -1.164049


### Random Forest

In [77]:
nvectorize = TfidfVectorizer(min_df=0.01, max_df=0.95, ngram_range=(2,2))
news_nvector = nvectorize.fit_transform(train_news_list)

rfmodel = RandomForestClassifier(random_state = 100, criterion='entropy', max_depth=None, n_estimators=125)
rfmodel = rfmodel.fit(news_nvector, train_news["Label"])
test_news_list = []
for row in range(0, len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:num_column]))
ntest_vector = nvectorize.transform(test_news_list)

rfpredictions = rfmodel.predict(ntest_vector)
accuracyrf = accuracy_score(test_news["Label"], rfpredictions)
print("Random forest with tfid and bigram", accuracyrf)

Random forest with tfid and bigram 0.5720164609053497


### XG Boost/Sentiment Analysis

In [78]:
train_sentiment = copy.deepcopy(train_news)
test_sentiment = copy.deepcopy(test_news)
train_news2 = copy.deepcopy(train_news)
test_news2 = copy.deepcopy(test_news)

train_sentiment = train_sentiment.drop(['Date', 'Label', '1', '2', '3', '4', '5', '6', '7', '8', '9','10'], axis=1)
for column in train_sentiment:
    train_sentiment[column] = train_sentiment[column].apply(analize_sentiment)
    train_news2[column] = train_sentiment[column] + 10
train_sentiment = train_sentiment + 10

test_sentiment = test_sentiment.drop(['Date', 'Label', '1', '2', '3', '4', '5', '6', '7', '8', '9','10'], axis=1)
for column in test_sentiment:
    test_sentiment[column] = test_sentiment[column].apply(analize_sentiment)
    test_news2[column] = test_sentiment[column] + 10  
test_sentiment = test_sentiment + 10

XGB_model = XGBClassifier(random_state=100)
gradiant = XGB_model.fit(train_sentiment, train_news['Label'])
y_pred = gradiant.predict(test_sentiment)





In [79]:
print(confusion_matrix(test_news['Label'], y_pred))
print("Sentiment Accuracy", accuracy_score(test_news['Label'], y_pred))
print("f1_score", f1_score(test_news['Label'], y_pred, average='weighted'))
#print(y_pred)

[[ 30  77]
 [ 25 111]]
Sentiment Accuracy 0.5802469135802469
f1_score 0.546563023929279


### Model 2 - includes trends

In [80]:
train_news2 = train_news2.drop(['Date','Label'], axis = 1)
test_news2 = test_news2.drop(['Date', 'Label'], axis = 1)

XGB_model2 = XGBClassifier()
gradiant2 = XGB_model2.fit(train_news2, train_news['Label'])
y_pred2 = gradiant2.predict(test_news2)


# print(confusion_matrix(test_news['Label'], y_pred))
print("Sentiment Accuracy with Trend", accuracy_score(test_news['Label'], y_pred2))
print("f1_score", f1_score(test_news['Label'], y_pred2, average='weighted'))
# print(y_pred)

Sentiment Accuracy with Trend 0.49382716049382713
f1_score 0.4952688327331598


### Weighted XGBoost (attempted to put weight towards headlines) Vanessa

In [81]:
train_sentiment_weight = train_news
test_sentiment_weight = test_news
weighted_data=[]
empty_data=[]

train_sentiment_weight = train_sentiment_weight.drop(['Date', 'Label'], axis=1)
for column in train_sentiment_weight:
    train_sentiment_weight[column] = train_sentiment_weight[column].apply(analize_sentiment)
train_sentiment_weight = train_sentiment_weight + 10

test_sentiment_weight = test_sentiment_weight.drop(['Date', 'Label'], axis=1)
for column in test_sentiment_weight:
    test_sentiment_weight[column] = test_sentiment_weight[column].apply(analize_sentiment)
test_sentiment_weight = test_sentiment_weight + 10

for column in train_news:
    if not train_news[column].empty:
        empty_data = train_news[column]
    else:
        weighted_data = train_news[column]

weighted_XGB = XGBClassifier()
weighted_XGB.fit(train_sentiment_weight, train_news['Label'], sample_weight=weighted_data)
y_pred_weight = weighted_XGB.predict(test_sentiment_weight, ntree_limit=500)



In [82]:
print("Weighted Accuracy", accuracy_score(test_news['Label'], y_pred_weight))
print("F1 weighted", f1_score(test_news['Label'], y_pred_weight, average='weighted'))

Weighted Accuracy 0.5802469135802469
F1 weighted 0.546563023929279


### Weighted attempt 2 (Vans)

In [83]:
train_sentiment_weight = train_news
test_sentiment_weight = test_news
weighted_data=[]
empty_data=[]

train_sentiment_weight = train_sentiment_weight.drop(['Date', 'Label'], axis=1)
for column in train_sentiment_weight:
    train_sentiment_weight[column] = train_sentiment_weight[column].apply(analize_sentiment)
train_sentiment_weight = train_sentiment_weight + 10

test_sentiment_weight = test_sentiment_weight.drop(['Date', 'Label'], axis=1)
for column in test_sentiment_weight:
    test_sentiment_weight[column] = test_sentiment_weight[column].apply(analize_sentiment)
test_sentiment_weight = test_sentiment_weight + 10

for column in train_news:
    if not train_news[column].empty:
        empty_data = train_news[column]
    else:
        weighted_data = train_news[column]

weighted_XGB1 = XGBClassifier(scale_pos_weight = 60)
weighted_XGB1.fit(train_sentiment_weight, train_news['Label'], sample_weight = weighted_data)
y_pred_weight1 = weighted_XGB1.predict(test_sentiment_weight)



In [84]:
print("Weighted Accuracy", accuracy_score(test_news['Label'], y_pred_weight1))
print("F1 weighted", f1_score(test_news['Label'], y_pred_weight1, average='weighted'))

Weighted Accuracy 0.5679012345679012
F1 weighted 0.4769834433917121


### Weighted - using trends (Erika)

In [85]:
def get_train_weight(train_sentiment):
    train_weight = []

    for across in range(len(train_sentiment)):
        for i in range(500):
            train_weight.append(0.5)
        for x in train_sentiment:
            if x not in ['Date', 'Label', '1', '2','3','4','5','6','7','8','9', '10'] :
                if train_sentiment[x][across] == 10.0:
                    train_weight.append(0.5)
                else:
                    train_weight.append(0.8)
    return train_weight

In [86]:
def get_test_weight(train_sentiment, test_sentiment):

    test_weight = []
    j = len(train_sentiment)
    for across in range(len(test_sentiment)):
        for i in range(500):
            test_weight.append(0.5)
        for x in test_sentiment:
            if x not in ['Date', 'Label', '1', '2','3','4','5','6','7','8','9', '10'] :
                if test_sentiment[x][j] == 10.0:
                    test_weight.append(0.5)
                else:
                    test_weight.append(0.8)
        j = j + 1

In [87]:
train_weight = get_train_weight(train_sentiment)
test_weight = get_test_weight(train_sentiment, test_sentiment)

XGB_model4 = XGBClassifier()

num_round = 2
param = {'max_depth' : 2, 'eta': 1, 'objective':'binary:logistic' }
gradiant4 = XGB_model4.fit(train_news2, train_news['Label'])
y_pred4 = gradiant4.predict(test_news2)

print("Sentiment Accuracy with Trend with weight", accuracy_score(test_news['Label'], y_pred4))
print("f1_score", f1_score(test_news['Label'], y_pred4, average='weighted'))


Sentiment Accuracy with Trend with weight 0.49382716049382713
f1_score 0.4952688327331598


In [88]:
print("Sentiment Weighted Accuracy 2", accuracy_score(test_news['Label'], y_pred_weight))
print("F1 weighted", f1_score(test_news['Label'], y_pred_weight, average='weighted'))

Sentiment Weighted Accuracy 2 0.5802469135802469
F1 weighted 0.546563023929279


## Individual model for prices and headlines

### Prices Model

In [89]:
head_cols = train_sentiment.columns

price_train = train_news.drop(head_cols, axis = 1)
price_train = price_train.drop(['Date', 'Label'], axis =1)

price_test = test_news.drop(head_cols, axis = 1)
price_test = price_test.drop(['Date', 'Label'], axis =1)

dtrain = xgb.DMatrix( data = price_train, label = train_news['Label'])
dtest = xgb.DMatrix(data = price_test, label = test_news['Label'])
params  = dict(max_depth=2, eta=1, verbose=0, nthread=2, eval_metric = "auc",
              objective="binary:logistic")

m = xgb.train(params, dtrain)

y_price = m.predict(dtest)

i = 0
y = []
for p in y_price:
    if p > 0.49:
        y.append(1)
    else:
        y.append(0)
    i = i + 1
print("Sentiment Accuracy with Trend with weight", accuracy_score(test_news['Label'], y))
print("f1_score", f1_score(test_news['Label'], y, average='weighted'))

Parameters: { verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Sentiment Accuracy with Trend with weight 0.4567901234567901
f1_score 0.45651413957424886


### Headlines Model

In [90]:
head_cols = train_sentiment.columns

head_train = copy.deepcopy(train_sentiment)

head_test = copy.deepcopy(test_sentiment)

dtrain2 = xgb.DMatrix( data = head_train, label = train_news['Label'])
dtest2 = xgb.DMatrix(data = head_test, label = test_news['Label'])
params2  = dict(max_depth=2, eta=1, verbose=0, nthread=2, eval_metric = "auc",
              objective="binary:logistic")

m = xgb.train(params2, dtrain2)

y_head = m.predict(dtest2)

i = 0
y_h = []
for p in y_head:
    if p > 0.49:
        y_h.append(1)
    else:
        y_h.append(0)
    i = i + 1
print("Sentiment Accuracy with Trend with weight", accuracy_score(test_news['Label'], y_h))
print("f1_score", f1_score(test_news['Label'], y_h, average='weighted'))

Parameters: { verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Sentiment Accuracy with Trend with weight 0.5761316872427984
f1_score 0.5332367592662777


### combined models

In [91]:
import numpy as np
df_price = pd.DataFrame(y_price, columns = ["Price"])
df_head = pd.DataFrame(y_head, columns = ["Head"])

date = test_news.drop(['1','2','3','4','5','6','7','8','9','10'], axis = 1)
date = date.drop(test_sentiment.columns, axis = 1)
date.index = np.arange(0, len(date))


df_date = pd.DataFrame(date, columns = ["Date", "Label"])
df1 = df_date.join(df_price)
df2= df1.join(df_head)

train_date = df2['Date'][(int(len(df2) * .7))]
year = train_date[0:4]
month = train_date[5:7]

day = train_date[8:10]
test_date =  df2['Date'][(int(len(df2) * .7)-1)]
 
tst_day =  test_date[8:10]
tst_month = test_date[5:7]
tst_year =test_date[0:4]

#day, month year
train_d = get_train_news(int(day), int(month) , int(year), df2)

test_d = get_test_news(int(tst_day), int(tst_month), int(tst_year), df2)

2020-09-16


In [92]:
# print(price_train)
w = [0.10, 0.90]
tr_d = train_d.drop(['Date', 'Label'], axis = 1)
tst_d = test_d.drop(['Date', 'Label'], axis =1)

dtrain3 = xgb.DMatrix( data = tr_d, label = train_d['Label'], feature_names = ['Price', 'Head'])

dtrain3.set_info(feature_weights = w)

dtest3 = xgb.DMatrix(data = tst_d, label = test_d['Label'],  feature_names = ['Price', 'Head'])

dtest3.set_info(feature_weights = w)

params3  = dict(max_depth=2, eta=1, verbose=0, nthread=2, eval_metric = "auc",
              objective="binary:logistic")
m1 = xgb.train(params2, dtrain3)

y_final = m1.predict(dtest3)
print(y_final)
i = 0
y_new = []
for p in y_final:
    if p > 0.49:
        y_new.append(1)
    else:
        y_new.append(0)
    i = i + 1

print("Sentiment Accuracy with Trend with weight", accuracy_score(test_d['Label'], y_new))
print("f1_score", f1_score(test_d['Label'], y_new, average='weighted'))

Parameters: { verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0.70032674 0.70032674 0.4180513  0.4180513  0.21547507 0.20682396
 0.31626812 0.5106079  0.8311066  0.85183805 0.87992615 0.47649378
 0.7561273  0.70032674 0.21923788 0.87710625 0.62784576 0.62784576
 0.89351994 0.45211    0.42100504 0.7561273  0.27939972 0.21065919
 0.51405436 0.9023486  0.7152642  0.2948649  0.4180513  0.9623284
 0.70032674 0.3604483  0.87992615 0.87710625 0.47649378 0.27939972
 0.70032674 0.37090537 0.37090537 0.47649378 0.62784576 0.3783967
 0.8311066  0.43158674 0.87992615 0.4180513  0.27939972 0.7294282
 0.37934357 0.70032674 0.62784576 0.36210492 0.4180513  0.87992615
 0.7152642  0.3783967  0.62784576 0.7152642  0.42100504 0.3783967
 0.70032674 0.610785   0.89351994 0.72015625 0.4180

### All scores are printed out for comparison

In [93]:
print("Base model", accuracy1)
print("Random Forest", accuracyrf)
print("Sentiment Accuracy", accuracy_score(test_news['Label'], y_pred))
print("Sentiment Accuracy with Trends", accuracy_score(test_news['Label'], y_pred2))
print("Weighted Accuracy (Vans)", accuracy_score(test_news['Label'], y_pred_weight))
print("Sentiment Accuracy with Trend with weight (Erika)", accuracy_score(test_news['Label'], y_pred4))
print("Weighted accuracy (Vans) attempt 2", accuracy_score(test_news['Label'], y_pred_weight1))
print("Sentiment Accuracy (seperate models)", accuracy_score(test_d['Label'], y_new))

Base model 0.5637860082304527
Random Forest 0.5720164609053497
Sentiment Accuracy 0.5802469135802469
Sentiment Accuracy with Trends 0.49382716049382713
Weighted Accuracy (Vans) 0.5802469135802469
Sentiment Accuracy with Trend with weight (Erika) 0.49382716049382713
Weighted accuracy (Vans) attempt 2 0.5679012345679012
Sentiment Accuracy (seperate models) 0.6301369863013698
