### Get the packages

In [1]:
import pandas as pd
import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob
from datetime import datetime
from xgboost import XGBClassifier

In [2]:
def analize_sentiment(tweet):
    analysis = TextBlob((str(tweet)))
    return analysis.polarity

### Get the path for the CSV and put it in here

In [3]:
news = pd.read_csv("/Users/vanessahoang/documents/deepstocknlp/data/18-20-csv/2018-2020_model_input.csv")

### Get the test news and training news dataset

In [4]:
def get_train_news(day, month, year):
    index = 0
    dataset = []
    d1 = datetime(year, month, day).date() 
    for date in news['Date']:
        d = datetime.strptime(date, '%Y-%m-%d').date()
        if d1 > d:
            dataset.append(news.iloc[index])
        index = index +1
        df = pd.DataFrame(dataset)
    return df

In [5]:
def get_test_news(day, month, year):
    index = 0
    dataset = []
    d1 = datetime(year, month, day).date() 
    for date in news['Date']:
        d = datetime.strptime(date, '%Y-%m-%d').date()
        if d1 < d:
            dataset.append(news.iloc[index])
        index = index +1
        df = pd.DataFrame(dataset)
    return df

In [274]:
num_column = (len(news.columns))
# day, month, year
train_news = get_train_news(15, 1, 2018)
test_news = get_test_news(14, 1, 2018)

In [275]:
train_news_list = []

In [276]:
for row in range (0, len(train_news.index)):
    train_news_list.append(' '.join(str(k) for k in train_news.iloc[row,12:num_column]))

### Word Frequency

In [277]:
vectorize = CountVectorizer(min_df=0.01, max_df=0.8)
news_vector = vectorize.fit_transform(train_news_list)

In [278]:
print("THE TABLE OF FREQUENCY WORD DISTRIBUTION", news_vector.shape)

THE TABLE OF FREQUENCY WORD DISTRIBUTION (8, 18)


### Base Model: Logistic Regression

In [279]:
lr = LogisticRegression()
model = lr.fit(news_vector, train_news["Label"])

In [280]:
test_news_list = []
for row in range(0, len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:num_column]))

In [281]:
test_vector = vectorize.transform(test_news_list)

In [282]:
predictions = model.predict(test_vector)
pd.crosstab(test_news["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,1
Actual,Unnamed: 1_level_1
0,344
1,380


In [283]:
accuracy1=accuracy_score(test_news["Label"], predictions)
print("the baseline model accuracy", accuracy1)

the baseline model accuracy 0.5248618784530387


In [284]:
words = vectorize.get_feature_names()
coefficients = model.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Word' : words, 'Coefficient' : coefficients})

In [285]:
coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0,1])
print("Top ten words according to the baseline model", coeffdf.head(10))
print("Last ten words according to the baseline model", coeffdf.tail(10))

Top ten words according to the baseline model         Word  Coefficient
9     google     0.506022
17      with     0.363446
4   changing     0.190605
6      codes     0.190605
8    country     0.190605
1     albums     0.172840
12    online     0.172840
13    photos     0.172840
14   sharing     0.172840
0    against     0.142577
Last ten words according to the baseline model             Word  Coefficient
14       sharing     0.172840
0        against     0.142577
2         author     0.142577
3           bias     0.142577
5       claiming     0.142577
7   conservative     0.142577
10          memo     0.142577
11           men     0.142577
15          sues     0.142577
16         white     0.142577


### Logistic Regression with Bigram and TFID

In [286]:
nvectorize = TfidfVectorizer(min_df=0.01, max_df=0.90, ngram_range=(2,2))
news_nvector = nvectorize.fit_transform(train_news_list)
print("TFID TRANSFORMATION DATAFRAME SHAPE", news_nvector.shape)
nmodel = lr.fit(news_nvector, train_news["Label"])
test_news_list = []
for row in range (0,len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:num_column]))
ntest_vector = nvectorize.transform(test_news_list)
npredictions = nmodel.predict(ntest_vector)
pd.crosstab(test_news["Label"], npredictions, rownames=["Actual"], colnames=["Predicted"])
accuracy2 = accuracy_score(test_news['Label'], npredictions)
print("Logistics Regression with Bigram and TFID", accuracy2)
nwords = nvectorize.get_feature_names()
ncoefficients = nmodel.coef_.tolist()[0]
ncoeffdf = pd.DataFrame({'Word':nwords, 'Coefficient':ncoefficients})
ncoeffdf = ncoeffdf.sort_values(['Coefficient', 'Word'], ascending=[0,1])
ncoeffdf.head(10)
ncoeffdf.tail(10)
print(ncoeffdf.head(10))
print(ncoeffdf.tail(10))

TFID TRANSFORMATION DATAFRAME SHAPE (8, 20)
Logistics Regression with Bigram and TFID 0.5248618784530387
                Word  Coefficient
19       with google     0.237026
4   changing country     0.148222
6         codes with     0.148222
8      country codes     0.148222
10        google nan     0.148222
1        albums with     0.134599
11     google photos     0.134599
14     online albums     0.134599
15        photos nan     0.134599
16    sharing online     0.134599
                  Word  Coefficient
0        against white     0.104129
2          author sues     0.104129
3         bias against     0.104129
5        claiming bias     0.104129
7     conservative men     0.104129
9          google memo     0.104129
12         memo author     0.104129
13             men nan     0.104129
17       sues claiming     0.104129
18  white conservative     0.104129


### Random Forest

In [287]:
nvectorize = TfidfVectorizer(min_df=0.01, max_df=0.95, ngram_range=(2,2))
news_nvector = nvectorize.fit_transform(train_news_list)

rfmodel = RandomForestClassifier(random_state = 100, criterion='entropy')
rfmodel = rfmodel.fit(news_nvector, train_news["Label"])
test_news_list = []
for row in range(0, len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:num_column]))
ntest_vector = nvectorize.transform(test_news_list)

rfpredictions = rfmodel.predict(ntest_vector)
accuracyrf = accuracy_score(test_news["Label"], rfpredictions)
print("Random forest with tfid and bigram", accuracyrf)

Random forest with tfid and bigram 0.4723756906077348


### Naive Bayes

In [288]:
nvectorize = TfidfVectorizer(min_df=0.01, max_df=0.9,ngram_range=(2,2))
news_nvector = nvectorize.fit_transform(train_news_list)

nbmodel = MultinomialNB(alpha=0.5)
nbmodel = nbmodel.fit(news_nvector, train_news["Label"])

test_news_list = []
for row in range(0, len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:num_column]))
ntest_vector = nvectorize.transform(test_news_list)

nbpredictions = nbmodel.predict(ntest_vector)
nbaccuracy = accuracy_score(test_news['Label'], nbpredictions)
print("Naive Bayes accuracy: ", nbaccuracy)

Naive Bayes accuracy:  0.5248618784530387


### Gradient Boosting

In [289]:
gbmodel = GradientBoostingClassifier(random_state = 52)
gbmodel = gbmodel.fit(news_nvector, train_news["Label"])

test_news_list = []
for row in range(0, len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:num_column]))
ntest_vector = nvectorize.transform(test_news_list)

gbpredictions = gbmodel.predict(ntest_vector.toarray())
gbaccuracy = accuracy_score(test_news['Label'], gbpredictions)

In [290]:
print("CONFUSION MATRIX OF THE GRADIANT BOOSTING")
print(confusion_matrix(test_news['Label'], gbpredictions))

print("Gradient boosting accuracy: ", gbaccuracy)

CONFUSION MATRIX OF THE GRADIANT BOOSTING
[[343   1]
 [378   2]]
Gradient boosting accuracy:  0.47651933701657456


### Trigram

In [291]:
n3vectorize = TfidfVectorizer(min_df=0.0001, max_df=0.9, ngram_range=(3,3))
news_n3vector = n3vectorize.fit_transform(train_news_list)
print(news_n3vector.shape)

n3model = lr.fit(news_n3vector, train_news["Label"])

test_news_list = []
for row in range(0, len(test_news.index)):
    
#     print("P" ,' '.join(str(x) for x in test_news.iloc[row,12:num_column]) )
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:num_column]))
n3test_vector = n3vectorize.transform(test_news_list)

n3predictions = n3model.predict(n3test_vector)

pd.crosstab(test_news["Label"], n3predictions, rownames=["Actual"], colnames=["Predicted"])

accuracy3 = accuracy_score(test_news['Label'], n3predictions)
print("TRIGRAM ACCURACY", accuracy3)

n3words = n3vectorize.get_feature_names()
n3coefficients = n3model.coef_.tolist()[0]
n3coeffdf = pd.DataFrame({'Word':n3words, 'Coefficient':n3coefficients})
n3coeffdf = n3coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0,1])
print("trigram top ten word distribution", n3coeffdf.head(10))
print("trigarm last ten word distribution", n3coeffdf.tail(10))

(8, 21)
TRIGRAM ACCURACY 0.5248618784530387
trigram top ten word distribution                       Word  Coefficient
4   changing country codes     0.146528
6        codes with google     0.146528
8       country codes with     0.146528
10          google nan nan     0.146528
19         with google nan     0.146528
1       albums with google     0.133761
11       google photos nan     0.133761
14      online albums with     0.133761
15          photos nan nan     0.133761
16   sharing online albums     0.133761
trigarm last ten word distribution                           Word  Coefficient
0   against white conservative     0.103611
2         author sues claiming     0.103611
3           bias against white     0.103611
5        claiming bias against     0.103611
7         conservative men nan     0.103611
9           google memo author     0.103611
12            memo author sues     0.103611
13                 men nan nan     0.103611
17          sues claiming bias     0.103611
18     

### XGBoost

In [None]:
train_sentiment = train_news
test_sentiment = test_news
# train_sentiment = train_sentiment.drop(['Date', 'Label'], axis=1)
train_sentiment = train_sentiment.drop(['Date', 'Label', '1', '2', '3', '4', '5', '6', '7', '8', '9','10'], axis=1)
for column in train_sentiment:
    train_sentiment[column] = train_sentiment[column].apply(analize_sentiment)
train_sentiment = train_sentiment + 10

# test_sentiment = test_sentiment.drop(['Date', 'Label'], axis=1)
test_sentiment = test_sentiment.drop(['Date', 'Label', '1', '2', '3', '4', '5', '6', '7', '8', '9','10'], axis=1)
for column in test_sentiment:
    test_sentiment[column] = test_sentiment[column].apply(analize_sentiment)
test_sentiment = test_sentiment + 10

XGB_model = XGBClassifier()
gradiant = XGB_model.fit(train_sentiment, train_news['Label'])
y_pred = gradiant.predict(test_sentiment)

In [None]:
print(confusion_matrix(test_news['Label'], y_pred))
print("Sentiment Accuracy", accuracy_score(test_news['Label'], y_pred))
print("f1_score", f1_score(test_news['Label'], y_pred, average='weighted'))
print(y_pred)

### All scores are printed out for comparison

In [None]:
print("Base model", accuracy1)
print("Logistic Regression",accuracy2)
print("Random Forest", accuracyrf)
print("Naive Bayes", nbaccuracy)
print("Gradient Boost", gbaccuracy)
print("Trigram", accuracy3)
print("Sentiment Accuracy", accuracy_score(test_news['Label'], y_pred))

### Attempted to plot predictions

In [None]:
# import matplotlib.pyplot as plt

# x = test_news['Date']

# plt.plot(x, test_news['Label'], label='Actual')
# plt.plot(x, y_pred, 'bo', label='XGB')
# #plt.plot(x, n3predictions, 'bo', label='Trigram')
# #plt.plot(x, gbpredictions, 'bo', label='Gradient Boost')
# #plt.plot(x, nbpredictions, 'bo', label='Naive Bayes')
# #plt.plot(x, rfpredictions, 'bo', label='Random Forest')
# #plt.plot(x, npredictions, 'bo', label='Logistic Regression')
# #plt.plot(x, predictions, 'bo', label='Base')
# plt.xticks(rotation=90)
# plt.legend()
# plt.show()