### Get the packages

In [488]:
import pandas as pd
import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob
from datetime import datetime
from xgboost import XGBClassifier

In [489]:
def analize_sentiment(tweet):
    analysis = TextBlob((str(tweet)))
    return analysis.polarity

### Get the path for the CSV and put it in here

In [518]:
news = pd.read_csv('2018-2020_model_input.csv')

### Get the test news and training news dataset

In [491]:
def get_train_news(day, month, year):
    index = 0
    dataset = []
    d1 = datetime(year, month, day).date() 
    for date in news['Date']:
        d = datetime.strptime(date, '%Y-%m-%d').date()
        if d1 > d:
            dataset.append(news.iloc[index])
        index = index +1
        df = pd.DataFrame(dataset)
    return df

In [492]:
def get_test_news(day, month, year):
    index = 0
    dataset = []
    d1 = datetime(year, month, day).date() 
    for date in news['Date']:
        d = datetime.strptime(date, '%Y-%m-%d').date()
        if d1 < d:
            dataset.append(news.iloc[index])
        index = index +1
        df = pd.DataFrame(dataset)
    return df

In [493]:
num_column = (len(news.columns))
# day, month, year
train_news = get_train_news(15, 1, 2020)
test_news = get_test_news(14, 1, 2020)

In [494]:
train_news_list = []

In [495]:
for row in range (0, len(train_news.index)):
    train_news_list.append(' '.join(str(k) for k in train_news.iloc[row,12:num_column]))

### Word Frequency

In [496]:
vectorize = CountVectorizer(min_df=0.01, max_df=0.8)
news_vector = vectorize.fit_transform(train_news_list)

In [497]:
print("THE TABLE OF FREQUENCY WORD DISTRIBUTION", news_vector.shape)

THE TABLE OF FREQUENCY WORD DISTRIBUTION (511, 46)


### Base Model: Logistic Regression

In [498]:
lr = LogisticRegression()
model = lr.fit(news_vector, train_news["Label"])

In [499]:
test_news_list = []
for row in range(0, len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:num_column]))

In [500]:
test_vector = vectorize.transform(test_news_list)

In [501]:
predictions = model.predict(test_vector)
pd.crosstab(test_news["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,14,83
1,14,110


In [502]:
accuracy1=accuracy_score(test_news["Label"], predictions)
print("the baseline model accuracy", accuracy1)

the baseline model accuracy 0.5610859728506787


In [503]:
words = vectorize.get_feature_names()
coefficients = model.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Word' : words, 'Coefficient' : coefficients})

In [504]:
coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0,1])
print("Top ten words according to the baseline model", coeffdf.head(10))
print("Last ten words according to the baseline model", coeffdf.tail(10))

Top ten words according to the baseline model         Word  Coefficient
39       why     1.023201
10  children     0.666932
2         an     0.641347
42   workers     0.532837
17      from     0.514270
5         as     0.504116
1     amazon     0.460666
11     china     0.448186
33      that     0.422779
22        it     0.397774
Last ten words according to the baseline model         Word  Coefficient
24       new    -0.451398
6         be    -0.454282
0      after    -0.514281
44       you    -0.690286
3        and    -0.764320
15  facebook    -0.822030
13  dealbook    -0.825163
38        up    -0.832346
23       its    -0.886823
43      york    -1.023077


### Logistic Regression with Bigram and TFID

In [505]:
nvectorize = TfidfVectorizer(min_df=0.01, max_df=0.90, ngram_range=(2,2))
news_nvector = nvectorize.fit_transform(train_news_list)
print("TFID TRANSFORMATION DATAFRAME SHAPE", news_nvector.shape)
nmodel = lr.fit(news_nvector, train_news["Label"])
test_news_list = []
for row in range (0,len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:num_column]))
ntest_vector = nvectorize.transform(test_news_list)
npredictions = nmodel.predict(ntest_vector)
pd.crosstab(test_news["Label"], npredictions, rownames=["Actual"], colnames=["Predicted"])
accuracy2 = accuracy_score(test_news['Label'], npredictions)
print("Logistics Regression with Bigram and TFID", accuracy2)
nwords = nvectorize.get_feature_names()
ncoefficients = nmodel.coef_.tolist()[0]
ncoeffdf = pd.DataFrame({'Word':nwords, 'Coefficient':ncoefficients})
ncoeffdf = ncoeffdf.sort_values(['Coefficient', 'Word'], ascending=[0,1])
ncoeffdf.head(10)
ncoeffdf.tail(10)
print(ncoeffdf.head(10))
print(ncoeffdf.tail(10))

TFID TRANSFORMATION DATAFRAME SHAPE (511, 18)
Logistics Regression with Bigram and TFID 0.5565610859728507
               Word  Coefficient
13          in tech     1.045437
4      briefing nan     0.301139
10       google nan     0.266718
7        for google     0.263663
9         google is     0.088573
17    your thursday     0.043000
16       the google     0.036561
3   briefing google     0.030690
12      google your    -0.077724
6      facebook and    -0.202178
                 Word  Coefficient
12        google your    -0.077724
6        facebook and    -0.202178
11          google to    -0.231870
2   briefing dealbook    -0.233130
5   dealbook briefing    -0.565905
8          google and    -0.719672
14             in the    -0.784662
1          and google    -0.845605
0        and facebook    -1.053128
15           new york    -1.229554


### Random Forest

In [506]:
nvectorize = TfidfVectorizer(min_df=0.01, max_df=0.95, ngram_range=(2,2))
news_nvector = nvectorize.fit_transform(train_news_list)

rfmodel = RandomForestClassifier(random_state = 100, criterion='entropy')
rfmodel = rfmodel.fit(news_nvector, train_news["Label"])
test_news_list = []
for row in range(0, len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:num_column]))
ntest_vector = nvectorize.transform(test_news_list)

rfpredictions = rfmodel.predict(ntest_vector)
accuracyrf = accuracy_score(test_news["Label"], rfpredictions)
print("Random forest with tfid and bigram", accuracyrf)

Random forest with tfid and bigram 0.5656108597285068


### Naive Bayes

In [507]:
nvectorize = TfidfVectorizer(min_df=0.01, max_df=0.9,ngram_range=(2,2))
news_nvector = nvectorize.fit_transform(train_news_list)

nbmodel = MultinomialNB(alpha=0.5)
nbmodel = nbmodel.fit(news_nvector, train_news["Label"])

test_news_list = []
for row in range(0, len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:num_column]))
ntest_vector = nvectorize.transform(test_news_list)

nbpredictions = nbmodel.predict(ntest_vector)
nbaccuracy = accuracy_score(test_news['Label'], nbpredictions)
print("Naive Bayes accuracy: ", nbaccuracy)

Naive Bayes accuracy:  0.5565610859728507


### Gradient Boosting

In [508]:
gbmodel = GradientBoostingClassifier(random_state = 52)
gbmodel = gbmodel.fit(news_nvector, train_news["Label"])

test_news_list = []
for row in range(0, len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:num_column]))
ntest_vector = nvectorize.transform(test_news_list)

gbpredictions = gbmodel.predict(ntest_vector.toarray())
gbaccuracy = accuracy_score(test_news['Label'], gbpredictions)

In [509]:
print("CONFUSION MATRIX OF THE GRADIANT BOOSTING")
print(confusion_matrix(test_news['Label'], gbpredictions))

print("Gradient boosting accuracy: ", gbaccuracy)

CONFUSION MATRIX OF THE GRADIANT BOOSTING
[[  3  94]
 [  6 118]]
Gradient boosting accuracy:  0.5475113122171946


### Trigram

In [510]:
n3vectorize = TfidfVectorizer(min_df=0.0001, max_df=0.9, ngram_range=(3,3))
news_n3vector = n3vectorize.fit_transform(train_news_list)
print(news_n3vector.shape)

n3model = lr.fit(news_n3vector, train_news["Label"])

test_news_list = []
for row in range(0, len(test_news.index)):
    
#     print("P" ,' '.join(str(x) for x in test_news.iloc[row,12:num_column]) )
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:num_column]))
n3test_vector = n3vectorize.transform(test_news_list)

n3predictions = n3model.predict(n3test_vector)

pd.crosstab(test_news["Label"], n3predictions, rownames=["Actual"], colnames=["Predicted"])

accuracy3 = accuracy_score(test_news['Label'], n3predictions)
print("TRIGRAM ACCURACY", accuracy3)

n3words = n3vectorize.get_feature_names()
n3coefficients = n3model.coef_.tolist()[0]
n3coeffdf = pd.DataFrame({'Word':n3words, 'Coefficient':n3coefficients})
n3coeffdf = n3coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0,1])
print("trigram top ten word distribution", n3coeffdf.head(10))
print("trigarm last ten word distribution", n3coeffdf.tail(10))

(511, 2074)
TRIGRAM ACCURACY 0.5294117647058824
trigram top ten word distribution                          Word  Coefficient
53         against google nan     0.313074
2066  your wednesday briefing     0.273938
2060     your monday briefing     0.241318
1964             week in tech     0.237216
515         employees nan nan     0.236086
254      brexit your thursday     0.229267
730        google brexit your     0.229267
1880     turkey google brexit     0.229267
816            google nan nan     0.218454
431              data nan nan     0.212612
trigarm last ten word distribution                       Word  Coefficient
138     apple hires google    -0.205112
341          chief nan nan    -0.205112
741       google chief nan    -0.205112
952     hires google chief    -0.205112
106   and google translate    -0.209435
36             ads nan nan    -0.214331
23       accuses google of    -0.216035
849     google returns nan    -0.290073
1512       returns nan nan    -0.290073
712    goo

### XGBoost

In [511]:
train_sentiment = train_news
test_sentiment = test_news
# train_sentiment = train_sentiment.drop(['Date', 'Label'], axis=1)
train_sentiment = train_sentiment.drop(['Date', 'Label', '1', '2', '3', '4', '5', '6', '7', '8', '9','10'], axis=1)
for column in train_sentiment:
    train_sentiment[column] = train_sentiment[column].apply(analize_sentiment)
train_sentiment = train_sentiment + 10

# test_sentiment = test_sentiment.drop(['Date', 'Label'], axis=1)
test_sentiment = test_sentiment.drop(['Date', 'Label', '1', '2', '3', '4', '5', '6', '7', '8', '9','10'], axis=1)
for column in test_sentiment:
    test_sentiment[column] = test_sentiment[column].apply(analize_sentiment)
test_sentiment = test_sentiment + 10

XGB_model = XGBClassifier()
gradiant = XGB_model.fit(train_sentiment, train_news['Label'])
y_pred = gradiant.predict(test_sentiment)

In [512]:
print(confusion_matrix(test_news['Label'], y_pred))
print("Sentiment Accuracy", accuracy_score(test_news['Label'], y_pred))
print("f1_score", f1_score(test_news['Label'], y_pred, average='weighted'))
#print(y_pred)

[[  6  91]
 [  4 120]]
Sentiment Accuracy 0.5701357466063348
f1_score 0.4511960432960162


### All scores are printed out for comparison

In [513]:
print("Base model", accuracy1)
print("Logistic Regression",accuracy2)
print("Random Forest", accuracyrf)
print("Naive Bayes", nbaccuracy)
print("Gradient Boost", gbaccuracy)
print("Trigram", accuracy3)
print("Sentiment Accuracy", accuracy_score(test_news['Label'], y_pred))

Base model 0.5610859728506787
Logistic Regression 0.5565610859728507
Random Forest 0.5656108597285068
Naive Bayes 0.5565610859728507
Gradient Boost 0.5475113122171946
Trigram 0.5294117647058824
Sentiment Accuracy 0.5701357466063348


### Attempted to plot predictions

In [514]:
# import matplotlib.pyplot as plt

# x = test_news['Date']

# plt.plot(x, test_news['Label'], label='Actual')
# plt.plot(x, y_pred, 'bo', label='XGB')
# #plt.plot(x, n3predictions, 'bo', label='Trigram')
# #plt.plot(x, gbpredictions, 'bo', label='Gradient Boost')
# #plt.plot(x, nbpredictions, 'bo', label='Naive Bayes')
# #plt.plot(x, rfpredictions, 'bo', label='Random Forest')
# #plt.plot(x, npredictions, 'bo', label='Logistic Regression')
# #plt.plot(x, predictions, 'bo', label='Base')
# plt.xticks(rotation=90)
# plt.legend()
# plt.show()