### Get the packages

In [253]:
import pandas as pd
import copy
import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob
from datetime import datetime
from xgboost import XGBClassifier

In [254]:
def analize_sentiment(tweet):
    analysis = TextBlob((str(tweet)))
    return analysis.polarity

### Get the path for the CSV and put it in here 

In [255]:
news = pd.read_csv('J-F-N-input.csv')

### Get the train news and test news datasets

In [256]:
def get_train_news(day, month, year):
    index = 0
    dataset = []
    d1 = datetime(year, month, day).date() 
    for date in news['Date']:
        d = datetime.strptime(date, '%Y-%m-%d').date()
        if d1 > d:
            dataset.append(news.iloc[index])
        index = index +1
        df = pd.DataFrame(dataset)
    return df

In [257]:
def get_test_news(day, month, year):
    index = 0
    dataset = []
    d1 = datetime(year, month, day).date() 
    for date in news['Date']:
        d = datetime.strptime(date, '%Y-%m-%d').date()
        if d1 < d:
            dataset.append(news.iloc[index])
        index = index +1
        df = pd.DataFrame(dataset)
    return df

In [258]:
num_column = (len(news.columns))
# day, month, year
train_news = get_train_news(8, 2, 2019)
test_news = get_test_news(7, 2, 2019)

In [259]:
train_news_list = []
for row in range (0, len(train_news.index)):
    train_news_list.append(' '.join(str(k) for k in train_news.iloc[row,12:num_column]))

In [260]:
vectorize = CountVectorizer(min_df=0.01, max_df=0.8)
news_vector = vectorize.fit_transform(train_news_list)

In [261]:
print("THE TABLE OF FREQUENCY WORD DISTRIBUTION", news_vector.shape)

THE TABLE OF FREQUENCY WORD DISTRIBUTION (26, 43)


### Base Model: Logisitic Regression

In [262]:
lr = LogisticRegression()
model = lr.fit(news_vector, train_news["Label"])

In [263]:
test_news_list = []
for row in range(0, len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:num_column]))

In [264]:
test_vector = vectorize.transform(test_news_list)

In [265]:
predictions = model.predict(test_vector)
pd.crosstab(test_news["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,0
Actual,Unnamed: 1_level_1
0,6
1,7


In [266]:
accuracy1=accuracy_score(test_news["Label"], predictions)
print("the baseline model accuracy", accuracy1)

the baseline model accuracy 0.46153846153846156


In [267]:
words = vectorize.get_feature_names()
coefficients = model.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Word' : words, 'Coefficient' : coefficients})
coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0,1])
print("Top ten words according to the baseline model", coeffdf.head(10))
print("Last ten words according to the baseline model", coeffdf.tail(10))

Top ten words according to the baseline model         Word  Coefficient
14      have     0.222310
17      made     0.222310
20     money     0.222310
21      much     0.222310
37     today     0.222310
42       you     0.222310
6     demise     0.208729
18      maps     0.208729
19  marriage     0.208729
23        my     0.208729
Last ten words according to the baseline model          Word  Coefficient
35       tech    -0.171197
40       week    -0.171197
0     accused    -0.189059
2       board    -0.189059
9   executive    -0.189059
10       exit    -0.189059
27       over    -0.189059
28    package    -0.189059
33       sued    -0.189059
1         and    -0.301376


### Random Forest

In [268]:
nvectorize = TfidfVectorizer(min_df=0.01, max_df=0.95, ngram_range=(2,2))
news_nvector = nvectorize.fit_transform(train_news_list)

rfmodel = RandomForestClassifier(random_state = 100, criterion='entropy', max_depth=None, n_estimators=125)
rfmodel = rfmodel.fit(news_nvector, train_news["Label"])
test_news_list = []
for row in range(0, len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:num_column]))
ntest_vector = nvectorize.transform(test_news_list)

rfpredictions = rfmodel.predict(ntest_vector)
accuracyrf = accuracy_score(test_news["Label"], rfpredictions)
print("Random forest with tfid and bigram", accuracyrf)

Random forest with tfid and bigram 0.46153846153846156


### XG Boost/Sentiment Analysis

In [269]:
train_sentiment = copy.deepcopy(train_news)
test_sentiment = copy.deepcopy(test_news)
train_news2 = copy.deepcopy(train_news)
test_news2 = copy.deepcopy(test_news)

train_sentiment = train_sentiment.drop(['Date', 'Label', '1', '2', '3', '4', '5', '6', '7', '8', '9','10'], axis=1)
for column in train_sentiment:
    train_sentiment[column] = train_sentiment[column].apply(analize_sentiment)
    train_news2[column] = train_sentiment[column] + 10
train_sentiment = train_sentiment + 10

test_sentiment = test_sentiment.drop(['Date', 'Label', '1', '2', '3', '4', '5', '6', '7', '8', '9','10'], axis=1)
for column in test_sentiment:
    test_sentiment[column] = test_sentiment[column].apply(analize_sentiment)
    test_news2[column] = test_sentiment[column] + 10  
test_sentiment = test_sentiment + 10

XGB_model = XGBClassifier(random_state=100, criterion='entropy')
gradiant = XGB_model.fit(train_sentiment, train_news['Label'])
y_pred = gradiant.predict(test_sentiment)

Parameters: { criterion } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.






In [270]:
print(confusion_matrix(test_news['Label'], y_pred))
print("Sentiment Accuracy", accuracy_score(test_news['Label'], y_pred))
print("f1_score", f1_score(test_news['Label'], y_pred, average='weighted'))
#print(y_pred)

[[6 0]
 [7 0]]
Sentiment Accuracy 0.46153846153846156
f1_score 0.291497975708502


In [271]:
# Model 2 - includes trends

In [272]:
train_news2 = train_news2.drop(['Date','Label'], axis = 1)
test_news2 = test_news2.drop(['Date', 'Label'], axis = 1)

XGB_model2 = XGBClassifier()
gradiant2 = XGB_model2.fit(train_news2, train_news['Label'])
y_pred2 = gradiant2.predict(test_news2)


# print(confusion_matrix(test_news['Label'], y_pred))
print("Sentiment Accuracy with Trend", accuracy_score(test_news['Label'], y_pred2))
print("f1_score", f1_score(test_news['Label'], y_pred2, average='weighted'))
# print(y_pred)

Sentiment Accuracy with Trend 0.6153846153846154
f1_score 0.6153846153846153


### Weighted XGBoost (attempted to put weight towards headlines) Vanessa

In [273]:
train_sentiment_weight = train_news
test_sentiment_weight = test_news

train_sentiment_weight = train_sentiment_weight.drop(['Date', 'Label', '1', '2', '3', '4', '5', '6', '7', '8', '9','10'], axis=1)
for column in train_sentiment_weight:
    train_sentiment_weight[column] = train_sentiment_weight[column].apply(analize_sentiment)
train_sentiment_weight = train_sentiment_weight + 10

test_sentiment_weight = test_sentiment_weight.drop(['Date', 'Label', '1', '2', '3', '4', '5', '6', '7', '8', '9','10'], axis=1)
for column in test_sentiment_weight:
    test_sentiment_weight[column] = test_sentiment_weight[column].apply(analize_sentiment)
test_sentiment_weight = test_sentiment_weight + 10

for column in train_news:
    if not train_news[column].empty:
        empty_data = train_news[column]
    else:
        weighted_data = train_news[column]

weighted_XGB = XGBClassifier()
weighted_XGB.fit(train_sentiment_weight, train_news['Label'], sample_weight=weighted_data)
y_pred_weight = weighted_XGB.predict(test_sentiment_weight, ntree_limit=500)



In [274]:
print("Weighted Accuracy", accuracy_score(test_news['Label'], y_pred_weight))
print("F1 weighted", f1_score(test_news['Label'], y_pred_weight, average='weighted'))

Weighted Accuracy 0.46153846153846156
F1 weighted 0.291497975708502


# Weighted - using trends (Erika)

In [275]:
def get_train_weight(train_sentiment):
    train_weight = []

    for across in range(len(train_sentiment)):
        for i in range(10):
            train_weight.append(0.5)
        for x in train_sentiment:
            if x not in ['Date', 'Label', '1', '2','3','4','5','6','7','8','9', '10'] :
                if train_sentiment[x][across] == 10.0:
                    train_weight.append(0.5)
                else:
                    train_weight.append(0.8)
    return train_weight

In [276]:
def get_test_weight(train_sentiment, test_sentiment):

    test_weight = []
    j = len(train_sentiment)
    for across in range(len(test_sentiment)):
        for i in range(10):
            test_weight.append(0.5)
        for x in test_sentiment:
            if x not in ['Date', 'Label', '1', '2','3','4','5','6','7','8','9', '10'] :
                if test_sentiment[x][j] == 10.0:
                    test_weight.append(0.5)
                else:
                    test_weight.append(0.8)
        j = j + 1

In [277]:
train_weight = get_train_weight(train_sentiment)
test_weight = get_test_weight(train_sentiment, test_sentiment)

XGB_model4 = XGBClassifier()

num_round = 2
param = {'max_depth' : 2, 'eta': 1, 'objective':'binary:logistic' }
gradiant4 = XGB_model4.fit(train_news2, train_news['Label'], feature_weights = train_weight)
y_pred4 = gradiant4.predict(test_news2)

print("Sentiment Accuracy with Trend with weight", accuracy_score(test_news['Label'], y_pred4))
print("f1_score", f1_score(test_news['Label'], y_pred4, average='weighted'))


Sentiment Accuracy with Trend with weight 0.6153846153846154
f1_score 0.6153846153846153


In [278]:
print("Sentiment Weighted Accuracy 2", accuracy_score(test_news['Label'], y_pred_weight))
print("F1 weighted", f1_score(test_news['Label'], y_pred_weight, average='weighted'))

Sentiment Weighted Accuracy 2 0.46153846153846156
F1 weighted 0.291497975708502


In [279]:
## doesn't work- needs to be based on row numbers 

In [280]:
# dtrain = xgb.DMatrix( data = train_news2, label = train_news['Label'], weight = train_weight)
# dtest = xgb.DMatrix(data = test_news2, label = test_news['Label'], weight = test_weight)
# params  = dict(max_depth=2, eta=1, verbose=0, nthread=2, eval_metric = "auc",
#               objective="binary:logistic")

# m = xgb.train(params, dtrain)

# y_predictions = m.predict(dtest)

# i = 0
# y = []
# for p in y_predictions:
#     if p > 0.49:
#         y.append(1)
#     else:
#         y.append(0)
#     i = i + 1
# print("Sentiment Accuracy with Trend with weight", accuracy_score(test_news['Label'], y))
# print("f1_score", f1_score(test_news['Label'], y, average='weighted'))

### All scores are printed out for comparison

In [283]:
print("Base model", accuracy1)
print("Random Forest", accuracyrf)
print("Sentiment Accuracy", accuracy_score(test_news['Label'], y_pred))
print("Sentiment Accuracy with Trends", accuracy_score(test_news['Label'], y_pred2))
print("Weighted Accuracy (Vans)", accuracy_score(test_news['Label'], y_pred_weight))
print("Sentiment Accuracy with Trend with weight (Erika)", accuracy_score(test_news['Label'], y_pred4))

Base model 0.46153846153846156
Random Forest 0.46153846153846156
Sentiment Accuracy 0.46153846153846156
Sentiment Accuracy with Trends 0.6153846153846154
Weighted Accuracy (Vans) 0.46153846153846156
Sentiment Accuracy with Trend with weight (Erika) 0.6153846153846154
