In [1]:
import os
from time import time
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn import svm
from sklearn import naive_bayes
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.utils import shuffle

# Loading Dataset

In [2]:
# CSV of stock comments; read by predict_with_bagging, which uses its
# 'title' and 'created_time' columns.
comment_file = './data/stock_comments_seg.csv'
# Directory holding the sentiment corpora used as training data.
data_path = './data'
# Corpus file names, joined onto data_path by load_dataset_tokenized.
# Sentences from pos_corpus are labelled 1, sentences from neg_corpus 0.
pos_corpus = 'positive.txt'
neg_corpus = 'negative.txt'

def _read_tokenized_corpus(path):
    """Read one corpus file into a list of token lists (one list per line)."""
    with open(path, 'r', encoding='utf-8') as f:
        # Tokens are separated by single spaces; drop empty/whitespace-only
        # pieces (double spaces, the trailing newline) and strip the rest.
        return [[t.strip() for t in line.split(' ') if t.strip()] for line in f]


# Concatenate the positive and negative corpora and return them together with
# the corresponding labels
def load_dataset_tokenized():
    """Load the positive and negative corpora as token lists with labels.

    Reads `pos_corpus` and `neg_corpus` from `data_path`. Each line of a file
    is one sentence whose tokens are separated by spaces.

    Returns:
        (texts, labels): `texts` is a list of token lists, positives first and
        negatives second; `labels` is a parallel list with 1 for positive and
        0 for negative sentences. Both classes are truncated to the size of
        the smaller corpus, so `len(texts) == len(labels)` always holds.
    """
    pos_sents = _read_tokenized_corpus(os.path.join(data_path, pos_corpus))
    neg_sents = _read_tokenized_corpus(os.path.join(data_path, neg_corpus))

    # Ensure the same number of pos and neg reviews. The sentence lists must be
    # truncated as well as the labels; otherwise texts and labels go out of
    # step as soon as the two corpora differ in size.
    balance_len = min(len(pos_sents), len(neg_sents))

    texts = pos_sents[:balance_len] + neg_sents[:balance_len]
    labels = [1] * balance_len + [0] * balance_len

    return texts, labels

# Evaluating Models

In [3]:
def dummy_fun(doc):
    """Identity function: hand `doc` back untouched.

    Passed to TfidfVectorizer as both `tokenizer` and `preprocessor` so that
    documents which are already lists of tokens are used as they are.
    """
    return doc
    
def KFold_validation(clf, X, y):
    """Score `clf` with shuffled 5-fold cross-validation on tokenized text.

    For every fold a fresh TF-IDF vectorizer is fitted on the training
    documents only, `clf` is fitted on the resulting matrix and then scored on
    the held-out documents.

    Args:
        clf: estimator exposing `fit(X, y)` and `predict(X)`.
        X: indexable sequence of documents, each a list of tokens.
        y: indexable sequence of labels (1 = positive, 0 = negative).

    Returns:
        Tuple of fold-averaged scores in this order: accuracy, then
        precision / recall / F1 for label 1, then precision / recall / F1
        for label 0.
    """
    scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)
    accuracy_per_fold = []
    # per_class[label] -> (precision values, recall values, f1 values),
    # one entry per fold; label 1 is listed first to match the return order.
    per_class = {1: ([], [], []), 0: ([], [], [])}

    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    for train_idx, test_idx in folds.split(X):
        docs_train = [X[i] for i in train_idx]
        docs_test = [X[i] for i in test_idx]
        labels_train = [y[i] for i in train_idx]
        labels_test = [y[i] for i in test_idx]

        # Documents are already token lists, hence the identity tokenizer
        # and preprocessor.
        tfidf = TfidfVectorizer(analyzer='word',
                                tokenizer=dummy_fun,
                                preprocessor=dummy_fun,
                                token_pattern=None)

        # Fit on the training fold only, then vectorize both folds.
        tfidf.fit(docs_train)
        features_train = tfidf.transform(docs_train)
        features_test = tfidf.transform(docs_test)

        clf.fit(features_train, labels_train)
        predicted = clf.predict(features_test)

        accuracy_per_fold.append(metrics.accuracy_score(labels_test, predicted))
        for label, buckets in per_class.items():
            for scorer, bucket in zip(scorers, buckets):
                bucket.append(scorer(labels_test, predicted, pos_label=label))

    pos_means = tuple(np.mean(bucket) for bucket in per_class[1])
    neg_means = tuple(np.mean(bucket) for bucket in per_class[0])
    return (np.mean(accuracy_per_fold),) + pos_means + neg_means

In [4]:
# Evaluate different models, return a DataFrame containing the performance metrics
def evaluate_models():
    """Cross-validate a set of classifiers on the sentiment corpus.

    Loads the tokenized corpus with `load_dataset_tokenized` and scores each
    classifier with `KFold_validation`.

    Returns:
        DataFrame with one row per metric (accuracy, pos/neg precision,
        recall and F1) and one column per classifier; the columns axis is
        named 'metrics' and all values are floats.
    """
    X, y = load_dataset_tokenized()

    cols = ['metrics', 'accuracy',  'pos_precision', 'pos_recall', 'pos_f1_score', 'neg_precision', 'neg_recall', 'neg_f1_score']

    # scikit-learn renamed AdaBoost's `base_estimator` keyword to `estimator`
    # (deprecated in 1.2, removed in 1.4). Try the current name first and fall
    # back to the old one so the notebook runs on both old and new releases.
    try:
        ada_boost = AdaBoostClassifier(estimator=LogisticRegression())
    except TypeError:
        ada_boost = AdaBoostClassifier(base_estimator=LogisticRegression())

    classifiers = [
        ('LinearSVC', svm.LinearSVC()),
        ('LogisticReg', LogisticRegression()),
        ('SGD', SGDClassifier()),
        ('MultinomialNB', naive_bayes.MultinomialNB()),
        ('KNN', KNeighborsClassifier()),
        ('DecisionTree', DecisionTreeClassifier()),
        ('RandomForest', RandomForestClassifier()),
        ('AdaBoost', ada_boost)]

    # One record per classifier: [name, accuracy, pos_precision, ...]
    scores = [[name, *KFold_validation(clf, X, y)] for name, clf in classifiers]

    # Index by classifier name *before* transposing: the remaining columns are
    # all numeric, so the transposed frame is float already and needs no
    # pd.to_numeric(errors='ignore') pass (deprecated since pandas 2.2).
    df = pd.DataFrame(scores, columns=cols).set_index('metrics').T

    return df

In [5]:
# Get model performances: run the 5-fold evaluation for every classifier and
# show the resulting metric-by-model table as this cell's output.
evaluate_models()

metrics,LinearSVC,LogisticReg,SGD,MultinomialNB,KNN,DecisionTree,RandomForest,AdaBoost
accuracy,0.881593,0.880834,0.882895,0.87964,0.820924,0.794227,0.8435,0.771659
pos_precision,0.880544,0.879024,0.884003,0.882137,0.80825,0.810372,0.866348,0.797129
pos_recall,0.882481,0.882941,0.880842,0.876015,0.840896,0.767061,0.812089,0.79865
pos_f1_score,0.881483,0.880898,0.882393,0.879043,0.824202,0.788035,0.838269,0.765202
neg_precision,0.882443,0.882578,0.881674,0.876666,0.834168,0.779216,0.822977,0.825027
neg_recall,0.880121,0.87821,0.88422,0.883188,0.800378,0.821041,0.874529,0.759168
neg_f1_score,0.881251,0.880306,0.882916,0.87989,0.816867,0.7995,0.847906,0.766304


# Apply Bagging Algorithm

In [6]:
# Only keep the first 7 models (the best performers in the evaluation above)
# and save the prediction to another file
def predict_with_bagging():
    """Label every stock comment by majority vote of several classifiers.

    Each classifier is trained on the full sentiment corpus (TF-IDF features)
    and predicts a 0/1 label for every comment title in `comment_file`. The
    final label `preds` is 1 when more than half of the classifiers voted 1.
    Note that this is a plain majority vote over different model types rather
    than bootstrap aggregation.

    Side effects:
        Writes the comments plus one column per classifier and the `preds`
        column to 'generated_data/stock_comments_predicted.csv' (the directory
        is created when missing) and prints a confirmation message.

    Returns:
        The DataFrame that was written to disk.
    """
    # Loading training data
    X, y = load_dataset_tokenized()
    vectorizer = TfidfVectorizer(analyzer='word',
                                 tokenizer=dummy_fun,
                                 preprocessor=dummy_fun,
                                 token_pattern=None)

    X = vectorizer.fit_transform(X)

    # Loading review file
    df = pd.read_csv(comment_file)
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    # Reformat the data
    df['created_time'] = pd.to_datetime(df['created_time'], format='%Y-%m-%d %H:%M:%S')

    # Split each review into its words. The vectorizer uses an identity
    # tokenizer, so it must be given token lists: a raw string would be
    # iterated character by character and would not match the word vocabulary
    # learned from the corpus. The 'title' column itself is left untouched so
    # the saved CSV keeps the original text.
    tokenized_titles = [str(title).split() for title in df['title']]
    texts = vectorizer.transform(tokenized_titles)

    # initialize the output
    df['preds'] = 0

    classifiers = [
        ('LinearSVC', svm.LinearSVC()),
        ('LogisticReg', LogisticRegression()),
        ('SGD', SGDClassifier()),
        ('MultinomialNB', naive_bayes.MultinomialNB()),
        ('KNN', KNeighborsClassifier()),
        ('DecisionTree', DecisionTreeClassifier()),
        ('RandomForest', RandomForestClassifier())]

    for name, clf in classifiers:
        clf.fit(X, y)
        df[name] = clf.predict(texts)
        # Accumulate the number of classifiers voting "positive"
        df['preds'] = df['preds'] + df[name]

    # Strict majority: 4 of 7 with the list above. Derived from the list
    # length so that adding or removing a model keeps the vote correct.
    majority = len(classifiers) // 2 + 1
    df['preds'] = df['preds'].apply(lambda votes: 1 if votes >= majority else 0)

    # generate the prediction
    output_file = 'generated_data/stock_comments_predicted.csv'
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    df.to_csv(output_file, index=False)
    print("Prediction has saved to file")

    return df

In [7]:
prediction = predict_with_bagging()

Prediction has saved to file


# Compute BI-index

In [8]:
def get_bi(row):
    """Return the BI score ln((1 + #positive) / (1 + #negative)).

    `row` is a pandas Series of predicted labels; entries equal to 1 count as
    positive and entries equal to 0 as negative. The +1 terms keep the ratio
    defined when one of the classes is absent.
    """
    n_positive = (row == 1).sum()
    n_negative = (row == 0).sum()
    return np.log((1 + n_positive) / (1 + n_negative))

In [9]:
# Group the predicted labels by the calendar date of each review
grouped = prediction['preds'].groupby(prediction.created_time.dt.date)
# Generate the BI score of each day
bi_index = grouped.apply(get_bi)
bi_index = bi_index.rename("BI")
# Load the Shanghai Index quotes, indexed by date
quotes = pd.read_csv('./data/sh000001.csv', parse_dates=['date'])
quotes.set_index('date', inplace=True)
# Merge sentiment with the quotes. The left join keeps every date that has
# reviews, including dates that are missing from the quotes file.
bi_index.index = pd.to_datetime(bi_index.index)
merged = pd.merge(bi_index, quotes, how='left', left_index=True, right_index=True)
# Carry the last available values forward onto rows that have gaps.
# `.ffill()` replaces `fillna(method='ffill')`, deprecated since pandas 2.1.
merged = merged.ffill()
# Save the bi index
merged.to_csv('generated_data/bi_idx.csv')