In [1]:
import os
import pandas as pd
import numpy as np

# Loading Dataset

In [2]:
# open the file at specific path
# prepare the review data
def get_reviews(path, file):
    """Read a review file and return one token list per line.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    file : str
        File name inside ``path``; it is read as UTF-8 text.

    Returns
    -------
    list[list[str]]
        One inner list per line of the file. Each line is split on single
        space characters, every piece is stripped of surrounding whitespace,
        and pieces that are empty after stripping are dropped. A blank line
        therefore yields an empty list rather than being skipped.
    """
    review_path = os.path.join(path, file)
    with open(review_path, 'r', encoding='utf-8') as handle:
        return [
            [piece.strip() for piece in line.split(' ') if piece.strip()]
            for line in handle
        ]

# Add labels to the reviews and prepare the training data: X (reviews), y (labels)
def get_training_data():
    """Load the labelled review corpus from ``./data``.

    Reviews from ``positive.txt`` are labelled 1 and reviews from
    ``negative.txt`` are labelled 0. Positive reviews come first in the
    returned lists, followed by the negative ones.

    Returns
    -------
    tuple[list, list]
        ``(reviews, sents)``: the tokenised reviews as returned by
        ``get_reviews`` and the parallel list of integer labels.
    """
    labelled_sources = (('positive.txt', 1), ('negative.txt', 0))

    reviews, sents = [], []
    for filename, label in labelled_sources:
        batch = get_reviews('./data', filename)
        reviews.extend(batch)
        sents.extend([label] * len(batch))

    return reviews, sents

# Evaluating Models

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn import naive_bayes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
from sklearn.utils import shuffle

In [4]:
def simple_return(f):
    """Return the argument unchanged (identity function).

    It is passed to ``TfidfVectorizer`` as both ``tokenizer`` and
    ``preprocessor`` in this notebook, so documents that are already lists
    of tokens go through the vectorizer without being re-processed.
    """
    return f

def KFold_validation(clf, X, y):
    """Score a classifier with shuffled 5-fold cross-validation.

    For every fold a fresh TF-IDF vectorizer is fitted on the training
    documents only, so the vocabulary and IDF weights never see the
    held-out documents. ``clf`` is re-fitted in place on each fold.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X : sequence
        Documents, indexable by integer position. They are handed to the
        vectorizer as-is, because tokenizer and preprocessor are identity
        functions.
    y : sequence
        Labels parallel to ``X``; label 1 is treated as the positive class.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``, each averaged over the folds.
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=30)

    # One (accuracy, precision, recall, f1) tuple per fold
    fold_scores = []

    for train_idx, test_idx in splitter.split(X):
        train_docs = [X[i] for i in train_idx]
        test_docs = [X[i] for i in test_idx]
        train_labels = [y[i] for i in train_idx]
        test_labels = [y[i] for i in test_idx]

        # TF-IDF features learned from the training part of the fold only
        tfidf = TfidfVectorizer(analyzer='word',
                                tokenizer=simple_return,
                                preprocessor=simple_return,
                                token_pattern=None)
        train_matrix = tfidf.fit(train_docs).transform(train_docs)
        test_matrix = tfidf.transform(test_docs)

        # Train on this fold and predict the held-out documents
        clf.fit(train_matrix, train_labels)
        predicted = clf.predict(test_matrix)

        fold_scores.append((
            metrics.accuracy_score(test_labels, predicted),
            metrics.precision_score(test_labels, predicted, pos_label=1),
            metrics.recall_score(test_labels, predicted, pos_label=1),
            metrics.f1_score(test_labels, predicted, pos_label=1),
        ))

    # Regroup the per-fold tuples into one sequence per metric and average
    accs, precisions, recalls, f1s = zip(*fold_scores)
    return (np.mean(accs), np.mean(precisions), np.mean(recalls), np.mean(f1s))

In [5]:
# Evaluate different models, return a DataFrame containing the performance metrics
def evaluate_models(clfs):
    """Cross-validate several classifiers and tabulate their scores.

    Parameters
    ----------
    clfs : iterable of (str, estimator)
        ``(name, classifier)`` pairs. Each classifier is evaluated with
        ``KFold_validation`` on the data from ``get_training_data``.

    Returns
    -------
    pandas.DataFrame
        One column per classifier name and one row per metric
        (``accuracy``, ``precision``, ``recall``, ``f1_score``). The
        columns axis is named ``metrics``.
    """
    X, y = get_training_data()

    cols = ['metrics', 'accuracy', 'precision', 'recall', 'f1_score']
    scores = [[name, *KFold_validation(clf, X, y)] for name, clf in clfs]

    # Index by model name *before* transposing: the score columns then stay
    # numeric, so no object-dtype round trip through pd.to_numeric is needed
    # (its errors='ignore' option is deprecated in recent pandas releases).
    return pd.DataFrame(scores, columns=cols).set_index('metrics').T

In [6]:
# Get model performances
classifiers = [('LinearSVC', svm.LinearSVC()),
               ('LogisticReg', LogisticRegression()),
               ('SGD', SGDClassifier()),
               ('MultinomialNB', naive_bayes.MultinomialNB()),
               ('KNN', KNeighborsClassifier()),
               ('DecisionTree', DecisionTreeClassifier()),
               ('RandomForest', RandomForestClassifier()),
               # The weak learner is passed positionally: its keyword was
               # renamed from `base_estimator` to `estimator` in scikit-learn
               # 1.2 and the old keyword was removed in 1.4, whereas the first
               # positional parameter works on both old and new releases.
               ('AdaBoost', AdaBoostClassifier(LogisticRegression()))]

evaluate_models(classifiers)

metrics,LinearSVC,LogisticReg,SGD,MultinomialNB,KNN,DecisionTree,RandomForest,AdaBoost
accuracy,0.88333,0.882462,0.883873,0.877469,0.817344,0.800196,0.845019,0.826463
precision,0.884507,0.88143,0.887066,0.883472,0.809008,0.818643,0.867357,0.827792
recall,0.88201,0.883984,0.879838,0.869941,0.831042,0.77123,0.814599,0.842726
f1_score,0.883209,0.88267,0.8834,0.876548,0.819673,0.794202,0.84014,0.828838


# Apply Bagging Algorithm

In [7]:
# Keep only the five best-performing models from the evaluation above
# and save their combined predictions to a separate file
def predict_with_bagging(clfs):
    """Label the stock comments by majority vote over several classifiers.

    Every classifier is trained on the full labelled corpus (TF-IDF
    features) and predicts a 0/1 label for each comment title in
    ``./data/stock_comments_seg.csv``. A comment is labelled 1 when a
    strict majority of the classifiers predict 1, otherwise 0.

    Parameters
    ----------
    clfs : iterable of (str, estimator)
        ``(name, classifier)`` pairs. Each classifier's own predictions are
        stored in a column named ``name``.

    Returns
    -------
    pandas.DataFrame
        The comments (rows containing missing values removed, index reset)
        with ``created_time`` parsed as datetime, one prediction column per
        classifier and the combined vote in ``preds``. The same frame is
        written to ``generated_data/stock_comments_predicted.csv``.
    """
    # Materialise once so the majority threshold can use the ensemble size
    clfs = list(clfs)

    # Loading training data
    X, y = get_training_data()

    # Use TF-IDF to vectorize the reviews (documents are already token lists)
    vectorizer = TfidfVectorizer(analyzer = 'word',
                                 tokenizer = simple_return,
                                 preprocessor = simple_return,
                                 token_pattern = None)
    X = vectorizer.fit_transform(X)

    # Loading review file
    df = pd.read_csv('./data/stock_comments_seg.csv')
    df = df.dropna().reset_index(drop = True)
    # Reformat the data
    df['created_time'] = pd.to_datetime(df['created_time'], format = '%Y-%m-%d %H:%M:%S')

    # Split each title into tokens. The result must actually be used: the
    # vectorizer's tokenizer is the identity function, so feeding it the raw
    # string would make it iterate over single characters instead of words.
    # The 'title' column itself is left untouched so the CSV keeps the text.
    tokens = df['title'].apply(lambda x: [w.strip() for w in x.split()])
    reviews = vectorizer.transform(tokens)

    # Accumulate the number of positive votes per comment
    df['preds'] = 0
    for name, clf in clfs:
        clf.fit(X, y)
        df[name] = clf.predict(reviews)
        df['preds'] = df['preds'] + df[name]

    # Strict majority vote; with five classifiers this means at least 3 votes
    df['preds'] = (df['preds'] * 2 > len(clfs)).astype('int64')

    # Save the predictions, creating the output directory when it is missing
    out_path = 'generated_data/stock_comments_predicted.csv'
    os.makedirs(os.path.dirname(out_path), exist_ok = True)
    df.to_csv(out_path, index = False)
    print("Prediction has saved to file")

    return df

In [8]:
# Models whose votes are combined by predict_with_bagging
classifiers = [
    ('LinearSVC', svm.LinearSVC()),
    ('LogisticReg', LogisticRegression()),
    ('SGD', SGDClassifier()),
    ('MultinomialNB', naive_bayes.MultinomialNB()),
    ('RandomForest', RandomForestClassifier()),
]

prediction = predict_with_bagging(classifiers)

Prediction has saved to file


# Compute BI-index

In [9]:
def get_bi(row):
    """Compute the BI score of a group of 0/1 predictions.

    BI = ln((1 + P) / (1 + N)), where P is the number of entries equal to 1
    and N the number of entries equal to 0. The added ones keep the ratio
    finite when either count is zero; an empty group scores 0.

    Parameters
    ----------
    row : pandas.Series
        Predicted labels of one group.

    Returns
    -------
    float
        Positive when 1s outnumber 0s, negative in the opposite case.
    """
    positives = (row == 1).sum()
    negatives = (row == 0).sum()
    return np.log((positives + 1) / (negatives + 1))

In [10]:
# Group the ensemble predictions by calendar day
grouped = prediction['preds'].groupby(prediction.created_time.dt.date)
# Generate the BI score of each day
bi_index = grouped.apply(get_bi).rename("BI")
# Load the Shanghai Index quotes, indexed by date
quotes = pd.read_csv('./data/sh000001.csv', parse_dates = ['date'])
quotes = quotes.set_index('date')
# Merge sentiment with the quotes; the group keys are datetime.date objects,
# so convert them to timestamps to align with the quotes' index
bi_index.index = pd.to_datetime(bi_index.index)
merged = pd.merge(bi_index, quotes, how = 'left', left_index = True, right_index = True)
# Days that have comments but no row in the quotes file get NaN from the left
# join; carry the last available quote forward. `.ffill()` replaces
# `fillna(method='ffill')`, which is deprecated in recent pandas releases.
merged = merged.ffill()
# Save the bi index
merged.to_csv('generated_data/bi_idx.csv')