In [1]:
# Code reference: https://github.com/algosenses/Stock_Market_Sentiment_Analysis/blob/master/model_ml.py
# This modified version also returns the fitted models so they can be fine-tuned afterwards.

import os
import pandas as pd

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.utils.extmath import density
from sklearn import svm
from sklearn import naive_bayes
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.utils import shuffle
import numpy as np


from config import *
def load_dataset():
    """Build a class-balanced DataFrame from the positive and negative corpora.

    Each line of the two corpus files (located via ``data_path``,
    ``pos_corpus`` and ``neg_corpus``) becomes one row. Both classes are cut
    down to the size of the smaller one.

    Returns:
        pandas.DataFrame with columns ``text_seg`` (the line without its
        newline) and ``polarity`` (1 for positive, 0 for negative), positive
        rows first, with a fresh 0..n-1 index.
    """
    def _read_lines(path):
        # One entry per line, newline characters removed.
        with open(path, 'r', encoding='utf-8') as handle:
            return [raw.replace('\n', '') for raw in handle]

    positives = _read_lines(os.path.join(data_path, pos_corpus))
    negatives = _read_lines(os.path.join(data_path, neg_corpus))

    # Keep the same number of rows for each class.
    keep = min(len(positives), len(negatives))

    frames = []
    for sentences, label in ((positives, 1), (negatives, 0)):
        frame = pd.DataFrame(sentences, columns=['text_seg'])
        frame['polarity'] = label
        frames.append(frame[:keep])

    return pd.concat(frames).reset_index(drop=True)


# Build the balanced sentence/polarity DataFrame once and report how many
# rows it contains (positive + negative after balancing).
print('Loading dataset...')

dataset = load_dataset()

print('Dataset size ', len(dataset))



def _read_tokenized_corpus(path):
    """Read a corpus file and return one list of non-empty tokens per line.

    Lines are split on single spaces; each token is stripped and empty
    tokens (from repeated spaces or the trailing newline) are dropped.
    """
    sentences = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            sentences.append([t.strip() for t in line.split(' ') if t.strip()])
    return sentences


def load_dataset_tokenized():
    """Load both corpora as token lists with class-balanced labels.

    Returns:
        (texts, labels): ``texts`` is a list of token lists, positive
        sentences first; ``labels`` is the parallel list of 1 (positive) and
        0 (negative). Both have length ``2 * balance_len``, where
        ``balance_len`` is the size of the smaller corpus.
    """
    pos_sents = _read_tokenized_corpus(os.path.join(data_path, pos_corpus))
    neg_sents = _read_tokenized_corpus(os.path.join(data_path, neg_corpus))

    balance_len = min(len(pos_sents), len(neg_sents))

    # Truncate BOTH corpora to the balanced length. Without this, texts and
    # labels differ in length (or are misaligned) whenever the corpora are
    # not the same size.
    texts = pos_sents[:balance_len] + neg_sents[:balance_len]
    labels = [1] * balance_len + [0] * balance_len

    return texts, labels


def KFold_validation(clf, X, y):
    """Score ``clf`` with shuffled 5-fold cross-validation on tokenized text.

    For every fold a new TF-IDF vectorizer is fitted on the training
    documents only, then the same ``clf`` object is refitted and evaluated
    on the held-out documents.

    Args:
        clf: estimator exposing ``fit`` and ``predict``.
        X: indexable sequence of token lists.
        y: indexable sequence of labels aligned with ``X``.

    Returns:
        (clf, (accuracy, precision, recall, f1)): ``clf`` as left by the fit
        on the last fold, and the metrics averaged over the folds.
    """
    def identity(tokens):
        # Documents are already tokenized; hand them through untouched.
        return tokens

    scorers = (
        ('accuracy', metrics.accuracy_score),
        ('precision', metrics.precision_score),
        ('recall', metrics.recall_score),
        ('f1_score', metrics.f1_score),
    )
    per_fold = {label: [] for label, _ in scorers}

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    for train_idx, test_idx in splitter.split(X):
        train_docs = [X[i] for i in train_idx]
        test_docs = [X[i] for i in test_idx]
        train_labels = [y[i] for i in train_idx]
        test_labels = [y[i] for i in test_idx]

        # The vocabulary/IDF weights come from the training fold only.
        tfidf = TfidfVectorizer(analyzer='word',
                                tokenizer=identity,
                                preprocessor=identity,
                                token_pattern=None)
        tfidf.fit(train_docs)
        train_matrix = tfidf.transform(train_docs)
        test_matrix = tfidf.transform(test_docs)

        clf.fit(train_matrix, train_labels)
        predicted = clf.predict(test_matrix)

        for label, scorer in scorers:
            per_fold[label].append(scorer(test_labels, predicted))

    return clf, tuple(np.mean(per_fold[label]) for label, _ in scorers)


def benchmark_clfs():
    """Cross-validate a fixed set of classifiers on the tokenized corpus.

    Every classifier is passed through ``KFold_validation`` and its averaged
    metrics are gathered into one comparison table.

    Returns:
        (model_dict, df): ``model_dict`` maps the display name to the
        estimator object returned by ``KFold_validation``; ``df`` is a float
        DataFrame with one column per model (columns index named ``Model``)
        and the rows ``accuracy``, ``precision``, ``recall``, ``f1_score``.
    """
    print('Loading dataset...')

    X, y = load_dataset_tokenized()

    classifiers = [
        ('SVC', svm.SVC()),
        ('LinearSVC', svm.LinearSVC()),
        ('XGB', XGBClassifier()),
        ('LightGBM', LGBMClassifier()),
        ('LogisticReg', LogisticRegression())
    ]

    cols = ['Model', 'accuracy', 'precision', 'recall', 'f1_score']
    scores = []
    model_dict = {}

    for name, clf in classifiers:
        model, score = KFold_validation(clf, X, y)
        model_dict[name] = model
        scores.append([name, *score])

    # Index by model name before transposing so the metric columns stay
    # numeric. This avoids the object-dtype round trip and the deprecated
    # pd.to_numeric(errors='ignore') clean-up the table previously needed.
    df = pd.DataFrame(scores, columns=cols).set_index('Model').T

    return model_dict, df

def dummy_fun(doc):
    """Identity function: hand the already-tokenized document back unchanged.

    Passed to TfidfVectorizer as both ``tokenizer`` and ``preprocessor`` so
    the vectorizer does no text processing of its own.
    """
    return doc

def eval_model():
    """Train a LinearSVC on the whole corpus and label the titles in ``test_path``.

    The TF-IDF vectorizer is fitted on the token lists returned by
    ``load_dataset_tokenized``. The CSV at ``test_path`` must have a
    ``title_seg`` column; each title is split into tokens the same way the
    training corpus is before being vectorized.

    Returns:
        pandas.DataFrame: the CSV contents with an added ``polarity`` column
        holding the predicted label for each row.
    """
    print('Loading dataset...')

    X, y = load_dataset_tokenized()

    clf = svm.LinearSVC()

    vectorizer = TfidfVectorizer(analyzer='word',
                                 tokenizer=dummy_fun,
                                 preprocessor=dummy_fun,
                                 token_pattern=None)

    X = vectorizer.fit_transform(X)

    print('Train model...')
    clf.fit(X, y)

    print('Loading comments...')

    df_raw = pd.read_csv(test_path)

    # The vectorizer uses an identity tokenizer, so it needs token lists.
    # A raw string would be iterated character by character and would not
    # match the word vocabulary learned above.
    # NOTE(review): assumes title_seg is space-segmented like the training
    # corpus - confirm against the file that produces it.
    titles = df_raw['title_seg'].fillna('').astype(str)
    token_lists = [
        [tok.strip() for tok in title.split(' ') if tok.strip()]
        for title in titles
    ]

    features = vectorizer.transform(token_lists)
    preds = clf.predict(features)

    df_raw['polarity'] = preds

    # Return the labelled frame; previously the predictions were discarded.
    return df_raw

# Run the full benchmark: ml_models maps each model name to its estimator
# object, scores is the metric-by-model comparison table.
ml_models, scores = benchmark_clfs()
print(scores)
# Persist the table with four decimals; ml_performance_path is not defined
# in this notebook (presumably supplied by `from config import *`).
scores.to_csv(ml_performance_path, float_format='%.4f')

# Labelling of the external titles is disabled; uncomment to run it.
#eval_model()

Loading dataset...
Dataset size  9214
Loading dataset...
Model           SVC  LinearSVC       XGB  LightGBM  LogisticReg
accuracy   0.883764   0.881593  0.815391  0.816910     0.880834
precision  0.880272   0.880544  0.770845  0.827487     0.879024
recall     0.888000   0.882481  0.897100  0.802796     0.882941
f1_score   0.884048   0.881483  0.829120  0.813998     0.880898


In [36]:
# Pick the benchmarked SVC estimator for hyperparameter tuning.
# NOTE(review): this rebinds the name `SVC`, which the first cell imported as
# the sklearn.svm.SVC class; after this cell `SVC` is an estimator instance.
SVC = ml_models['SVC']

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

def dummy_fun(doc):
    """Identity tokenizer/preprocessor: return the token list untouched.

    Redefines the identical helper from the first cell so this cell can
    build its own TfidfVectorizer over pre-tokenized documents.
    """
    return doc

# Load the dataset (token lists and balanced 1/0 labels)
X, y = load_dataset_tokenized()

# Initialize the vectorizer; documents are already tokenized, so both the
# tokenizer and the preprocessor are identity functions.
vectorizer = TfidfVectorizer(analyzer='word',
                             tokenizer=dummy_fun,
                             preprocessor=dummy_fun,
                             token_pattern=None)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the vocabulary/IDF weights on the training split only
vectorizer.fit(X_train)
# Transform the training and test data
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

# Define the parameter grid for grid search
# (gamma has no effect for the linear kernel, so those combinations repeat)
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Perform grid search to find the best hyperparameters.
# A fresh default svm.SVC() is passed instead of the name `SVC`: that name is
# the imported class unless the preceding cell rebound it to an instance, so
# relying on it makes this cell depend on execution order. GridSearchCV clones
# the estimator, so a default SVC gives the same search as the benchmarked one.
grid_search = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best model and its hyperparameters
best_clf = grid_search.best_estimator_
best_params = grid_search.best_params_

# Make predictions on the test set using the best model
preds = best_clf.predict(X_test)

# Print the classification report
print(classification_report(y_test, preds))


              precision    recall  f1-score   support

           0       0.89      0.89      0.89       946
           1       0.88      0.88      0.88       897

    accuracy                           0.89      1843
   macro avg       0.89      0.89      0.89      1843
weighted avg       0.89      0.89      0.89      1843



In [38]:
# Show the hyperparameter combination selected by the grid search.
best_params

{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}