This notebook for sentiment analysis is inspired by [this article](https://medium.com/the-innovation/sentiment-analysis-using-lightgbm-alternative-approach-to-rnn-and-lstm-55ee6f32e066). 

We will be training a LightGBM model to perform this multiclass prediction task. Hyperparameters will be tuned by Optuna.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string, re
import spacy

import nltk
from nltk.corpus import stopwords
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups
from textblob import TextBlob
import en_core_web_sm

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.metrics import log_loss, f1_score, classification_report, confusion_matrix, \
roc_auc_score, precision_score, accuracy_score, f1_score, recall_score, plot_roc_curve, roc_curve, auc

import tensorflow_hub as hub
import lightgbm as lgb
import optuna

In [2]:
data = pd.read_csv("news_headlines_train.csv")
data

Unnamed: 0,text,sentiment
0,"In addition , a further 29 employees can be la...",-1
1,The authorisation is in force until the end of...,0
2,The value of the deal was not disclosed .,0
3,You need to be ready when the window opens up ...,0
4,Major Order in India Comptel Corporation has r...,1
...,...,...
3188,The Insolvency Act regulates the amount of deb...,0
3189,We have also cut our price projections for pap...,-1
3190,"Tyrvaan Sanomat , published twice a week by Ty...",0
3191,"pct lower at 4,442.10 .",0


# Preprocessing and Feature Engineering

Here, we preprocess the data using the following techniques:
- Set text to lowercase
- replace the " 's" instances prevalent throughout the text
- Removing punctuation
- Removing stopwords
- Lemmatization

In [3]:
stop_words = stopwords.words("english")

sp = en_core_web_sm.load()
lookups = Lookups()
lemm = Lemmatizer(lookups)

In [4]:
def remove_punctuation(text):
    
    punct = list(string.punctuation)
    
    for punctuation in punct:
        text = text.replace(punctuation, ' ')
    return text

def remove_stopwords(text):
    text_list = [word for word in text.split() if not word in stop_words]
    return ' '.join(text_list)

def lemmatize(text):
    dummy = []    
    for word in sp(text):
        dummy.append(word.lemma_)
    return ' '.join(dummy)

def process_text(data, col):
    """
    Processes the text by sowercase, replace " 's", remove punctuation, 
    remove stopwords and lemmatization.
    
    Parameters:
    -----------
        data (dataframe)
        col (string): name of the Series containing text data to be processed
        
    Returns:
    --------
         df (dataframe): dataframe containing an additional column (token) for processed text
    """
    df = data.copy()
    df['token'] = df[col].apply(lambda x: x.lower())
    df['token'] = df.token.apply(lambda x: x.replace(" 's'", ""))
    df['token'] = df.token.apply(remove_punctuation)
    
    # removing stopwords
    df["token"] = df.token.apply(lambda x: remove_stopwords(x))
    # lemmatization
    df["token"] = df.token.apply(lambda x: lemmatize(x))
    
    return df

In [5]:
def feature_engineer(data):
    """
    Processing for subjectivity, polarity, 
    sentence encoding, TF-IDF encoding and count vectorizer encoding.
    
    Parameters:
    -----------
        data (dataframe): dataframe containing the Series 'token'
        
    Returns:
    --------
         processed_sent_tfidf_count (dataframe): dataframe with features
    """
    df = data.copy()
    # subjectivity
    df['subjectivity'] = df.token.apply(lambda text: TextBlob(text).sentiment.subjectivity)
    # polarity
    df['polarity'] = df.token.apply(lambda text: TextBlob(text).sentiment.polarity)
    # processing polarity
    df['analysis'] = df.polarity.apply(lambda score: 'Negative' if score < 0 else 'Neutral' if score == 0 else 'Positive')
    dummy_val = pd.get_dummies(df['analysis'], prefix='analysis')
    processed_df = pd.concat([df, dummy_val], axis=1)
    processed_df = processed_df.drop("analysis", axis=1)
    
    # sentence encoding
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
    model = hub.load(module_url)
    sent = pd.DataFrame(np.array(model(processed_df.token)))
    processed_sent = pd.merge(processed_df, sent, left_index=True, right_index=True)
    
    # TF-IDF encoding
    tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 10), max_features=1000)
    features = tfidf.fit_transform(processed_df.token).toarray()
    features_df = pd.DataFrame(features, columns=tfidf.get_feature_names())
    processed_sent_tfidf = pd.merge(processed_sent, features_df, left_index=True, right_index=True)
    
    # Count vector encoding
    countvec = CountVectorizer(analyzer='word', ngram_range=(1, 10), max_features=1000)
    features = countvec.fit_transform(processed_df.token).toarray()
    features_df = pd.DataFrame(features, columns=countvec.get_feature_names())
    processed_sent_tfidf_count = pd.merge(processed_sent_tfidf, features_df, left_index=True, right_index=True)
    
    return processed_sent_tfidf_count

In [6]:
data_proc = process_text(data, "text")
data_eng = feature_engineer(data_proc)

In [None]:
# TRY DONT RUN THIS FIRST ---------
processed_text = process_text(data, "text")
# removing stopwords
processed_text["token"] = processed_text.token.apply(lambda x: remove_stopwords(x))
# lemmatization
processed_text["token"] = processed_text.token.apply(lambda x: lemmatize(x))
# subjectivity
processed_text['subjectivity'] = processed_text["token"].apply(lambda text: TextBlob(text).sentiment.subjectivity)
# polarity
processed_text['polarity'] = processed_text["token"].apply(lambda text: TextBlob(text).sentiment.polarity)
# processing polarity
processed_text['analysis'] = processed_text['polarity'].apply(lambda score: 'Negative' if score < 0 else 'Neutral' if score == 0 else 'Positive')
dummy_val = pd.get_dummies(processed_text['analysis'], prefix='analysis')
processed_text = pd.concat([processed_text, dummy_val], axis=1)
processed_text.drop("analysis", axis=1, inplace=True)

In [None]:
processed_text.head()

In [None]:
# Sentence encoding
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
sent = pd.DataFrame(np.array(model(processed_text.token)))
print(sent.shape)
processed_sent = pd.merge(processed_text, sent, left_index=True, right_index=True)

In [None]:
processed_sent.head()

In [None]:
# TF-IDF encoding
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 10), max_features=1000)
features = tfidf.fit_transform(processed_text.token).toarray()
features_df = pd.DataFrame(features, columns=tfidf.get_feature_names())
print(features_df.shape)
processed_sent_tfidf = pd.merge(processed_sent, features_df, left_index=True, right_index=True)

In [None]:
# Count vector encoding
countvec = CountVectorizer(analyzer='word', ngram_range=(1, 10), max_features=1000)
features = countvec.fit_transform(processed_text.token).toarray()
features_df = pd.DataFrame(features, columns=countvec.get_feature_names())
print(features_df.shape)
processed_sent_tfidf_count = pd.merge(processed_sent_tfidf, features_df, left_index=True, right_index=True)
# ------------------

In [7]:
# train-test-split

X = data_eng.drop(["text", "sentiment", "token"], axis=1)
y = data_eng["sentiment"] + 1 # adjust the label space as (0,1,2) because LGB can't take -1

# training 60%, validation 20%, test 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=11)

# Hyperparameter Optimization

In [8]:
def objective_lgb(trial):
    dtrain = lgb.Dataset(X_train, label=y_train)

    param = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "num_classes": 3,
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_depth": trial.suggest_int("max_depth", 2, 100)
    }

    gbm = lgb.train(param, dtrain)
    preds = gbm.predict(X_val)
    pred_labels = np.argmax(preds, axis=1)
    labels = [0, 1, 2]
    auc = roc_auc_score(label_binarize(y_val.values, classes=labels), label_binarize(pred_labels, classes=labels))
    return auc

study_lgb = optuna.create_study(direction="maximize")
study_lgb.optimize(objective_lgb, n_trials=300)

print("Number of finished trials: {}".format(len(study_lgb.trials)))
print("Best trial:")
lgb_best_trial = study_lgb.best_trial
print("  Value: {}".format(lgb_best_trial.value))
print("  Params: ")
for key, value in lgb_best_trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2020-11-14 14:37:15,932][0m A new study created in memory with name: no-name-d521a8fd-8eb1-4de3-9fa0-02a66f4b9274[0m
[32m[I 2020-11-14 14:37:16,920][0m Trial 0 finished with value: 0.674318379880829 and parameters: {'lambda_l1': 0.08451279979606717, 'lambda_l2': 3.542536582539453e-05, 'num_leaves': 205, 'feature_fraction': 0.5014005784068217, 'bagging_fraction': 0.6771843311417529, 'bagging_freq': 2, 'min_child_samples': 59, 'max_depth': 5}. Best is trial 0 with value: 0.674318379880829.[0m
[32m[I 2020-11-14 14:37:17,917][0m Trial 1 finished with value: 0.6813121638020793 and parameters: {'lambda_l1': 0.002027086588610803, 'lambda_l2': 1.9703863289873658, 'num_leaves': 238, 'feature_fraction': 0.6397768846393576, 'bagging_fraction': 0.5818202779001234, 'bagging_freq': 3, 'min_child_samples': 88, 'max_depth': 64}. Best is trial 1 with value: 0.6813121638020793.[0m
[32m[I 2020-11-14 14:37:32,202][0m Trial 2 finished with value: 0.6378402938997926 and parameters: {'lambd

[32m[I 2020-11-14 14:38:18,308][0m Trial 21 finished with value: 0.6715386780277656 and parameters: {'lambda_l1': 0.0004957522893758777, 'lambda_l2': 0.003344979055252797, 'num_leaves': 9, 'feature_fraction': 0.6711472674970221, 'bagging_fraction': 0.6907402016159749, 'bagging_freq': 7, 'min_child_samples': 31, 'max_depth': 4}. Best is trial 1 with value: 0.6813121638020793.[0m
[32m[I 2020-11-14 14:38:19,006][0m Trial 22 finished with value: 0.6710266192330084 and parameters: {'lambda_l1': 6.670193168027152e-05, 'lambda_l2': 0.002120417066488674, 'num_leaves': 3, 'feature_fraction': 0.7476130718978691, 'bagging_fraction': 0.6945980011377281, 'bagging_freq': 4, 'min_child_samples': 34, 'max_depth': 3}. Best is trial 1 with value: 0.6813121638020793.[0m
[32m[I 2020-11-14 14:38:20,105][0m Trial 23 finished with value: 0.6592008568509408 and parameters: {'lambda_l1': 0.0011489921971986874, 'lambda_l2': 7.120421981931105, 'num_leaves': 90, 'feature_fraction': 0.6231304396942012, 'ba

[32m[I 2020-11-14 14:38:59,792][0m Trial 43 finished with value: 0.6768572127033874 and parameters: {'lambda_l1': 0.00080494752936564, 'lambda_l2': 0.33164987190200007, 'num_leaves': 243, 'feature_fraction': 0.6040981158189681, 'bagging_fraction': 0.5922832883162404, 'bagging_freq': 6, 'min_child_samples': 9, 'max_depth': 4}. Best is trial 1 with value: 0.6813121638020793.[0m
[32m[I 2020-11-14 14:39:04,023][0m Trial 44 finished with value: 0.6746057159495181 and parameters: {'lambda_l1': 0.001302132814779608, 'lambda_l2': 2.832574290697692, 'num_leaves': 243, 'feature_fraction': 0.5300224465419668, 'bagging_fraction': 0.5863889854904547, 'bagging_freq': 6, 'min_child_samples': 11, 'max_depth': 58}. Best is trial 1 with value: 0.6813121638020793.[0m
[32m[I 2020-11-14 14:39:14,018][0m Trial 45 finished with value: 0.6517291012573102 and parameters: {'lambda_l1': 0.0009318910298824387, 'lambda_l2': 0.30239590523027243, 'num_leaves': 241, 'feature_fraction': 0.5276376986314594, 'ba

[32m[I 2020-11-14 14:40:23,408][0m Trial 65 finished with value: 0.6540178436987913 and parameters: {'lambda_l1': 0.00010754793917052278, 'lambda_l2': 0.0010420086680628837, 'num_leaves': 230, 'feature_fraction': 0.5577177791372656, 'bagging_fraction': 0.5591003069755159, 'bagging_freq': 5, 'min_child_samples': 90, 'max_depth': 59}. Best is trial 1 with value: 0.6813121638020793.[0m
[32m[I 2020-11-14 14:40:24,296][0m Trial 66 finished with value: 0.664653475717918 and parameters: {'lambda_l1': 5.7035302503988295e-05, 'lambda_l2': 2.7378679410984084e-05, 'num_leaves': 215, 'feature_fraction': 0.6064198750586068, 'bagging_fraction': 0.598206774677884, 'bagging_freq': 7, 'min_child_samples': 96, 'max_depth': 5}. Best is trial 1 with value: 0.6813121638020793.[0m
[32m[I 2020-11-14 14:40:29,008][0m Trial 67 finished with value: 0.6563567258256404 and parameters: {'lambda_l1': 0.0036170195888852673, 'lambda_l2': 1.4888157868107256e-06, 'num_leaves': 245, 'feature_fraction': 0.7035427

[32m[I 2020-11-14 14:41:09,691][0m Trial 87 finished with value: 0.6762371604471147 and parameters: {'lambda_l1': 1.3889076753665979, 'lambda_l2': 1.2532785012497567, 'num_leaves': 231, 'feature_fraction': 0.4474106772094827, 'bagging_fraction': 0.5539912719278002, 'bagging_freq': 1, 'min_child_samples': 42, 'max_depth': 58}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:41:10,980][0m Trial 88 finished with value: 0.6550540419340024 and parameters: {'lambda_l1': 0.9946818319717636, 'lambda_l2': 0.4707040465583889, 'num_leaves': 204, 'feature_fraction': 0.43029553870877907, 'bagging_fraction': 0.6726722205207053, 'bagging_freq': 2, 'min_child_samples': 42, 'max_depth': 58}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:41:11,922][0m Trial 89 finished with value: 0.6472695383193741 and parameters: {'lambda_l1': 7.650583840736975, 'lambda_l2': 1.29953698035158, 'num_leaves': 211, 'feature_fraction': 0.40428450859642995, 'baggin

[32m[I 2020-11-14 14:41:38,511][0m Trial 109 finished with value: 0.6427654396900345 and parameters: {'lambda_l1': 0.760297911297571, 'lambda_l2': 0.208531931409313, 'num_leaves': 221, 'feature_fraction': 0.5374358709147813, 'bagging_fraction': 0.5693301344351511, 'bagging_freq': 2, 'min_child_samples': 11, 'max_depth': 58}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:41:39,580][0m Trial 110 finished with value: 0.645619455980634 and parameters: {'lambda_l1': 1.4568062892863167, 'lambda_l2': 0.4392849288246415, 'num_leaves': 245, 'feature_fraction': 0.5670157998943878, 'bagging_fraction': 0.5255400583759999, 'bagging_freq': 4, 'min_child_samples': 55, 'max_depth': 67}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:41:40,187][0m Trial 111 finished with value: 0.6753431365372419 and parameters: {'lambda_l1': 0.00046002207898264045, 'lambda_l2': 0.0003374223537248373, 'num_leaves': 232, 'feature_fraction': 0.4644854839137272,

[32m[I 2020-11-14 14:42:08,563][0m Trial 131 finished with value: 0.6690012988729935 and parameters: {'lambda_l1': 0.0005470517244408843, 'lambda_l2': 0.0038588555267351525, 'num_leaves': 234, 'feature_fraction': 0.6849950852294957, 'bagging_fraction': 0.7056839631374672, 'bagging_freq': 2, 'min_child_samples': 46, 'max_depth': 68}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:42:11,857][0m Trial 132 finished with value: 0.6714684592436453 and parameters: {'lambda_l1': 4.080565158936859, 'lambda_l2': 0.00038359580978116056, 'num_leaves': 123, 'feature_fraction': 0.5801457994625667, 'bagging_fraction': 0.724020223617683, 'bagging_freq': 2, 'min_child_samples': 5, 'max_depth': 5}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:42:13,531][0m Trial 133 finished with value: 0.663229007648967 and parameters: {'lambda_l1': 0.00024072650874118465, 'lambda_l2': 0.0030761565565270666, 'num_leaves': 222, 'feature_fraction': 0.706294471

[32m[I 2020-11-14 14:42:52,308][0m Trial 153 finished with value: 0.6656901520259008 and parameters: {'lambda_l1': 0.0001817011711239898, 'lambda_l2': 1.9305023656905118, 'num_leaves': 233, 'feature_fraction': 0.6233594072882217, 'bagging_fraction': 0.6690383368975874, 'bagging_freq': 4, 'min_child_samples': 40, 'max_depth': 87}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:42:54,317][0m Trial 154 finished with value: 0.6666718522924103 and parameters: {'lambda_l1': 0.0002491575518212775, 'lambda_l2': 1.3330300192691964, 'num_leaves': 239, 'feature_fraction': 0.6196617846916725, 'bagging_fraction': 0.7460330752874413, 'bagging_freq': 4, 'min_child_samples': 44, 'max_depth': 70}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:42:56,344][0m Trial 155 finished with value: 0.6751088739782581 and parameters: {'lambda_l1': 2.7627107345627086e-06, 'lambda_l2': 0.7402621147150408, 'num_leaves': 255, 'feature_fraction': 0.74282343227

[32m[I 2020-11-14 14:43:27,630][0m Trial 175 finished with value: 0.6587771944172105 and parameters: {'lambda_l1': 8.823874853264156e-05, 'lambda_l2': 5.68828646481127, 'num_leaves': 244, 'feature_fraction': 0.6259872938888857, 'bagging_fraction': 0.633530552354237, 'bagging_freq': 4, 'min_child_samples': 44, 'max_depth': 80}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:43:28,803][0m Trial 176 finished with value: 0.6667549371143252 and parameters: {'lambda_l1': 0.0014594010670245818, 'lambda_l2': 0.00030112214996961373, 'num_leaves': 234, 'feature_fraction': 0.7207507219497068, 'bagging_fraction': 0.5803572055619205, 'bagging_freq': 6, 'min_child_samples': 84, 'max_depth': 77}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:43:30,413][0m Trial 177 finished with value: 0.6673956868379491 and parameters: {'lambda_l1': 0.00811917619184079, 'lambda_l2': 0.6767354768687025, 'num_leaves': 256, 'feature_fraction': 0.5934090231910

[32m[I 2020-11-14 14:44:28,320][0m Trial 197 finished with value: 0.659985362453339 and parameters: {'lambda_l1': 0.10725581413180396, 'lambda_l2': 1.6426996126763944, 'num_leaves': 224, 'feature_fraction': 0.4857519893506363, 'bagging_fraction': 0.6997801998832912, 'bagging_freq': 2, 'min_child_samples': 11, 'max_depth': 53}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:44:29,863][0m Trial 198 finished with value: 0.6635722167012076 and parameters: {'lambda_l1': 6.153460712743362e-05, 'lambda_l2': 6.411405111580826, 'num_leaves': 233, 'feature_fraction': 0.6352409772387716, 'bagging_fraction': 0.6186378672308859, 'bagging_freq': 1, 'min_child_samples': 45, 'max_depth': 91}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:44:31,319][0m Trial 199 finished with value: 0.6504641500303231 and parameters: {'lambda_l1': 0.00047410708977953654, 'lambda_l2': 2.6170715599481467, 'num_leaves': 246, 'feature_fraction': 0.528278003993206

[32m[I 2020-11-14 14:44:55,367][0m Trial 219 finished with value: 0.666770454897068 and parameters: {'lambda_l1': 0.000373909379617573, 'lambda_l2': 2.293409517307981, 'num_leaves': 185, 'feature_fraction': 0.5186471346131599, 'bagging_fraction': 0.7319179780877944, 'bagging_freq': 1, 'min_child_samples': 44, 'max_depth': 9}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:44:57,602][0m Trial 220 finished with value: 0.6597774104296791 and parameters: {'lambda_l1': 0.00011887887520994373, 'lambda_l2': 0.010154459914127107, 'num_leaves': 231, 'feature_fraction': 0.5735464351962516, 'bagging_fraction': 0.7057971839059035, 'bagging_freq': 2, 'min_child_samples': 40, 'max_depth': 55}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:44:58,538][0m Trial 221 finished with value: 0.664377498031785 and parameters: {'lambda_l1': 0.12554437821208767, 'lambda_l2': 3.8807481666149895e-06, 'num_leaves': 193, 'feature_fraction': 0.490428369450

[32m[I 2020-11-14 14:45:30,511][0m Trial 241 finished with value: 0.6597176203713367 and parameters: {'lambda_l1': 0.0004149308125695786, 'lambda_l2': 0.0003539178963177358, 'num_leaves': 221, 'feature_fraction': 0.7406098960556445, 'bagging_fraction': 0.5710686580521915, 'bagging_freq': 1, 'min_child_samples': 88, 'max_depth': 8}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:45:31,460][0m Trial 242 finished with value: 0.6585874021619224 and parameters: {'lambda_l1': 0.0007673750983603476, 'lambda_l2': 8.3031150010608e-05, 'num_leaves': 240, 'feature_fraction': 0.7274753228005133, 'bagging_fraction': 0.6968610109678536, 'bagging_freq': 6, 'min_child_samples': 90, 'max_depth': 4}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:45:32,010][0m Trial 243 finished with value: 0.6567288145797243 and parameters: {'lambda_l1': 0.04564318528042512, 'lambda_l2': 1.3370123283785713, 'num_leaves': 216, 'feature_fraction': 0.603112380925

[32m[I 2020-11-14 14:46:03,117][0m Trial 263 finished with value: 0.671672765332923 and parameters: {'lambda_l1': 7.058201915968007e-06, 'lambda_l2': 8.635509900486229, 'num_leaves': 256, 'feature_fraction': 0.6020965666371816, 'bagging_fraction': 0.5681623752815386, 'bagging_freq': 1, 'min_child_samples': 81, 'max_depth': 45}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:46:04,144][0m Trial 264 finished with value: 0.6681858055083199 and parameters: {'lambda_l1': 2.878577118306107e-06, 'lambda_l2': 9.712348345382633, 'num_leaves': 251, 'feature_fraction': 0.6063981077031402, 'bagging_fraction': 0.5584279053701087, 'bagging_freq': 1, 'min_child_samples': 79, 'max_depth': 48}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:46:05,265][0m Trial 265 finished with value: 0.6557755896164479 and parameters: {'lambda_l1': 1.0000579649220116e-06, 'lambda_l2': 6.521447530835704, 'num_leaves': 256, 'feature_fraction': 0.623935988913114

[32m[I 2020-11-14 14:46:31,325][0m Trial 285 finished with value: 0.6506863828606214 and parameters: {'lambda_l1': 0.013651175621343789, 'lambda_l2': 0.16667608063374523, 'num_leaves': 256, 'feature_fraction': 0.6948522485620997, 'bagging_fraction': 0.5261264925687139, 'bagging_freq': 7, 'min_child_samples': 91, 'max_depth': 50}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:46:32,387][0m Trial 286 finished with value: 0.6670344117587824 and parameters: {'lambda_l1': 9.946303352468245e-05, 'lambda_l2': 0.00013415807427891903, 'num_leaves': 226, 'feature_fraction': 0.704255752170895, 'bagging_fraction': 0.5451590852256176, 'bagging_freq': 7, 'min_child_samples': 89, 'max_depth': 50}. Best is trial 69 with value: 0.6864696801647412.[0m
[32m[I 2020-11-14 14:46:33,361][0m Trial 287 finished with value: 0.6698772478516734 and parameters: {'lambda_l1': 8.23801338341192e-05, 'lambda_l2': 0.5847874255565941, 'num_leaves': 244, 'feature_fraction': 0.4435749783

Number of finished trials: 300
Best trial:
  Value: 0.6864696801647412
  Params: 
    lambda_l1: 3.7838762230735277e-06
    lambda_l2: 5.533558574907331
    num_leaves: 204
    feature_fraction: 0.7625087163682107
    bagging_fraction: 0.6117164024996633
    bagging_freq: 6
    min_child_samples: 47
    max_depth: 61


In [9]:
lgb_params = {
    **{
    "objective": "multiclass",
    "metric": "multi_logloss",
    "num_classes": 3,
    "verbosity": -1,
    "boosting_type": "gbdt"},
    **lgb_best_trial.params}

# Training, Testing and Metrics

For precision, recall and F1 score, macro-averaged results are presented. The micro-averaged equivalents are equal to accuracy.

In [10]:
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_clf = lgb.train(lgb_params, lgb_train, 100)
lgb_pred = lgb_clf.predict(X_test)
lgb_pred = [np.argmax(i) for i in lgb_pred]

In [11]:
labels = [0,1,2]
print("Accuracy: %.7f" %accuracy_score(y_test, lgb_pred))
print("Precision (macro): %.7f" %precision_score(y_test, lgb_pred, average="macro"))
print("Recall (macro): %.5f" %recall_score(y_test, lgb_pred, average="macro"))
print("F1 score (macro): %.7f" %f1_score(y_test, lgb_pred, average="macro"))
print("AUC: %.5f" %roc_auc_score(label_binarize(y_test.values, classes=labels), 
                                 label_binarize(lgb_pred, classes=labels)))

Accuracy: 0.7104851
Precision (macro): 0.6722978
Recall (macro): 0.54913
F1 score (macro): 0.5797378
AUC: 0.66919


# Submission Predictions

In [12]:
sub = pd.read_csv("news_headlines_test_sample_submission.csv")

In [13]:
# preprocessing

sub_proc = process_text(sub, "text")
sub_eng = feature_engineer(sub_proc)
sub1 = sub_eng.drop(["text", "sentiment", "token"], axis=1)









In [14]:
# prediction

sub_pred = lgb_clf.predict(sub1)
sub_pred = [np.argmax(i)-1 for i in sub_pred]

In [15]:
submission = sub.copy()
submission["sentiment"] = sub_pred

In [16]:
submission.to_csv("submission.csv", index=False)