# In [None]:
import numpy as np
import pandas as pd
import re

import spacy
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from spellchecker import SpellChecker
import lightgbm as lgbm

import tqdm
import time


start_time = time.time()

# LOAD DATA SETS
# Number of rows kept from each shuffled CSV.
TOY_NUM = 200

print('--------------- Data preparation ------------------')

# For every file: read it, shuffle the rows, then keep the first TOY_NUM rows.
train_df = shuffle(pd.read_csv("train-data.csv"))[:TOY_NUM]
test_df = shuffle(pd.read_csv("test-data.csv"))[:TOY_NUM]
submission_df = shuffle(pd.read_csv("predict-data.csv"))[:TOY_NUM]

# Stack the train and test samples into one frame.
combined_df = pd.concat([train_df, test_df])

# DATA EXPLORATION
# Summary statistics first, then the first five rows to see what the text looks like.
print(combined_df.describe())
print(combined_df.head())

# Get all the unique keywords
#print(combined_df["review"]-str.split.unique())

# Create small function to clean text
# Patterns applied in order by text_clean(); every match is replaced by one space.
_TEXT_CLEAN_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"http\S+",           # URLs
        r"<.*?>",             # HTML tags; must run before '<' and '>' are stripped below
        r"\bRT ",             # retweet marker (word boundary keeps e.g. "START " intact)
        r"[^a-zA-Z'.,\d\s]",  # anything except letters, digits, whitespace and ' . ,
        r"[0-9]",             # digits
        r"\s+",               # collapse whitespace runs (tabs and newlines included)
    )
)


def text_clean(text):
    """Strip URLs, HTML tags, retweet markers, symbols and digits from ``text``.

    Each removed span is replaced by a space and whitespace runs are collapsed
    to a single space at the end, so the result may still start or end with
    one space.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # The patterns are real raw-string regexes. Building them as "r" + element
    # put a literal "r" in front of every pattern, which both disabled the
    # intended cleaning and deleted the trailing "r" of words before whitespace.
    for pattern in _TEXT_CLEAN_PATTERNS:
        text = pattern.sub(" ", text)
    return text

# Clean data sets
# The sampled test rows are already part of combined_df, so test_df is not
# cleaned separately.
combined_df["review"] = combined_df["review"].apply(text_clean)
submission_df["review"] = submission_df["review"].apply(text_clean)

# CORRECT SPELLING

# Module-level spell checker instance read by correct_spellings().
spell = SpellChecker()

# Correct spelling
def correct_spellings(text):
    """Return ``text`` with words the spell checker does not know replaced.

    The text is split on whitespace, each word reported by ``spell.unknown``
    is swapped for ``spell.correction(word)``, and the words are re-joined
    with single spaces.

    Args:
        text: The string to spell-check.

    Returns:
        The corrected string, with whitespace runs collapsed to single spaces.
    """
    words = text.split()  # split once and reuse for both the lookup and the loop
    misspelled_words = spell.unknown(words)
    # NOTE(review): spell.unknown() appears to lower-case its results, so
    # capitalised words never match here -- confirm whether that is intended.
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # correction() can return None when it finds no candidate; keep
            # the original word so the join below does not raise TypeError.
            suggestion = spell.correction(word)
            corrected_text.append(word if suggestion is None else suggestion)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)
# Spellcheck data sets
#train_df.text = train_df.text.apply(correct_spellings)
#val_df.text = val_df.text.apply(correct_spellings)

print('--------------- Vectorizing data ------------------')

start_vector_time = time.time()

# VECTORIZE the sentence
nlp = spacy.load('en_core_web_sm')

# Pipeline components skipped while embedding.
# 'tok2vec' is deliberately NOT listed: per the spaCy Doc.vector docs, a model
# without static word vectors (such as en_core_web_sm) derives Doc.vector from
# Doc.tensor, and in spaCy v3 that tensor is written by the 'tok2vec'
# component. Skipping it leaves every review embedding without features.
EMBED_SKIPPED_COMPONENTS = ['parser', 'ner', 'entity_linker', 'entity_ruler',
                            'textcat', 'textcat_multilabel', 'lemmatizer',
                            'morphologizer', 'attribute_ruler', 'senter',
                            'sentencizer', 'transformer']

# Embed sentences for the training set: one flattened document vector per review.
X_train = np.array([
    doc.vector.reshape(-1)
    for doc in nlp.pipe(combined_df.review.values, disable=EMBED_SKIPPED_COMPONENTS)
])
# Labels, in the same row order as the embeddings above.
y_train = combined_df.sentiment.values

end_vector_time = time.time()

print(f'Vectorization Time taken in seconds : {end_vector_time - start_vector_time}')
print(f'Vectorization Time taken in minutes : {(end_vector_time - start_vector_time)/60}')

'''

# Embed sentences for the submission set
submission_data = []
for r in nlp.pipe(submission_df.review.values):
    emb = r.vector
    review_emb = emb.reshape(-1)
    submission_data.append(review_emb)

submission_data = np.array(submission_data)

'''
print('--------------- Training Data ------------------')


# LGBM

# Hold out 20% of the embedded rows for validation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Wrap both splits as LightGBM datasets.
train_data = lgbm.Dataset(X_train, label=y_train)
test_data = lgbm.Dataset(X_test, label=y_test)

# Parameters we'll use for the prediction.
# 'application' is only an alias of 'objective', so it is given once.
# NOTE(review): LightGBM reports that early stopping is not available in dart
# mode, so with 'boosting': 'dart' all 5000 rounds may run -- confirm 'dart'
# is intended.
parameters = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting': 'dart',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

# Train the classifier.
# Early stopping is requested through the callback: the `early_stopping_rounds`
# keyword of lgbm.train() was removed in LightGBM 4.0, while the callback is
# accepted by both older and newer releases. valid_sets is passed as a list,
# the form every release accepts.
classifier = lgbm.train(parameters,
                        train_data,
                        valid_sets=[test_data],
                        num_boost_round=5000,
                        callbacks=[lgbm.early_stopping(stopping_rounds=100)])

'''

print('--------------- Prediction ------------------')

# PREDICTION
val_pred = classifier.predict(submission_data)

# Submission file
submission_df['sentiment_predicted'] = val_pred.round().astype(int)
submission_df.to_csv('submission_lgbm.csv', index=False)

'''
end_time = time.time()

# Wall-clock duration of the whole cell, reported in seconds and in minutes.
elapsed_seconds = end_time - start_time
print(f'Time taken in seconds : {elapsed_seconds}')
print(f'Time taken in minutes : {elapsed_seconds/60}')


#correct_pred_count =  sum(submission_df['sentiment'] == submission_df['sentiment_predicted'])
#print('The accuracy is : ', (100*correct_pred_count/submission_df.shape[0]))

# In [None]:
import numpy as np
import pandas as pd
import re

import spacy
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from spellchecker import SpellChecker
import lightgbm as lgbm
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform


import tqdm
import time


start_time = time.time()

# LOAD DATA SETS
# Number of rows kept from each shuffled CSV.
TOY_NUM = 200

print('--------------- Data preparation ------------------')

# For every file: read it, shuffle the rows, then keep the first TOY_NUM rows.
train_df = shuffle(pd.read_csv("train-data.csv"))[:TOY_NUM]
test_df = shuffle(pd.read_csv("test-data.csv"))[:TOY_NUM]
submission_df = shuffle(pd.read_csv("predict-data.csv"))[:TOY_NUM]

# Stack the train and test samples into one frame.
combined_df = pd.concat([train_df, test_df])

# DATA EXPLORATION
# Summary statistics first, then the first five rows to see what the text looks like.
print(combined_df.describe())
print(combined_df.head())

print('--------------- Vectorizing data ------------------')

start_vector_time = time.time()

# VECTORIZE the sentence
nlp = spacy.load('en_core_web_sm')

# Pipeline components skipped while embedding.
# 'tok2vec' is deliberately NOT listed: per the spaCy Doc.vector docs, a model
# without static word vectors (such as en_core_web_sm) derives Doc.vector from
# Doc.tensor, and in spaCy v3 that tensor is written by the 'tok2vec'
# component. Skipping it leaves every review embedding without features.
EMBED_SKIPPED_COMPONENTS = ['parser', 'ner', 'entity_linker', 'entity_ruler',
                            'textcat', 'textcat_multilabel', 'lemmatizer',
                            'morphologizer', 'attribute_ruler', 'senter',
                            'sentencizer', 'transformer']

# Embed sentences for the training set: one flattened document vector per review.
X_train = np.array([
    doc.vector.reshape(-1)
    for doc in nlp.pipe(combined_df.review.values, disable=EMBED_SKIPPED_COMPONENTS)
])
# Labels, in the same row order as the embeddings above.
y_train = combined_df.sentiment.values

end_vector_time = time.time()

print(f'Vectorization Time taken in seconds : {end_vector_time - start_vector_time}')
print(f'Vectorization Time taken in minutes : {(end_vector_time - start_vector_time)/60}')

# LGBM
# Hyper-parameter search for an LGBMClassifier: a randomized search over the
# tree/regularisation parameters, then a grid search over scale_pos_weight,
# then a final estimator configured from the best result.

# Split data into train and testing data (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_train,y_train,test_size=0.2, random_state = 42)

# Get the train and test data for the training sequence
# (these Dataset objects are not referenced again in the visible code; the
# scikit-learn style estimators below are fitted on the arrays directly)
train_data = lgbm.Dataset(X_train, label=y_train)
test_data = lgbm.Dataset(X_test, label=y_test)


# Keyword arguments forwarded to every fit() call below; the held-out split is
# used as the evaluation set under the name 'valid'.
# NOTE(review): recent LightGBM releases (4.0+) appear to have removed the
# `early_stopping_rounds` and `verbose` keywords from LGBMClassifier.fit() in
# favour of callbacks -- confirm against the installed version.
fit_params={"early_stopping_rounds":30, 
            "eval_metric" : 'auc', 
            "eval_set" : [(X_test,y_test)],
            'eval_names': ['valid'],
            #'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
            'verbose': 100,
            'categorical_feature': 'auto'}

# Search space for the randomized search: scipy distributions are sampled,
# plain lists are chosen from.
# NOTE(review): min_child_samples starts at 100 while each CSV is capped at
# TOY_NUM rows -- confirm the range suits the sample size.
param_test ={'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
			 'boosting_type':['gbdt','goss'],
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

#This parameter defines the number of HP points to be tested
n_HP_points_to_test = 100

# Second alias for the same lightgbm module (it is also imported as `lgbm`).
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

#n_estimators is set to a "large value". The actual number of trees build will depend on early stopping and 5000 define only the absolute maximum
# NOTE(review): `silent` may no longer be a recognised constructor argument in
# recent LightGBM releases -- confirm against the installed version.
clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=5000)
# Randomized search: n_HP_points_to_test sampled configurations, 5-fold CV,
# scored by ROC AUC; the best configuration is refitted on all of X_train.
gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test, 
    n_iter=n_HP_points_to_test,
    scoring='roc_auc',
    cv=5,
    refit=True,
    random_state=314,
    verbose=True)

gs.fit(X_train, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))


# Fresh estimator that starts from clf's settings ...
clf_sw = lgb.LGBMClassifier(**clf.get_params())
#set optimal parameters
# ... and is then overwritten with the best estimator found by the randomized search.
clf_sw.set_params(**gs.best_estimator_.get_params())
# Grid search over scale_pos_weight only, with the same CV and scoring setup.
gs_sample_weight = GridSearchCV(estimator=clf_sw, 
                                param_grid={'scale_pos_weight':[1,2,6,12]},
                                scoring='roc_auc',
                                cv=5,
                                refit=True,
                                verbose=True)

gs_sample_weight.fit(X_train, y_train, **fit_params)
print('Best score reached: {} with params: {} '.format(gs_sample_weight.best_score_, gs_sample_weight.best_params_))



#Configure from the HP optimisation
# Final estimator: clf_sw's settings, overwritten by the best grid-search estimator.
clf_final = lgb.LGBMClassifier(**clf_sw.get_params())
res = clf_final.set_params(**gs_sample_weight.best_estimator_.get_params())
# Prints the value returned by set_params().
print(res)
def learning_rate_010_decay_power_099(current_iter):
    """Return the learning rate for boosting iteration ``current_iter``.

    The rate starts at 0.1 and is multiplied by 0.99 for every iteration; it
    is never allowed to drop below 1e-3.
    """
    floor = 1e-3
    decayed = 0.1 * np.power(.99, current_iter)
    if decayed > floor:
        return decayed
    return floor

def learning_rate_010_decay_power_0995(current_iter):
    """Return the learning rate for boosting iteration ``current_iter``.

    The rate starts at 0.1 and is multiplied by 0.995 for every iteration; it
    is never allowed to drop below 1e-3.

    Args:
        current_iter: Zero-based index of the boosting iteration.

    Returns:
        The decayed learning rate, floored at 1e-3.
    """
    # The leftover debug print of current_iter was dropped: it wrote one line
    # per boosting iteration and neither sibling schedule does it.
    base_learning_rate = 0.1
    lr = base_learning_rate * np.power(.995, current_iter)
    return lr if lr > 1e-3 else 1e-3

def learning_rate_005_decay_power_099(current_iter):
    """Return the learning rate for boosting iteration ``current_iter``.

    The rate starts at 0.05 and is multiplied by 0.99 for every iteration; it
    is never allowed to drop below 1e-3.
    """
    minimum_rate = 1e-3
    rate = 0.05 * np.power(.99, current_iter)
    if not rate > minimum_rate:
        return minimum_rate
    return rate

# Train the final model with the shared fit parameters.
# Learning-rate decay is currently NOT applied; the variant that passed
#   callbacks=[lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_0995)]
# to fit() is disabled.
clf_final.fit(X_train, y_train, **fit_params)

# Persist the underlying booster so it can be reloaded from this file later.
clf_final.booster_.save_model('final-model.txt')

end_time = time.time()

# Wall-clock duration of the whole cell, reported in seconds and in minutes.
elapsed_seconds = end_time - start_time
print(f'Time taken in seconds : {elapsed_seconds}')
print(f'Time taken in minutes : {elapsed_seconds/60}')

# In [None]:
import numpy as np
import pandas as pd
import re

import spacy
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from spellchecker import SpellChecker
import lightgbm as lgbm

import tqdm
import time


start_time = time.time()

# LOAD DATA SETS
# Number of rows kept from the shuffled prediction CSV.
TOY_NUM = 200

print('--------------- Data preparation ------------------')

# Read the prediction file, shuffle the rows, then keep the first TOY_NUM rows.
submission_df = shuffle(pd.read_csv("predict-data.csv"))[:TOY_NUM]

# DATA EXPLORATION
# Summary statistics first, then the first five rows to see what the text looks like.
print(submission_df.describe())
print(submission_df.head())

print('--------------- Vectorizing data ------------------')

start_vector_time = time.time()

# VECTORIZE the sentence
nlp = spacy.load('en_core_web_sm')

# Pipeline components skipped while embedding.
# 'tok2vec' is deliberately NOT listed: per the spaCy Doc.vector docs, a model
# without static word vectors (such as en_core_web_sm) derives Doc.vector from
# Doc.tensor, and in spaCy v3 that tensor is written by the 'tok2vec'
# component. Skipping it leaves every review embedding without features.
EMBED_SKIPPED_COMPONENTS = ['parser', 'ner', 'entity_linker', 'entity_ruler',
                            'textcat', 'textcat_multilabel', 'lemmatizer',
                            'morphologizer', 'attribute_ruler', 'senter',
                            'sentencizer', 'transformer']

# Embed sentences for the prediction set: one flattened document vector per review.
predict_data = np.array([
    doc.vector.reshape(-1)
    for doc in nlp.pipe(submission_df.review.values, disable=EMBED_SKIPPED_COMPONENTS)
])

end_vector_time = time.time()

print(f'Vectorization Time taken in seconds : {end_vector_time - start_vector_time}')
print(f'Vectorization Time taken in minutes : {(end_vector_time - start_vector_time)/60}')

print('--------------- Predicting Data ------------------')

# Reload the booster from the model file on disk.
bst = lgbm.Booster(model_file='final-model.txt')

# PREDICTION
# One score per embedded review (presumably a probability for the positive
# class -- confirm against the training objective).
val_pred = bst.predict(predict_data)

# Submission file: scores rounded to the nearest integer label, stored in a
# new column, and the frame written out without its index.
predicted_labels = val_pred.round().astype(int)
submission_df['sentiment_predicted'] = predicted_labels
submission_df.to_csv('submission_lgbm.csv', index=False)

end_time = time.time()

# Wall-clock duration of the whole cell, reported in seconds and in minutes.
elapsed_seconds = end_time - start_time
print(f'Time taken in seconds : {elapsed_seconds}')
print(f'Time taken in minutes : {elapsed_seconds/60}')

#correct_pred_count =  sum(submission_df['sentiment'] == submission_df['sentiment_predicted'])
#print('The accuracy is : ', (100*correct_pred_count/submission_df.shape[0]))