To obtain features from the input data, we first need a function that extracts a column from a CSV file:

In [1]:
import time as t
#27486
#NOTE(review): the number above looks like a previously used sample size — confirm what it refers to.
#Intended cap on the number of samples; no code visible in this notebook reads it.
sample_limit = 100
#Wall-clock start time; the last cell subtracts it from the end time to report the total runtime.
start = t.time()

#Import pandas library for data handling.
import pandas as pd

def get_data_from_column(column_title, file, data_dir="/kaggle/input/tweet-sentiment-extraction/"):
    """Return one column of a CSV file as a pandas Series.

    Args:
        column_title: Header name of the column to extract.
        file: File name of the CSV, relative to ``data_dir``.
        data_dir: Directory that holds the CSV files. Defaults to the Kaggle
            input folder used throughout this notebook.

    Returns:
        The selected column of the CSV file.

    Raises:
        KeyError: If the CSV has no column named ``column_title``.
    """
    #Imported locally so this cell does not depend on another cell's imports.
    import os

    #os.path.join works whether or not data_dir ends with a path separator,
    #unlike plain string concatenation.
    file_path = os.path.join(data_dir, file)

    #Open the csv file using pandas and hand back only the requested column.
    data_file = pd.read_csv(file_path)
    return data_file[column_title]


Now that we can extract data from any of the CSV files, we need a function that produces the N-grams for each tweet:

In [2]:
import nltk
import numpy as np

def generate_ngrams(tweet):
    """Return every contiguous word n-gram of a tweet as a 1-D array of strings.

    The tweet is converted with ``str`` and split on single spaces, so runs of
    consecutive spaces produce empty tokens. An empty tweet therefore yields
    one empty n-gram instead of raising.

    Args:
        tweet: The tweet text; any object is accepted and passed through ``str``.

    Returns:
        1-D numpy array holding one space-joined string per n-gram, in the
        order produced by ``nltk.everygrams``.
    """
    tokens = str(tweet).split(' ')
    #The longest possible n-gram spans all tokens, so the token count (not the
    #character count of the tweet) is the correct upper bound.
    tuple_grams = nltk.everygrams(tokens, 1, len(tokens))
    #Build the flat array directly; reshaping to a column and concatenating the
    #rows back together gave the same array but failed when there were no n-grams.
    return np.array([' '.join(gram) for gram in tuple_grams])

Now that the ngrams can be produced, we can now write the feature extraction functions:

A function to encode the sentiment information as a number:

In [3]:
def encode_sentiment(sentiment):
    """Map a sentiment label to a number.

    Args:
        sentiment: One of "positive", "neutral" or "negative".

    Returns:
        1, 0 or -1 respectively; ``None`` for any other value.
    """
    #Label/code pairs are checked in the same order as the labels are listed.
    encoding = (("positive", 1), ("neutral", 0), ("negative", -1))
    for label, code in encoding:
        if sentiment == label:
            return code
    return None

A function to extract the sentiment scores from a string of characters:

In [4]:
import sys
sys.path.append("/kaggle/input/vader-sentiment/")

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#One shared SentimentIntensityAnalyzer, built once and reused by every call below.
analyser = SentimentIntensityAnalyzer()

def extract_sentiments_from_text(text):
    """Return the analyser's polarity scores for ``text`` as a plain list.

    Args:
        text: The string to score (a whole tweet or a single n-gram).

    Returns:
        List of the score values, in the key order of the dictionary returned
        by ``analyser.polarity_scores``.
    """
    scores = analyser.polarity_scores(text)
    #Keep only the values; the key order of the score dictionary is preserved.
    return [scores[label] for label in scores]

A function to extract the POS (part-of-speech) information about an N-gram:

In [5]:
def get_tag_list():
    """Return the keys of NLTK's ``upenn_tagset`` help table as a list."""
    upenn_tags = nltk.load('help/tagsets/upenn_tagset.pickle')
    return [tag for tag in upenn_tags.keys()]

#Loaded once so get_POS_info does not re-read the pickle on every call.
tag_list = get_tag_list()

def get_POS_info(ngram):
    """Count how often each tag of ``tag_list`` occurs in an n-gram.

    Args:
        ngram: The text to tokenize and tag.

    Returns:
        List with one count per entry of ``tag_list``, in ``tag_list`` order.
        Tags that are not in ``tag_list`` are ignored.
    """
    tokens = nltk.word_tokenize(ngram)
    #pos_tag yields (word, tag) pairs; only the tag is needed here.
    found_tags = [pair[1] for pair in nltk.pos_tag(tokens)]
    return [found_tags.count(known_tag) for known_tag in tag_list]

In [6]:
def find_all_caps_count(text):
    """Count the whitespace-separated words of ``text`` for which ``str.isupper`` is true."""
    upper_words = [word for word in text.split() if word.isupper()]
    return len(upper_words)

def find_punctuation_count(text):
    """Count the occurrences of '!', '?' and '#' in ``text``.

    Returns:
        List of three counts, in the order '!', '?', '#'.
    """
    return [text.count(mark) for mark in ('!', '?', '#')]


DistilBERT setup for feature extraction (loads the pretrained tokenizer and model):

In [7]:
import torch
import transformers as ppb # pytorch transformers

In [8]:
#Bundle the DistilBERT model class, its tokenizer class and the local directory that holds the pretrained weights.
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, '/kaggle/input/distilbertbaseuncased')

# Load pretrained model/tokenizer
#NOTE(review): neither `tokenizer` nor `model` is referenced by the feature functions visible in this notebook — confirm whether this cell is still needed.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Now we can write a function to collate and compile all the features for one ngram:

In [9]:
def default_features(text):
    """Build the text-level feature list shared by tweets and n-grams.

    Args:
        text: The string to describe (a whole tweet or a single n-gram).

    Returns:
        List made of, in order: the character length of ``text``, its
        sentiment scores, its all-caps word count and its punctuation counts.
    """
    return [
        len(text),
        *extract_sentiments_from_text(text),
        find_all_caps_count(text),
        *find_punctuation_count(text),
    ]

In [10]:
def extract_features_from_ngram(tweet, tweet_sentiment, ngram):
    """Build the feature vector for one n-gram of a tweet.

    Args:
        tweet: Full tweet text.
        tweet_sentiment: Sentiment label of the tweet, encoded with
            ``encode_sentiment``.
        ngram: The candidate n-gram taken from the tweet.

    Returns:
        List made of, in order: the default features of the tweet, the encoded
        tweet sentiment, the default features of the n-gram, and the ratio of
        n-gram length to tweet length (in characters).
    """
    tweet_length = len(tweet)
    #An empty tweet has no meaningful ratio; use 0.0 instead of dividing by zero.
    length_ratio = len(ngram) / tweet_length if tweet_length else 0.0

    #List to hold features.
    ngram_features = []
    ngram_features += default_features(tweet)
    ngram_features.append(encode_sentiment(tweet_sentiment))
    ngram_features += default_features(ngram)
    ngram_features.append(length_ratio)

    return ngram_features

Feature order is as follows:

* Tweet Length
* Tweet Negative Sentiment Score
* Tweet Neutral Sentiment Score
* Tweet Positive Sentiment Score
* Tweet Compound Sentiment Score
* Tweet All Caps Count
* Tweet Punctuation Counts (one each for `!`, `?` and `#`)
* Tweet Overall Sentiment (encoded as 1, 0 or -1)

* N-gram Length
* N-gram Negative Sentiment Score
* N-gram Neutral Sentiment Score
* N-gram Positive Sentiment Score
* N-gram Compound Sentiment Score
* N-gram All Caps Count
* N-gram Punctuation Counts (one each for `!`, `?` and `#`)
* N-gram to Tweet Length Ratio

This set of features is extracted for each of the N-grams from one tweet:

In [11]:
def extract_features_from_tweet(tweet,tweet_sentiment):
    """Extract one feature vector per n-gram of a tweet.

    Args:
        tweet: The tweet; converted with ``str`` before use.
        tweet_sentiment: Sentiment label of the tweet.

    Returns:
        numpy array with one row of features per n-gram, in the order the
        n-grams are generated.
    """
    tweet_text = str(tweet)
    #One feature row for every N-gram generated from the tweet.
    rows = [
        extract_features_from_ngram(tweet_text, tweet_sentiment, str(ngram))
        for ngram in generate_ngrams(tweet_text)
    ]
    return np.array(rows)

Now that we can extract all the features we need from a tweet, we must calculate the label for each N-gram: its Jaccard similarity to the selected text.

Code for jaccard similarity calculation:

In [12]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two texts.

    Both inputs are converted with ``str``, lower-cased and split on
    whitespace; the score is the size of the intersection of the two word
    sets divided by the size of their union.

    Args:
        str1: First text.
        str2: Second text.

    Returns:
        Float between 0.0 and 1.0. When neither text contains a word the two
        word sets are identical, so 1.0 is returned instead of dividing by zero.
    """
    a = set(str(str1).lower().split())
    b = set(str(str2).lower().split())
    union_size = len(a | b)
    if union_size == 0:
        #Both word sets are empty: treat them as a perfect match.
        return 1.0
    return len(a & b) / union_size

Now we can write a function to calculate the jaccard similarity for each of the N-grams in a tweet:

In [13]:
def get_ngram_jaccard_labels(tweet, tweet_selected_text):
    """Score every n-gram of a tweet against the tweet's selected text.

    Args:
        tweet: The tweet whose n-grams are scored.
        tweet_selected_text: The target text the n-grams are compared with.

    Returns:
        numpy array with one Jaccard score per n-gram, in the order the
        n-grams are generated.
    """
    scores = [
        jaccard(ngram, tweet_selected_text)
        for ngram in generate_ngrams(tweet)
    ]
    return np.array(scores)

Now that we can extract both the features and labels from a tweet we can write a function to collate all the features and labels from all of the tweets:

In [14]:
def get_all_features(tweets,sentiments):
    """Extract the n-gram feature matrices of every tweet.

    Args:
        tweets: Iterable of tweets (for example a pandas Series).
        sentiments: Iterable of sentiment labels, aligned with ``tweets``.

    Returns:
        1-D numpy object array with one entry per tweet; entry ``i`` is the
        feature matrix returned by ``extract_features_from_tweet`` for tweet
        ``i``. It can be indexed per tweet or passed to ``np.concatenate``.
    """
    #Pair the inputs positionally; indexing a pandas Series with tweets[i] is
    #label-based and breaks when the index is not 0..n-1.
    per_tweet = [
        extract_features_from_tweet(tweet, sentiment)
        for tweet, sentiment in zip(tweets, sentiments)
    ]

    #Tweets have different numbers of n-grams, so the matrices are ragged.
    #Fill an object array explicitly: np.array() on ragged input raises on
    #recent NumPy versions.
    features = np.empty(len(per_tweet), dtype=object)
    for position, tweet_features in enumerate(per_tweet):
        features[position] = tweet_features

    return features

In [15]:
def get_all_labels(tweets,selected_texts):
    """Compute the n-gram Jaccard labels of every tweet.

    Args:
        tweets: Iterable of tweets (for example a pandas Series).
        selected_texts: Iterable of target texts, aligned with ``tweets``.

    Returns:
        1-D numpy object array with one entry per tweet; entry ``i`` is the
        label array returned by ``get_ngram_jaccard_labels`` for tweet ``i``.
        It can be indexed per tweet or passed to ``np.concatenate``.
    """
    #Pair the inputs positionally; indexing a pandas Series with tweets[i] is
    #label-based and breaks when the index is not 0..n-1.
    per_tweet = [
        get_ngram_jaccard_labels(tweet, selected_text)
        for tweet, selected_text in zip(tweets, selected_texts)
    ]

    #The label arrays have different lengths, so fill an object array
    #explicitly: np.asanyarray() on ragged input raises on recent NumPy versions.
    labels = np.empty(len(per_tweet), dtype=object)
    for position, tweet_labels in enumerate(per_tweet):
        labels[position] = tweet_labels

    return labels

Then we can bring these two functions together when fitting the regression model:

Load CSV data for training:

In [16]:
#Training inputs: tweet text, tweet sentiment label and the target selected text.
#Each call goes through get_data_from_column, which reads train.csv again.
tweets = get_data_from_column("text","train.csv")
sentiments = get_data_from_column("sentiment","train.csv")
selected_texts = get_data_from_column("selected_text","train.csv")

Extract training features:

In [17]:
#Stack the per-tweet feature matrices into one matrix with a row per N-gram.
total_features = np.concatenate(get_all_features(tweets, sentiments))

Get training labels:

In [18]:
#Join the per-tweet label arrays into one vector with an entry per N-gram.
total_labels = np.concatenate(get_all_labels(tweets,selected_texts))

Train LightGBM model:

In [19]:
import lightgbm as lgb

#Compile a training dataset
#The stacked N-gram features and their Jaccard labels are wrapped in a LightGBM Dataset.
train_data = lgb.Dataset(total_features, label=total_labels)

#Set the model's parameters
#num_round is passed to lgb.train below as the number of boosting rounds.
num_round = 10
#Huber objective with DART boosting and 'num_leaves' set to 500.
param = {'num_leaves': 500, 'objective': 'huber','boosting': 'dart'}

#Train the model
bst = lgb.train(param, train_data, num_round)
#Persist the trained booster; the prediction cell reloads it from this file.
bst.save_model('model.txt')

<lightgbm.basic.Booster at 0x7f4b941b6080>

Now that our model has been fitted to the training data, we can apply it to the test data:

In [20]:
#Create a list of all ngrams for all tweets.
def get_test_ngrams(tweets):
    """Generate the n-grams of every tweet.

    Args:
        tweets: Iterable of tweets (for example a pandas Series).

    Returns:
        numpy object array of shape ``(number of tweets, 1)``; element
        ``[i, 0]`` (equivalently ``[i][0]``) is the n-gram array of tweet ``i``.
    """
    tweet_list = list(tweets)

    #Tweets have different numbers of n-grams, so the result is ragged. Fill an
    #object array explicitly: np.array() on ragged input raises on recent
    #NumPy versions.
    test_ngrams = np.empty((len(tweet_list), 1), dtype=object)
    for row, tweet in enumerate(tweet_list):
        test_ngrams[row, 0] = generate_ngrams(tweet)

    return test_ngrams


Load testing data:

In [21]:
#Test inputs: tweet text and tweet sentiment label (each call reads test.csv again).
test_tweets = get_data_from_column("text","test.csv")
test_sentiments = get_data_from_column("sentiment","test.csv")

Extract features:

In [22]:
#Kept per tweet (not concatenated) so the prediction loop can score one tweet's N-grams at a time.
test_features = get_all_features(test_tweets,test_sentiments)

In [23]:
bst = lgb.Booster(model_file='model.txt')  # init model


#Generate test ngrams.
test_ngrams = get_test_ngrams(test_tweets)
#List to hold all resulting predicted texts.
#NOTE: this rebinds `selected_texts`, which earlier held the training targets.
selected_texts = []

#Walk the per-tweet feature matrices and their candidate N-grams in step.
for tweet_features, tweet_ngrams in zip(test_features, test_ngrams):

    #Produce a list of jaccard score predictions for each ngram feature list for this tweet.
    prediction = bst.predict(tweet_features, num_iteration=bst.best_iteration)

    #Index of the first maximum score, found in one pass instead of converting
    #the prediction to a list twice and scanning it twice.
    best_prediction_index = int(np.argmax(prediction))

    #Keep the ngram with the maximum score; element 0 holds this tweet's ngram array.
    selected_texts.append(tweet_ngrams[0][best_prediction_index])

In [24]:
import csv
#Converting the selected texts list to numpy array.
selected_texts = np.asanyarray(selected_texts)

#The submission IDs come from sample_submission.csv and are cut to the number of predictions made.
#NOTE(review): this assumes sample_submission.csv lists the tweets in the same order as test.csv — confirm.
textIDs = get_data_from_column("textID","sample_submission.csv")
textIDs = textIDs[:len(selected_texts)]
#Create a dataframe from the numpy array using IDs as the index.
selected_texts = pd.DataFrame(selected_texts,index=textIDs,columns=["selected_text"])
#Output to CSV
selected_texts.to_csv('submission.csv')

print(selected_texts)
#Report the total wall-clock runtime measured from `start`, set in the first cell.
end = t.time()
print(str(end-start) + " Seconds")

                                                selected_text
textID                                                       
f87dea47db  Last session of the day  http://twitpic.com/67ezh
96d74cb729                                    really exciting
eee518ae67                                             shame!
01082688c6                                              happy
33987a8ee5                                          like it!!
...                                                       ...
e5f0e6ef4b                                              tired
416863ce47  Thanks for the net which keeps me alive and ki...
6332da480c                                      depression...
df1baec676                                               love
469e15c5a8                                               cute

[3534 rows x 1 columns]
1354.8018107414246 Seconds
