In [1]:
# importing libraries
import pandas as pd
from nltk.corpus import stopwords
import joblib
from sklearn.model_selection import train_test_split
import string
from sklearn.metrics import classification_report
import re
import numpy as np
import random
from sklearn.metrics import confusion_matrix
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Show the installed joblib version (the cell's last expression is echoed as its output).
joblib.__version__

'1.1.0'

In [3]:
import lightgbm

# Show the installed lightgbm version (the cell's last expression is echoed as its output).
lightgbm.__version__

'3.3.3'

In [1]:
# installing libraries
!pip install -q lightgbm # lgbm model
!pip install -q sentence-transformers # sentence-transformer

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Set TOKENIZERS_PARALLELISM so tokenization during embedding may run in parallel
# (presumably read by the tokenizer backend of the embedding model — TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "True"

In [45]:
# Sentence-embedding model: all-MiniLM-L6-v2.
# Author's notes (not verifiable from this notebook — check the model card): overall
# benchmark score of ~58 (originally written as "rogue-L"), 384-dim output, max length 128.

model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

In [46]:
# read test data

# Do not change this cell
test_path="/home/jovyan/input/data/test.parquet"

test_data = pd.read_parquet(test_path)

In [47]:
# Note: the feature below improved the score by roughly 0.002
# (from 0.9522 to 0.9538).

def get_method_name(code:str)->str:
    """Return the part of ``code`` that comes before its first colon.

    For a snippet that starts with a ``def`` line this is the signature text
    (e.g. ``"def foo(self)"``). When ``code`` has no colon the whole string
    is returned unchanged.

    Args:
        code: source text of a snippet.

    Returns:
        The text preceding the first ``":"``.
    """
    head, _separator, _remainder = code.partition(":")
    return head

In [48]:
# Add the `method_feature` column: the text of each `code` value before its first colon.
test_data['method_feature'] = test_data['code'].apply(get_method_name)

In [49]:
# Base stop-word list: NLTK's English stop words, as a plain Python list
# (assumes the NLTK "stopwords" corpus is already available locally — TODO confirm).
STOP_WORDS = stopwords.words("english")

# Custom stop words identified during analysis of the data: Python keywords that
# carry little signal ('def', 'self', 'return', ...) plus long filler tokens made
# of repeated characters.
custom_stowords_list = ['def','self','returns','return',
                        'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz',
                       'zzzzzzzzzz','zzzzzzzz','zzzzzzz','zzzzz',
                       'aaaabnzacyceaaaadaqabaaabaqddurxdpyhoquyodutxjduzqmjyjqjszt',
       'aaaabnzacyceaaaadaqabaaabaqdcgxehzf', 'aaaabnzacyceaaa',
       'aaaabnzacyce', 'aaaabnzacyc', 'aaaabnzacy',
       'aaaabnzackcmaaacbakdpkarimlm', 'aaaabbbccdaabbb', 'aaaabbbcca',
       'aaaabaaacaaadaaaeaaafaaagaaahaaaiaaajaaakaaalaaamaaanaaaoaaapaaaqaaaraaasaaataaa',
       'aaaaargh', 'aaaaabaaa', 'aaaaabaa',
       'aaaaaaeceeeeiiiidnoooooouuuuysaaaaaaaceeeeiiii', 'aaaaaabaedaa',
       'aaaaaaaarge', 'aaaaaaaalaaaaaaadwpwaaaaaaaaaa',
       'aaaaaaaalaaaaaaadwpqzmzmzmdk', 'aaaaaaaadjuqwsqicqdwlclimq',
       'aaaaaaaaaaagaagaaagaaa', 'aaaaaaaaaaaahaaaaaaaaa',
       'aaaaaaaaaaaaaaaaaadwpwaaaaaaapc',
       'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
       'aaaaaaaaaaaaaaaa', 'aaaaaaaaaaaaaaa', 'aaaaaaaaaaaaaa',
       'aaaaaaaaaaaaa', 'aaaaaaaaaaaa', 'aaaaaaaaaaa', 'aaaaaaaaaa',
       'aaaaaaaaa', 'aaaaaaaa', 'aaaaaaa', 'aaaaaa', 'aaaaa', 'aaaa',
       'aaa', 'aa']

In [50]:
# Append the custom entries to the main stop-word list (mutates STOP_WORDS in place).
# Note: re-running this cell appends the same entries again.
STOP_WORDS.extend(custom_stowords_list)

In [51]:

def preprocess(x):
    """Lower-case ``x``, filter its whitespace-separated tokens and delete all digits.

    A token is dropped only when it is found in BOTH ``STOP_WORDS`` and
    ``custom_stowords_list``; every other token is kept. The surviving tokens
    are re-joined with single spaces, then every run of digits is removed.

    NOTE(review): the filter is the De Morgan form of the original
    ``not in STOP_WORDS or not in custom_stowords_list``. With that ``or``, a
    word that is only in ``STOP_WORDS`` (a plain English stop word) is kept.
    If such words were meant to be removed the test needs ``and``; that would
    change the text handed to the embedding model, so the behaviour is left
    untouched here until it is confirmed against how the model was trained.

    Args:
        x: any value; it is converted with ``str()`` first.

    Returns:
        The filtered, digit-free, lower-cased string.
    """
    lowered = str(x).lower()
    survivors = [
        token
        for token in lowered.split()
        if not (token.lower() in STOP_WORDS and token.lower() in custom_stowords_list)
    ]
    return re.sub(r"([0-9]+)", "", " ".join(survivors))

def clean_data(lines):
    """Strip punctuation from each line of ``lines`` and return one lower-cased string.

    The text is split on newlines. Within each line, every run of the listed
    punctuation characters (together with any spaces directly after it) becomes
    a single space; the characters ``[``, ``+``, ``]`` and ``-`` are deleted
    outright; whitespace is collapsed and the words are lower-cased. The cleaned
    lines are finally joined with single spaces, so an empty line still
    contributes a separator (``"a\\n\\nb"`` becomes ``"a  b"``).

    Args:
        lines: text, possibly spanning several lines.

    Returns:
        The cleaned text as a single string.
    """

    def _scrub(raw_line):
        # Collapse punctuation runs into single spaces.
        stripped = re.sub(r"""
               [,.;@#?!&$()|^<='\\_`:>"%/{}*]+  # Accept one or more copies of punctuation
               \ *           # plus zero or more copies of a space,
               """,
               " ",          # and replace it with a single space
               raw_line, flags=re.VERBOSE)
        # Characters the pattern above does not cover are removed without a replacement.
        for symbol in ("[", "+", "]", "-"):
            stripped = stripped.replace(symbol, "")
        # Tokenise on whitespace, lower-case, and glue back together.
        return ' '.join(token.lower() for token in stripped.split())

    return " ".join(_scrub(raw_line) for raw_line in lines.split("\n"))

In [52]:
def pre_processing_pipeline(df):
    """Clean the ``code``, ``docstring`` and ``method_feature`` columns of ``df``.

    Steps, in order:
      1. Docstrings that contain ``https://`` get every punctuation character
         replaced by a space.
      2. All three columns have missing values replaced by ``""`` and are passed
         through ``clean_data``.
      3. All three columns are passed through ``preprocess``.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame holding ``code``, ``docstring`` and ``method_feature`` columns.

    Returns:
        The same DataFrame object with the three columns replaced by cleaned text.
    """
    text_columns = ["code", "docstring", "method_feature"]

    def convert_html_text(text):
        """Replace all punctuation in ``text`` with spaces when it contains an https link."""
        # NOTE(review): only "https://" triggers this; text with "http://" links is
        # returned unchanged. Left as-is because it affects the model's input text.
        if 'https://' in text:
            for pattern in string.punctuation:
                text = text.replace(pattern, " ")
        return text

    # Fill missing docstrings BEFORE the link conversion: `'https://' in None/NaN`
    # raises TypeError, so a single missing docstring used to abort the pipeline.
    df['docstring'] = df['docstring'].fillna("").apply(convert_html_text)

    print("[Info] Running cleaning pipeline!")
    for column in text_columns:
        df[column] = df[column].fillna("").apply(clean_data)
    print("[Info] Cleaning done!")

    print("[Info] Running stopwords removal pipeline!")
    for column in text_columns:
        df[column] = df[column].apply(preprocess)
    print("[Info] Stopwords removal complete!")

    return df
    

In [53]:
# running pre-porcessing pipeline
%time

test_data = pre_processing_pipeline(test_data)

CPU times: user 18 µs, sys: 0 ns, total: 18 µs
Wall time: 59.6 µs
[Info] Running cleaning pipeline!
[Info] Cleaning done!
[Info] Running stopwords removal pipeline!
[Info] Stopwords removal complete!


In [54]:
from collections import Counter
import math

# This below similarity feature increased the score by 0.9
# https://stackoverflow.com/questions/15173225/calculate-cosine-similarity-given-2-sentence-strings

def counter_cosine_similarity(c1,c2):
    """Return the cosine similarity of two term-count mappings.

    Args:
        c1: mapping from term to count (e.g. ``collections.Counter``); only
            ``.get`` and iteration over its keys are used.
        c2: second mapping, same kind as ``c1``.

    Returns:
        ``dot(c1, c2) / (|c1| * |c2|)`` computed over the union of both key
        sets, or ``0.0`` when either mapping has zero magnitude (for example
        when it is empty), since the similarity is undefined in that case.
    """
    terms = set(c1).union(c2)
    dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)
    magA = math.sqrt(sum(c1.get(k, 0) ** 2 for k in terms))
    magB = math.sqrt(sum(c2.get(k, 0) ** 2 for k in terms))

    # Guard the division explicitly instead of hiding every possible error
    # behind a bare `except:`.
    denominator = magA * magB
    if denominator == 0:
        return 0.0
    return dotprod / denominator
  

In [55]:
%time

test_data['sim_score'] = test_data.apply(lambda x: 
                               counter_cosine_similarity(Counter(x['code'].split(" ")),
                                                        Counter(x['docstring'].split(" "))),
                               axis=1
                              )

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.96 µs


## Calculating embedding of the features

In [4]:
def get_sentence_embedding(list_of_sentences: np.ndarray) -> np.ndarray:
    """Encode a collection of sentences with the notebook-level ``model``.

    Args:
        list_of_sentences: array-like of strings; the call sites in this
            notebook pass ``Series.values``.

    Returns:
        Whatever ``model.encode`` returns for the input, one embedding per
        sentence (a NumPy array under sentence-transformers' default
        settings — TODO confirm for the installed version).
    """
    # A progress bar is shown because encoding a full column can take a while.
    return model.encode(list_of_sentences, show_progress_bar=True)

In [None]:
# Embed each cleaned text column with the sentence-transformer model.
code_embedding_ = get_sentence_embedding(test_data.code.values)  # full code text
code_embedding_feature = get_sentence_embedding(test_data.method_feature.values)  # text before the first colon
doc_embedding_ = get_sentence_embedding(test_data.docstring.values)  # docstring text

In [56]:
# # code features
# code_embeds = test_data.code.values
# code_embedding_ = model.encode(code_embeds,show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [57]:
# # code method feature
# code_feature = test_data.method_feature.values
# code_embedding_feature = model.encode(code_feature,show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [58]:
# # docsting feature
# doc_embeds = test_data.docstring.values
# doc_embedding_ = model.encode(doc_embeds,show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
import numpy

def data_prepare(code_embedding_feature: np.ndarray,
                 doc_embedding_: np.ndarray,
                 code_embedding_: np.ndarray,
                 test_data: pd.DataFrame) -> np.ndarray:
    """Assemble the model's input matrix from the embeddings and the similarity score.

    The three embedding arrays are concatenated column-wise in the order
    method-feature, docstring, code, and the ``sim_score`` column of
    ``test_data`` is appended as the last column.

    Args:
        code_embedding_feature: 2-D array of embeddings of the ``method_feature`` text.
        doc_embedding_: 2-D array of embeddings of the ``docstring`` text.
        code_embedding_: 2-D array of embeddings of the ``code`` text.
        test_data: DataFrame with a ``sim_score`` column and one row per
            embedding row.

    Returns:
        2-D array whose columns are the three embeddings followed by ``sim_score``.
    """
    embedding_block = np.concatenate(
        [code_embedding_feature, doc_embedding_, code_embedding_],
        axis=1,
    )

    # Double brackets keep the similarity score 2-D so it can be joined column-wise.
    similarity_column = test_data[['sim_score']].to_numpy()

    return np.concatenate([embedding_block, similarity_column], axis=1)
    
    

In [59]:
# # concatenating our code, docsting, code method feature in one single numpy array
# train_c = np.concatenate([code_embedding_feature,
#                           doc_embedding_,
#                           code_embedding_],axis=1)

In [60]:
# using our features and converting it to numpy to add in embedding features
# features_array = test_data[['sim_score']].to_numpy()

In [61]:
# # concatenating similarity feature to embedding feature
# train_c_with_features = np.concatenate(
#     [train_c,features_array],
#     axis=1)

In [66]:
# Load the fine-tuned LightGBM model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
lgb_model = joblib.load('lgbm_sentence_model_whole_data_with_sim_2000.pkl')

In [None]:
def get_prediction(train_c_with_features):
    """Score the feature matrix with ``lgb_model`` and build the submission frame.

    The model output is thresholded at 0.5 to obtain 0/1 labels. As a side
    effect the labels are stored in the ``y_pred`` column of the notebook-level
    ``test_data`` frame.

    Args:
        train_c_with_features: feature matrix with one row per row of ``test_data``.

    Returns:
        DataFrame holding only the ``id`` and ``y_pred`` columns.
    """
    # Output of the loaded model for every row.
    scores = lgb_model.predict(train_c_with_features)
    # Boolean mask times 1 gives integer 0/1 labels.
    test_data['y_pred'] = (scores >= 0.5) * 1
    # Only the id and y_pred columns belong in the output.
    return test_data[['id', 'y_pred']]

In [None]:
# Build the model input (embeddings + similarity score) before predicting;
# without this step `train_c_with_features` is undefined on a fresh kernel.
train_c_with_features = data_prepare(code_embedding_feature,
                                     doc_embedding_,
                                     code_embedding_,
                                     test_data)
submission_file = get_prediction(train_c_with_features)

In [63]:
# #use loaded model to to get predictions
# pred_prob = lgb_model.predict(train_c_with_features)

In [64]:
# # probability
# pred_class = (pred_prob >=0.5)*1
# test_data['y_pred'] = pred_class
# submission_file = test_data[['id','y_pred']] # only the id and y_pred to be part of output

In [65]:
# Final cell: write the submission CSV, which holds only the `id` and `y_pred` columns.
submission_file.to_csv('lgbm_sentence_model_whole_data_with_sim_2000_result.csv', 
                       index = False) # index = False is required