In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
from nltk.stem import WordNetLemmatizer

In [3]:
# Used to install wordnet lemmatizer
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

In [2]:
from sklearn.feature_extraction.text import CountVectorizer 
import numpy as np
import pandas as pd
import nltk
import os
import string

from tqdm import tqdm


In [5]:
import time, sys
from IPython.display import clear_output
def update_progress(progress):
    """Redraw a one-line, 20-character text progress bar in the cell output.

    ``progress`` is a completion fraction. Ints are converted to float, any
    other non-float value is treated as 0, values below 0 are raised to 0 and
    values of 1 or more are capped at 1. The previous output of the cell is
    cleared before the new bar is printed.
    """
    bar_length = 20

    # Normalise the input type first.
    if isinstance(progress, int):
        fraction = float(progress)
    elif isinstance(progress, float):
        fraction = progress
    else:
        fraction = 0

    # Then clamp it to the displayable range.
    if fraction < 0:
        fraction = 0
    elif fraction >= 1:
        fraction = 1

    filled = int(round(bar_length * fraction))
    bar = "#" * filled + "-" * (bar_length - filled)

    # wait=True defers clearing until the replacement text is ready.
    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}%".format(bar, fraction * 100))

# Loading the files needed to compute the priority threshold

In [6]:
# Load filtered_df.csv and turn every missing value into an empty string.
filtered_df = (
    pd.read_csv("/kaggle/input/nlp-customdataset/filtered_df.csv")
    .replace(to_replace=np.nan, value="")
)

In [7]:
# Load the term weights used for message bodies; missing values become empty strings.
msg_useful_terms_without_digit = (
    pd.read_csv("/kaggle/input/nlp-customdataset/msg_useful_terms_without_digit.csv")
    .replace(to_replace=np.nan, value="")
)

In [8]:
# Load the term weights used for subjects; missing values become empty strings.
thread_term_weights = (
    pd.read_csv("/kaggle/input/nlp-customdataset/thread_term_weights.csv")
    .replace(to_replace=np.nan, value="")
)

In [9]:
# Load the per-thread (subject) weights; missing values become empty strings.
thread_weights = (
    pd.read_csv("/kaggle/input/nlp-customdataset/thread_weights.csv")
    .replace(to_replace=np.nan, value="")
)

In [10]:
def get_weights(search_term, weight_df, term=True):
    """Return the mean weight of the rows of ``weight_df`` matched by ``search_term``.

    Parameters
    ----------
    search_term : iterable of str, or str
        With ``term=True``: the terms to look up; a row matches when its
        ``term`` value equals any of them. With ``term=False``: a single
        string; a row matches when its ``subject`` value contains that string
        as a literal substring (no regular expressions).
    weight_df : pandas.DataFrame
        Needs a ``weight`` column, plus a ``term`` column (``term=True``) or a
        ``subject`` column (``term=False``).
    term : bool, default True
        Chooses between exact term lookup and substring lookup on subjects.

    Returns
    -------
    number
        The mean of the matched weights, or the neutral weight ``1`` when
        ``search_term`` is empty or nothing matches.
    """
    if len(search_term) == 0:
        return 1

    if term:
        # One vectorized membership test instead of one full-column comparison
        # per term. list() keeps element-wise semantics for any iterable.
        term_match = weight_df["term"].isin(list(search_term))
    else:
        # na=False: a missing subject counts as "no match" instead of putting
        # NaN into the boolean mask, which would make the selection below fail.
        term_match = weight_df["subject"].str.contains(search_term, regex=False, na=False)

    match_weights = weight_df.loc[term_match, "weight"]
    if len(match_weights) < 1:
        return 1
    return match_weights.mean()

In [11]:
def _term_weight(text, weight_df):
    """Mean weight of the terms CountVectorizer extracts from ``text``; 1 if that fails."""
    try:
        vec = CountVectorizer()
        vec.fit_transform([text])
        return get_weights(vec.get_feature_names_out(), weight_df)
    except Exception:
        # Best effort: some texts result in an empty vocabulary, in which case
        # the neutral weight is used. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt/SystemExit through, so a long ranking loop can
        # still be interrupted.
        return 1


def rank_message(msg):
    """Compute the priority rank of one message.

    Parameters
    ----------
    msg : mapping-like row (e.g. a pandas Series)
        Must provide string values under ``'subject'`` and ``'content'``.

    Returns
    -------
    tuple
        ``(rank, subject)`` where ``rank`` is the product, as a float, of three
        weights: thread activity (looked up in ``thread_weights``), subject
        terms (``thread_term_weights``) and content terms
        (``msg_useful_terms_without_digit``). Each weight falls back to 1 when
        it cannot be determined.
    """
    subject = msg['subject']

    # Thread activity: a subject that splits on 're ' is treated as a reply and
    # the text after the first 're ' is looked up among the known threads.
    # NOTE(review): 're ' is matched anywhere in the subject, not only as a
    # prefix (e.g. "compare prices" also splits) - confirm this is consistent
    # with how thread_weights was built before tightening it.
    subject_parts = subject.split('re ')
    if len(subject_parts) > 1:
        msg_thread_activity_wt = get_weights(subject_parts[1], thread_weights, term=False)
    else:
        msg_thread_activity_wt = 1

    # Term-based weights for the subject line and for the message body.
    msg_thread_term_wt = _term_weight(subject, thread_term_weights)
    msg_terms_wt = _term_weight(msg['content'], msg_useful_terms_without_digit)

    # Calculating Rank
    rank = float(msg_thread_activity_wt) * float(msg_thread_term_wt) * float(msg_terms_wt)

    return rank, subject

# Begin the training process: rank every message

In [12]:
# Rank every row of filtered_df, collecting the results column-wise so they
# can be turned into a DataFrame afterwards.
rank_dict = {
    'rank': [],
    'subject': [],
}

n_messages = filtered_df.shape[0]
for i in range(n_messages):
    msg_rank, msg_subject = rank_message(filtered_df.iloc[i, :])
    rank_dict['rank'].append(msg_rank)
    rank_dict['subject'].append(msg_subject)
    # i + 1 messages are done at this point, so the bar finishes at exactly 100%.
    update_progress((i + 1) / n_messages)

Progress: [####################] 100.0%


In [13]:
# One column per key of rank_dict ('rank' and 'subject').
rank_df = pd.DataFrame(rank_dict)

In [14]:
# Save the ranks to rank_df.csv in the working directory, without the index column.
rank_df.to_csv("rank_df.csv", index = False)

In [3]:
# Reload the previously saved ranks; missing values become empty strings.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests
# this CSV was written with its index - consider index_col=0; confirm first.
rank_df = (
    pd.read_csv("/kaggle/input/rank-dataset/rank_df.csv")
    .replace(to_replace=np.nan, value="")
)

In [4]:
# Display the loaded ranks.
rank_df

Unnamed: 0,rank,subject
0,8.435921,hilcorp old ocean
1,9.517106,meter concorde churchill
2,7.505679,revision sea robin volume
3,11.429807,natural gas nomination
4,10.416552,november prelim wellhead production estimate
...,...,...
16540,7.046366,proposed decision
16541,8.433988,etc event schlitterbahn
16542,10.972719,el paso start line conversion
16543,8.541704,re eog pronghorn location


As the next cell shows, the priority threshold is the median rank across all ranked messages, which comes out to 9.009694517030455

In [6]:
# The priority threshold is the median of the 'rank' column.
priority_threshold = rank_df['rank'].median()
print(priority_threshold)

9.009694517030455
