In [None]:
# All import statements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno
from bs4 import BeautifulSoup
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import pickle
from nltk.stem import PorterStemmer 
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
# pd.set_option('mode.chained_assignment', None)

In [None]:
# Mount Google Drive at /content/drive so the paths used by the later cells resolve
# (this import only exists inside a Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project root on the mounted Drive; the pickle path in the loading cell is built from it.
home_path = '/content/drive/MyDrive/crowdflower/'

In [None]:
# Load the main DataFrame from its pickle. The context manager closes the file handle
# as soon as loading finishes instead of leaving it to the garbage collector.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open(home_path+'data/main_df.pkl','rb') as main_df_file:
    df = pickle.load(main_df_file)

In [None]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance,cleaned_query1,cleaned_query,cleaned_product_title,cleaned_product_description,ste_cleaned_product_title,ste_cleaned_product_description,product_description_len,product_title_len,query_product_title_words_overlap
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,Red satin accent pillow embroidered with a hea...,1,0.0,bridal shower decor,bridal_shower_decorations,accent pillow heart design red black,red satin accent pillow embroidered heart blac...,accent pillow heart design red black,red satin accent pillow embroid heart black th...,36,50,0.0
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Chr...,Set of 10 Battery Operated Train Christmas Lig...,4,0.0,led christma light,led_christmas_lights,set 10 battery operated multi led train christ...,set 10 battery operated train christmas lights...,set 10 batteri oper multi led train christma l...,set 10 batteri oper train christma light item ...,61,643,1.0


In [None]:
# Regex for a token: a run of two or more word characters delimited by word boundaries.
token_pattern = r"\b\w\w+\b"


def tokens(text):
    """Return, in order, every match of ``token_pattern`` found in ``text``.

    Single-character words are dropped because the pattern requires at least
    two word characters.
    """
    matcher = re.compile(token_pattern, flags = re.UNICODE)
    return list(matcher.findall(text))

# Counting features

In [None]:
def total_word_count(row):
    """Count whitespace-separated words in the query, title and description.

    Parameters
    ----------
    row : mapping
        Must provide string values under 'cleaned_query1',
        'ste_cleaned_product_title' and 'ste_cleaned_product_description'.

    Returns
    -------
    list of int
        ``[query_words, title_words, description_words]``.
    """
    text_columns = (
        'cleaned_query1',
        'ste_cleaned_product_title',
        'ste_cleaned_product_description',
    )
    return [len(row[column].split()) for column in text_columns]

In [None]:
def commonwords(row):
    """Count the distinct words shared between query, title and description.

    Each text is split on whitespace and reduced to a set of words.

    Returns
    -------
    list of int
        Sizes of the intersections, in this order: query/title,
        title/description, description/query, and all three together.
    """
    query, title, description = (
        set(row[column].split())
        for column in (
            'cleaned_query1',
            'ste_cleaned_product_title',
            'ste_cleaned_product_description',
        )
    )
    overlaps = (
        query & title,
        title & description,
        description & query,
        query & title & description,
    )
    return [len(shared) for shared in overlaps]

In [None]:
def commonwords_ratio(row):
    """Ratio of shared distinct words to the longer text's word count.

    For each pairing the numerator is the number of distinct words the texts
    have in common and the denominator is the largest (non-deduplicated) word
    count among the texts involved.

    Parameters
    ----------
    row : mapping
        Must provide string values under 'cleaned_query1',
        'ste_cleaned_product_title' and 'ste_cleaned_product_description'.

    Returns
    -------
    list of float
        Ratios in this order: query/title, title/description,
        description/query, and all three together. A ratio is 0.0 when every
        text involved is empty, instead of raising ``ZeroDivisionError``.
    """
    query_words = row['cleaned_query1'].split()
    title_words = row['ste_cleaned_product_title'].split()
    description_words = row['ste_cleaned_product_description'].split()

    query_words_count = len(query_words)
    title_words_count = len(title_words)
    description_words_count = len(description_words)

    # Reuse the already-split word lists rather than splitting each column twice.
    query = set(query_words)
    title = set(title_words)
    description = set(description_words)

    def ratio(shared, *word_counts):
        # The longest text is the denominator; guard the all-empty case.
        longest = max(word_counts)
        return len(shared) / longest if longest else 0.0

    return [
        ratio(query & title, query_words_count, title_words_count),
        ratio(title & description, title_words_count, description_words_count),
        ratio(description & query, description_words_count, query_words_count),
        ratio(query & title & description,
              query_words_count, title_words_count, description_words_count),
    ]

In [None]:
def try_divide(x, y, val=0.0):
    """Return ``float(x) / y``, or ``val`` unchanged when ``y`` equals zero."""
    return float(x) / y if y != 0.0 else val

In [None]:
def get_position_list(target, obs):
    """
        Return the 1-based positions, within obs, of the items that also
        occur in target; [0] when obs is empty or nothing matches
    """
    matched_positions = []
    for position, item in enumerate(obs, start=1):
        if item in target:
            matched_positions.append(position)
    # [0] is the sentinel for "no match" so the caller's statistics never see an empty list.
    return matched_positions if matched_positions else [0]

In [None]:
def gen_ngrams(text,num):
    """Return, as a list, the ``num``-grams that nltk's ``ngrams`` yields for ``text``."""
    return list(ngrams(text, num))
    

In [None]:
def extract_count_features(data=None):
    """Add n-gram count, overlap and position features to a DataFrame in place.

    The frame must contain the string columns 'cleaned_query1',
    'ste_cleaned_product_title' and 'ste_cleaned_product_description'. For
    each of them ("query", "title", "description") the function adds:

    * the unigram / bigram / trigram lists themselves;
    * total count, unique count and unique ratio for every n-gram size;
    * digit-token count and ratio (unigrams only);
    * ``description_missing``: 1 when the description has no tokens, else 0;
    * for every ordered pair of different fields, how many n-grams of one
      occur in the other, plus the ratio to that field's own n-gram count;
    * min / mean / median / max / std of the 1-based positions of those
      shared n-grams, raw and normalised by the n-gram count.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to extend. When omitted, the notebook-level ``df`` is used,
        which keeps the original zero-argument call working.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was modified.
    """
    frame = df if data is None else data

    feat_names = ["query", "title", "description"]
    grams = ["unigram", "bigram", "trigram"]
    source_columns = {
        "query": "cleaned_query1",
        "title": "ste_cleaned_product_title",
        "description": "ste_cleaned_product_description",
    }

    def count_digit(words):
        # Number of tokens made up solely of digits.
        return sum([1. for w in words if w.isdigit()])

    def count_shared(obs_items, target_items):
        # Build the lookup set once per row rather than once per observed item.
        lookup = set(target_items)
        return sum([1. for w in obs_items if w in lookup])

    #Generate Unigram
    for feat_name in feat_names:
        frame[feat_name + "_unigram"] = frame[source_columns[feat_name]].apply(tokens)

    print('*******'*10)
    print('Starting Bigrams')
    #Generate Bigram
    for feat_name in feat_names:
        frame[feat_name + "_bigram"] = frame[feat_name + "_unigram"].apply(
            lambda words: gen_ngrams(words, 2))

    print('Starting Trigrams')
    # Generate Trigram
    for feat_name in feat_names:
        frame[feat_name + "_trigram"] = frame[feat_name + "_unigram"].apply(
            lambda words: gen_ngrams(words, 3))

    print('Starting count feature')
    for feat_name in feat_names:
        for gram in grams:
            gram_col = feat_name + "_" + gram
            count_col = "count_of_%s_%s" % (feat_name, gram)
            unique_col = "count_of_unique_%s_%s" % (feat_name, gram)
            ## word count
            frame[count_col] = frame[gram_col].apply(len)
            frame[unique_col] = frame[gram_col].apply(lambda items: len(set(items)))
            frame["ratio_of_unique_%s_%s" % (feat_name, gram)] = list(
                map(try_divide, frame[unique_col], frame[count_col]))

        ## digit count
        frame["count_of_digit_in_%s" % feat_name] = frame[feat_name + "_unigram"].apply(count_digit)
        frame["ratio_of_digit_in_%s" % feat_name] = list(
            map(try_divide, frame["count_of_digit_in_%s" % feat_name],
                frame["count_of_%s_unigram" % (feat_name)]))

    # description_unigram holds token lists, so "missing" means an empty list
    # (comparing the list with "" could never be true).
    frame["description_missing"] = frame["description_unigram"].apply(
        lambda words: int(len(words) == 0))

    print('Intersection features')
    for gram in grams:
        for obs_name in feat_names:
            for target_name in feat_names:
                if target_name != obs_name:
                    shared_col = "count_of_%s_%s_in_%s" % (obs_name, gram, target_name)
                    frame[shared_col] = [
                        count_shared(obs_items, target_items)
                        for obs_items, target_items in zip(
                            frame[obs_name + "_" + gram], frame[target_name + "_" + gram])
                    ]
                    frame["ratio_of_%s_%s_in_%s" % (obs_name, gram, target_name)] = list(
                        map(try_divide, frame[shared_col],
                            frame["count_of_%s_%s" % (obs_name, gram)]))

        ## some other feat
        frame["title_%s_in_query_div_query_%s" % (gram, gram)] = list(
            map(try_divide, frame["count_of_title_%s_in_query" % gram],
                frame["count_of_query_%s" % gram]))
        frame["title_%s_in_query_div_query_%s_in_title" % (gram, gram)] = list(
            map(try_divide, frame["count_of_title_%s_in_query" % gram],
                frame["count_of_query_%s_in_title" % gram]))
        frame["description_%s_in_query_div_query_%s" % (gram, gram)] = list(
            map(try_divide, frame["count_of_description_%s_in_query" % gram],
                frame["count_of_query_%s" % gram]))
        frame["description_%s_in_query_div_query_%s_in_description" % (gram, gram)] = list(
            map(try_divide, frame["count_of_description_%s_in_query" % gram],
                frame["count_of_query_%s_in_description" % gram]))

    print('starting with position features')
    position_stats = (
        ("min", np.min),
        ("mean", np.mean),
        ("median", np.median),
        ("max", np.max),
        ("std", np.std),
    )
    for gram in grams:
        for target_name in feat_names:
            for obs_name in feat_names:
                if target_name != obs_name:
                    pos = [
                        get_position_list(target_items, obs=obs_items)
                        for target_items, obs_items in zip(
                            frame[target_name + "_" + gram], frame[obs_name + "_" + gram])
                    ]
                    name_parts = (obs_name, gram, target_name)
                    ## stats feat on pos
                    for stat_name, stat_func in position_stats:
                        frame["pos_of_%s_%s_in_%s_%s" % (name_parts + (stat_name,))] = list(
                            map(stat_func, pos))
                    ## stats feat on normalized_pos
                    for stat_name, _ in position_stats:
                        frame["normalized_pos_of_%s_%s_in_%s_%s" % (name_parts + (stat_name,))] = list(
                            map(try_divide,
                                frame["pos_of_%s_%s_in_%s_%s" % (name_parts + (stat_name,))],
                                frame["count_of_%s_%s" % (obs_name, gram)]))

    return frame
    

In [None]:
# Build the n-gram count / overlap / position features. The function writes the new
# columns into `df` and returns that same object, so `out` and `df` are one DataFrame.
out = extract_count_features()

**********************************************************************
Starting Bigrams


  after removing the cwd from sys.path.


Starting Trigrams
Starting count feature
Intersection features
starting with position features


In [None]:
# Word counts per row for query, title and description.
df[['query_wc','product_title_wc','product_description_wc']] = df.apply(
    total_word_count, axis=1, result_type='expand')

# Number of distinct words shared by each pairing of the three texts.
df[['query_title_cwc','title_desc_cwc','desc_query_cwc','query_title_dec_cwc']] = df.apply(
    commonwords, axis=1, result_type='expand')

In [None]:
# Shared-word ratios for each pairing, expanded into one column per ratio.
df[['query_title_cwc_ratio','title_desc_cwc_ratio','desc_query_cwc_ratio','query_title_dec_cwc_ratio']] = df.apply(
    commonwords_ratio, axis=1, result_type='expand')

In [None]:
# Mean and standard deviation of median_relevance over every row of df, stored as
# constant columns (the same scalar on each row). np.std defaults to ddof=0, i.e.
# the population standard deviation.
# NOTE(review): both values are derived from the target column over the whole frame --
# confirm this happens before any train/test split is relied on.
df['global_average'] = np.mean(df.median_relevance)
df['global_std'] = np.std(df.median_relevance)

In [None]:
# Build the output folder from home_path (the same root the input pickle is read from)
# instead of repeating the absolute Drive path; the resulting string is unchanged.
path = home_path+'data/'
# NOTE: the list-valued n-gram columns are written to CSV as their string representation.
df.to_csv(path+'count_features.csv',index=False)