In [1]:
from tqdm import tqdm
import pandas as pd
import os
import pickle
from time import sleep
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point these paths at files produced by this project.

# Query lookup; indexed by QueryId further down in the notebook.
with open('query_dict/id_querry_spelled.pickle', 'rb') as query_file:
    id_querry_clean = pickle.load(query_file)

# Title lookup; indexed by DocumentId further down in the notebook.
with open('title_dict/title_data.pickle', 'rb') as title_file:
    title_data = pickle.load(title_file)

In [3]:
# Training pairs: tab-separated file without a header row. Columns 0 and 1 are
# named QueryId / DocumentId; column 2 is dropped (presumably the relevance
# mark, which this notebook does not use -- confirm against the data spec).
df_marks = (
    pd.read_csv('train.marks.tsv', delimiter='\t', header=None)
    .rename(columns={0: "QueryId", 1: "DocumentId"})
    .drop(columns=[2])
)
df_example = pd.read_csv('sample.csv')

# Stack both sets of pairs and order them by query, then document.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; reset_index(drop=True) avoids creating and then
# dropping a temporary 'index' column.
all_groups = (
    pd.concat([df_marks, df_example])
    .sort_values(by=['QueryId', 'DocumentId'])
    .reset_index(drop=True)
)

# Same pairs ordered by document first, then query.
rev_frame = (
    all_groups
    .sort_values(by=['DocumentId', 'QueryId'])
    .reset_index(drop=True)
)

In [4]:
# Display the combined pairs, sorted by QueryId and then DocumentId.
all_groups

Unnamed: 0,QueryId,DocumentId
0,0,1443
1,0,5912
2,0,5963
3,0,6096
4,0,6230
...,...,...
606045,6310,497350
606046,6310,512263
606047,6310,529857
606048,6310,551291


In [5]:
# Feature store: one (initially empty) list per vectorizer configuration.
# The tf-idf weighted variants come first, followed by the plain-tf variants;
# every key gets its own list object.
title_classic_feats = {
    feat_name: []
    for feat_name in (
        'un_title_loc_tfidf_vectorizer_1',
        'un_title_loc_tfidf_vectorizer_2',
        'un_title_loc_tfidf_vectorizer_3',
        'un_title_loc_tfidf_vectorizer_234',
        'un_title_loc_tfidf_vectorizer_12345',
        'un_title_loc_tfidf_vectorizer_ch_1_7',
        'un_title_loc_tfidf_vectorizer_ch_8_15',
        'un_title_loc_tf_vectorizer_1',
        'un_title_loc_tf_vectorizer_2',
        'un_title_loc_tf_vectorizer_3',
        'un_title_loc_tf_vectorizer_234',
        'un_title_loc_tf_vectorizer_12345',
        'un_title_loc_tf_vectorizer_ch_1_7',
        'un_title_loc_tf_vectorizer_ch_8_15',
    )
}

In [6]:
def calc_cos_sim(vect, corp, query):
    """Score every document in ``corp`` against ``query`` by cosine similarity.

    ``vect`` is fitted on ``corp`` (via ``fit_transform``) and the fitted
    vectorizer is then used to transform the single ``query`` string.

    Parameters
    ----------
    vect : object exposing ``fit_transform`` and ``transform``
        (the notebook passes TfidfVectorizer instances).
    corp : iterable of documents to fit on and score.
    query : the query to compare against each document.

    Returns
    -------
    list
        Flattened similarity values, one per document in ``corp``, in order.
    """
    doc_matrix = vect.fit_transform(corp)
    query_matrix = vect.transform([query])
    similarities = cosine_similarity(doc_matrix, query_matrix)
    return list(similarities.flatten())

In [7]:
# N-gram settings shared by both weighting schemes: (feature-name tag, kwargs).
NGRAM_VARIANTS = (
    ('1', {'ngram_range': (1, 1)}),
    ('2', {'ngram_range': (2, 2)}),
    ('3', {'ngram_range': (3, 3)}),
    ('234', {'ngram_range': (2, 4)}),
    ('12345', {'ngram_range': (1, 5)}),
    ('ch_1_7', {'ngram_range': (1, 7), 'analyzer': 'char_wb'}),
    ('ch_8_15', {'ngram_range': (8, 15), 'analyzer': 'char_wb'}),
)
# Weighting schemes: (feature-name tag, value passed as TfidfVectorizer(use_idf=...)).
WEIGHTING_VARIANTS = (('tfidf', True), ('tf', False))

# One (feature key, TfidfVectorizer kwargs) pair per weighting x n-gram combination.
feature_specs = [
    (
        'un_title_loc_{}_vectorizer_{}'.format(weight_tag, ngram_tag),
        dict(ngram_kwargs, use_idf=use_idf),
    )
    for weight_tag, use_idf in WEIGHTING_VARIANTS
    for ngram_tag, ngram_kwargs in NGRAM_VARIANTS
]

# Start from empty lists so that re-running this cell does not append a
# second copy of every feature to the lists filled by a previous run.
for feat_name, _ in feature_specs:
    title_classic_feats[feat_name] = []

q_ids = all_groups['QueryId'].unique()
# Group once instead of building a boolean mask over the whole frame for every
# query. sort=False keeps first-appearance order, and rows inside a group keep
# their order from all_groups, so values are emitted in all_groups row order.
docs_by_query = all_groups.groupby('QueryId', sort=False)['DocumentId']

for q1 in tqdm(q_ids):
    cur_q = id_querry_clean[q1]
    cur_t = [title_data[doc1] for doc1 in docs_by_query.get_group(q1).values]
    # A fresh vectorizer is fitted on this query's titles only, so the
    # vocabulary (and idf weights, where enabled) are local to the query.
    for feat_name, vec_kwargs in feature_specs:
        title_classic_feats[feat_name].extend(
            calc_cos_sim(TfidfVectorizer(**vec_kwargs), cur_t, cur_q)
        )

# Every feature list must line up one-to-one with the rows of all_groups.
assert all(
    len(title_classic_feats[feat_name]) == len(all_groups)
    for feat_name, _ in feature_specs
), "feature lists are not aligned with all_groups"

100%|██████████| 6311/6311 [14:07<00:00,  7.45it/s]


In [8]:
# Persist the dict of feature lists for later use.
with open('un_basic_tf_idf_title_feats.pickle', 'wb') as out_file:
    pickle.dump(title_classic_feats, out_file)