In [132]:
import csv
import os

import nltk
import numpy as np
import pandas as pd
from pymystem3 import Mystem

In [133]:
#Raw term files of the food intents (intent name -> CSV path)
raw_food_files = {
    'buy_food': "cafe.csv",
    'order_food': "food_order.csv",
    'buy_or_order_goods': "goods.csv",
}

#Raw term files of the sport intents (intent name -> CSV path)
raw_sport_files = {
    'buy_sportswear': "sportswear_brands.csv",
    'buy_equipment': "sports_equipment.csv",
    'buy_sport_food': "sports_nutrition.csv",
    'rent_equipment': "sports_equipment_rental.csv",
    'get_train': "services_of_instructors.csv",
    'get_service': "services_of_clubs.csv",
}

#Create dictionary with intents related to food (files are read in the order listed above)
food_dicts = {intent: pd.read_csv(path) for intent, path in raw_food_files.items()}

#Create dictionary with intents related to sport
sport_dicts = {intent: pd.read_csv(path) for intent, path in raw_sport_files.items()}

#Expose every loaded frame under its own name as well
df_buy_food = food_dicts['buy_food']
df_order_food = food_dicts['order_food']
df_buy_or_order_goods_dict = food_dicts['buy_or_order_goods']
df_buy_sportswear = sport_dicts['buy_sportswear']
df_buy_equipment = sport_dicts['buy_equipment']
df_buy_sport_food = sport_dicts['buy_sport_food']
df_rent_equipment = sport_dicts['rent_equipment']
df_get_train = sport_dicts['get_train']
df_get_service = sport_dicts['get_service']

In [134]:
#Function for data processing
def processing_intents_dicts(dictionary, output_dir="new"):
    """Lemmatize, lower-case and de-duplicate the terms of every intent dictionary.

    For each intent in ``dictionary`` the values of the ``NAME`` column are
    lemmatized with Mystem, lower-cased and de-duplicated (first occurrence
    wins, original order is kept). The result is written as a single ``NAME``
    column to ``<output_dir>/<intent>_new.csv``.

    Parameters
    ----------
    dictionary : dict
        Maps an intent name to a DataFrame that has a ``NAME`` column.
    output_dir : str, optional
        Directory that receives the processed CSV files; it is created when
        it does not exist yet. Defaults to ``"new"``.

    Returns
    -------
    None
        The function is used for its side effect of writing the CSV files.
    """
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    # One lemmatizer is enough for all dictionaries; there is no need to
    # build a new Mystem object for every intent.
    lemmatizer = Mystem()

    for intent, current_df in dictionary.items():
        # Empty CSV cells are read as NaN floats, which cannot be lemmatized.
        terms = current_df['NAME'].dropna().astype(str)

        # The last token returned by lemmatize() is dropped (pymystem3 ends
        # its output with a newline token); the rest is glued back together.
        lemmatized = ["".join(lemmatizer.lemmatize(term)[:-1]) for term in terms]

        # Lower-case and keep the first occurrence of every term, in order.
        unique_terms = list(dict.fromkeys(lemma.lower() for lemma in lemmatized))

        processed = pd.DataFrame({'NAME': unique_terms})
        processed.to_csv(os.path.join(output_dir, intent + "_new.csv"), index=False)

In [135]:
#Run the helper over both groups of raw dictionaries (food first, then sport)
for raw_dicts in (food_dicts, sport_dicts):
    processing_intents_dicts(raw_dicts)

In [136]:
#Processed term files of the food intents (intent name -> CSV path)
processed_food_files = {
    'buy_food': "new/buy_food_new.csv",
    'order_food': "new/order_food_new.csv",
    'buy_or_order_goods': "new/buy_or_order_goods_new.csv",
}

#Processed term files of the sport intents (intent name -> CSV path)
processed_sport_files = {
    'buy_sportswear': "new/buy_sportswear_new.csv",
    'buy_equipment': "new/buy_equipment_new.csv",
    'buy_sport_food': "new/buy_sport_food_new.csv",
    'rent_equipment': "new/rent_equipment_new.csv",
    'get_train': "new/get_train_new.csv",
    'get_service': "new/get_service_new.csv",
}

#Create final food and sport dictionaries (files are read in the order listed above)
food_d = {intent: pd.read_csv(path) for intent, path in processed_food_files.items()}
sport_d = {intent: pd.read_csv(path) for intent, path in processed_sport_files.items()}

#Rebind the per-intent names to the processed frames
df_buy_food = food_d['buy_food']
df_order_food = food_d['order_food']
df_buy_or_order_goods_dict = food_d['buy_or_order_goods']
df_buy_sportswear = sport_d['buy_sportswear']
df_buy_equipment = sport_d['buy_equipment']
df_buy_sport_food = sport_d['buy_sport_food']
df_rent_equipment = sport_d['rent_equipment']
df_get_train = sport_d['get_train']
df_get_service = sport_d['get_service']

In [137]:
#Function for preclassification process using prepared dictionaries
def intent_preclassification(query):
    """Guess the intent(s) of a query by counting dictionary hits.

    The query is split on whitespace, every word is lemmatized with Mystem
    and lower-cased, and bigrams/trigrams are built from the lemmas. Each
    word, bigram or trigram that occurs in the ``NAME`` column of an intent's
    frame (frames come from the module-level ``sport_d`` and ``food_d``) adds
    one point to that intent.

    Parameters
    ----------
    query : str
        Free-text user query. Anything that is not a string (for example the
        NaN that pandas produces for an empty CSV cell) is reported as
        ``['not_found']``.

    Returns
    -------
    list of str
        All intents that share the highest non-zero score, in the order of
        the score table defined below; ``['not_found']`` when nothing matched.
    """
    if not isinstance(query, str):
        return ['not_found']

    # Building a Mystem object on every call is wasteful when many queries
    # are classified, so the first instance is kept on the function and reused.
    lemmatizer = getattr(intent_preclassification, '_lemmatizer', None)
    if lemmatizer is None:
        lemmatizer = Mystem()
        intent_preclassification._lemmatizer = lemmatizer

    # Create bag of words in query.
    # split() without an argument ignores repeated, leading and trailing
    # whitespace, so no empty tokens end up between neighbouring words.
    lem_words = []
    for word in query.split():
        tokens = lemmatizer.lemmatize(word)
        # The dictionary terms were lower-cased when the "new/*.csv" files
        # were written, so the lemmas are lower-cased to compare like with like.
        lem_words.append(tokens[0].lower() if tokens else "")

    # Words, bigrams and trigrams are all scored the same way, so collect
    # them in one list of candidate terms.
    candidates = list(lem_words)
    candidates.extend(" ".join(pair) for pair in nltk.bigrams(lem_words))
    candidates.extend(" ".join(triple) for triple in nltk.trigrams(lem_words))

    # Insertion order of this table defines the order of the returned intents.
    probs = {'buy_sportswear': 0, 'buy_equipment': 0, 'buy_sport_food': 0,
             'rent_equipment': 0, 'get_train': 0, 'get_service': 0, 'buy_food': 0,
             'order_food': 0, 'buy_or_order_goods': 0}

    for dictionary in (sport_d, food_d):
        for intent, frame in dictionary.items():
            # A set is built once per intent instead of scanning an array
            # again for every candidate term.
            terms = set(frame['NAME'])
            hits = sum(1 for candidate in candidates if candidate in terms)
            probs[intent] = probs.get(intent, 0) + hits

    max_val = max(probs.values())
    if max_val == 0:
        return ['not_found']
    return [intent for intent, score in probs.items() if score == max_val]

In [144]:
#Get the list of intents, which parse well using preclassifier and save them to file
queries = pd.read_csv("queries_final.csv")
queries_arr = queries.QUERY.values
results_array = []
queries_array = []
bad_results_array = []
bad_queries_array = []
for query in queries_arr:
    # Classify every query exactly once and reuse the answer below.
    intents = intent_preclassification(query)
    if intents[0] != 'not_found' and len(intents) <= 2:
        # A single winner or a two-way tie counts as a good parse; the first
        # intent is stored as a plain string in both cases so that the
        # INTENT column has one consistent format.
        queries_array.append(query)
        results_array.append(intents[0])
    else:
        # Nothing matched, or three and more intents tied: keep the whole
        # list of candidates for later inspection.
        bad_queries_array.append(query)
        bad_results_array.append(intents)

#Well-parsed queries with their intent
series1 = pd.Series(results_array)
series2 = pd.Series(queries_array)
df_new = pd.DataFrame({'QUERY': series2, 'INTENT': series1})
df_new.to_csv("intents_by_preclassifier.csv", index=False)

#Queries that could not be resolved; QUERY holds the query text and INTENT
#holds the list of candidate intents
series3 = pd.Series(bad_results_array)
series4 = pd.Series(bad_queries_array)
df_new = pd.DataFrame({'QUERY': series4, 'INTENT': series3})
df_new.to_csv("bad_intents.csv", index=False)