In [1]:
from transformers import pipeline
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

import sys
import keyboard
import json
import ast
import itertools
import numpy as np
from pandas.io.json import json_normalize
import pickle
import ipyplot

model_id = "rasta/distilbert-base-uncased-finetuned-fashion"
classifier = pipeline("text-classification", model=model_id)

def classify(text):
    preds = classifier(text, return_all_scores=True)
    if preds[0][0]['score']  <= preds[0][1]['score']:
        return "Not Fashion"
    else:
        return "Fashion"
    
def attribute_extraction(txt):
    tokenized = sent_tokenize(txt)

    attributes = []
    for i in tokenized:
        wordsList = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(wordsList)

    for i,w in enumerate(tagged) :
        if w[1] in ['NN','NNS','RB'] :
            ind =i 
            attr = w[0]
            while tagged[ind-1][1] in ['JJ','VBN','NN','RB','VBD','EX']:
                    attr = tagged[ind-1][0] + ' ' +  attr
                    ind = ind - 1
                    
            if len(attr.split())==1 and txt.split()[0].lower()=='will':
                attr = tagged[ind-1][0] + ' ' +  attr
                
            if classify(attr) == 'Fashion':
                attributes.append(attr)
            for a in attributes:
                for b in attributes:
                    if (a!=b) and (a in b):
                        attributes.remove(a)
                
            for a in attributes:
                if 'fit' in a :
                    attributes = list(map(lambda x: x.replace(a, a.replace(' fit','')), attributes))
                if 'match' in a :  
                    attributes = list(map(lambda x: x.replace(a, a.replace(' match','')), attributes))                                       
                
    return attributes        


posts = nltk.corpus.nps_chat.xml_posts()[:10000]

def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains({})'.format(word.lower())] = True
    return features

featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]

# 10% of the total data
size = int(len(featuresets) * 0.1)

# first 10% for test_set to check the accuracy, and rest 90% after the first 10% for training
train_set, test_set = featuresets[size:], featuresets[:size]

# get the classifer from the training set
classifiers = nltk.NaiveBayesClassifier.train(train_set)
# to check the accuracy - 0.67
# print(nltk.classify.accuracy(classifier, test_set))

question_types = ["whQuestion","ynQuestion"]
def is_ques_using_nltk(ques):
    question_type = classifiers.classify(dialogue_act_features(ques)) 
    return question_type in question_types


question_pattern = ["do i", "do you", "what", "who", "is it", "why","would you", "how","is there",
                    "are there", "is it so", "is this true" ,"to know", "is that true", "are we", "am i", 
                   "question is", "tell me more", "can i", "can we", "tell me", "can you explain",
                   "question","answer", "questions", "answers", "ask"]

helping_verbs = ["is","am","can", "are", "do", "does"]
# check with custom pipeline if still this is a question mark it as a question

def is_question(question):
    question = question.lower().strip()
    if not is_ques_using_nltk(question):
        is_ques = False
        # check if any of pattern exist in sentence
        for pattern in question_pattern:
            is_ques  = pattern in question
            if is_ques:
                break

        # there could be multiple sentences so divide the sentence
        sentence_arr = question.split(".")
        for sentence in sentence_arr:
            if len(sentence.strip()):
                # if question ends with ? or start with any helping verb
                # word_tokenize will strip by default
                first_word = nltk.word_tokenize(sentence)[0]
                if sentence.endswith("?") or first_word in helping_verbs:
                    is_ques = True
                    break
        return is_ques    
    else:
        return True
    
from transformers import pipeline
import pandas as pd
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer


model_semantick_id = "PriaPillai/distilbert-base-uncased-finetuned-query"
classifier_sem = pipeline("text-classification", model=model_semantick_id)


ps = PorterStemmer()
verb_pattern = [ps.stem(i) for i in ['match', 'suit', 'fit', 'wear', 'pair']]
# 'be', 'go', 'are'

def semantic_check_hard_coded(txt):
    tokenized = sent_tokenize(txt)
    verbs = []
    
    for i in tokenized:
        wordsList = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(wordsList)

    for i,w in enumerate(tagged) :
        if w[1] in ['VB','VBD','VBN','VBG','VBP','VBZ'] :
            verbs.append(ps.stem(w[0]))
    
    for v in verbs:
        if v in verb_pattern :
            return True
    return False

def semantic_check(text):
    if semantic_check_hard_coded(text):
        return True
    preds = classifier_sem(text, return_all_scores=True)
    if preds[0][0]['score']  <= preds[0][1]['score']:
        return True
    else:
        return False
    
def extraction_pipeline(query):
    if not is_question(query):
        message = "I am not understanding you, please enter a question that is related to fashion"
        return message, []
    elif not semantic_check(query) :
        message = "I am not sure to get your query can you please try again ?"
        return message, []
    else:
        return "Working ...",attribute_extraction(query)

In [2]:
frame = pd.read_csv('image_id.csv')
frame = frame.drop(columns=["Unnamed: 0"])
data = pd.read_csv("data.csv")

def sample(x):
    return data["Attributes"][x]

def extract_from_sample(i):
    dic = eval(sample(i))
    a = [dic[k]['attrs'] for k in dic.keys()]

    occur = [] 
    for i,obj in enumerate(a):
        sent =  ' '.join([d[0] for d in obj] ) + ' ' + list(dic.keys())[i]
        occur.append(sent)
        
    return occur


def extract_image(attr1,attr2,k):
    match = []
    a = 0
    for i,d in enumerate(data['Attributes']):
        l = extract_from_sample(i)
        if (attr1 in l) and (attr2 in l):
            match.append(list(frame[frame['id']==i]['URL'])[0])
            a = a + 1
            if a == k:
                break
    
    if len(match)>=1:
        ipyplot.plot_images(match, max_images=20, img_width=150, show_url=False)
    else :
        print("No image found")
    
    return match

from simcse import SimCSE

model_SIMCSE = SimCSE("princeton-nlp/sup-simcse-roberta-large")

with open('index.pkl', 'rb') as f:
    index = pickle.load(f)
    
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()

model_SIMCSE.index = index
items = index['sentences']

def similar_items(attr):
    similar_items=[]
    results = model_SIMCSE.search(attr,top_k=10,threshold=0.779)
    for i in range(len(results)):
        similar_items.append(results[i][0])
    return similar_items

matrix = pd.read_csv('Final_co-occurence_polyvore_Adel.csv')

def matrix_search_advice(attr,k):
    match = []
    i = 0
    append = True
    
    for a in matrix['bigram'] :
        if attr in a :
            a = tuple(a[1:-1].replace('\'','').split(", "))

            if attr in a[0] :
                wrd = a[1]
            else:
                wrd = a[0]
                
            remove = False
            if (not wrd in match) and (not attr in wrd): 
                        match.append(wrd)
                        i = i + 1
            if i == k:
                break
                
    return match,i

def matrix_search_match(attr,k):
    match = []
    i = 0
    append = True
    
    for a in matrix['bigram'] :
        if attr in a :
            a = tuple(a[1:-1].replace('\'','').split(", "))

            if attr in a[0] :
                wrd = a[1]
            else:
                wrd = a[0]
                
            remove = False
            if (not wrd in match) and (not attr in wrd): 
                    for el in match : 
                        if model_SIMCSE.similarity(el,wrd) > 0.7:
                            remove = True
                    if not remove:
                        match.append(wrd)
                        i = i + 1
            #if i == k+3:
            if i == k:
                break
                
#     for el1 in match:
#         for el2 in match:
#             if el1 != el2 :
#                 if model_SIMCSE.similarity(el1,el2) > 0.7:
#                     match.remove(el2)
    return match,i

def garment_matching(attr,k):           # Returns k best matches to the given attribute
    
    attr = " ".join([lemmatizer.lemmatize(i) for i in attr.split()])
    i= 0
    match = []

    if attr in items:
        if k == 5 :
            match,i = matrix_search_match(attr,k)
        if k == 10 :
            match,i = matrix_search_advice(attr,k)

    else :
        similar = similar_items(attr)
        stop = False
        ind = 0
        while (not stop) and (ind < len(similar)):
            print(len(similar))
            if similar[ind] in items:
                if k == 5:
                    match,i = matrix_search_match(similar[ind],k)
                if k == 10:
                    match,i = matrix_search_advice(similar[ind],k)
                if (i>0):
                    stop = True 
                    attr = similar[ind]
            ind = ind + 1

    if (i==0):
        message = 'This attribute was not found for the garment matching try another attribute!'
        return message,[]
        
    return attr,match

def garment_advice(attr1 , attr2, k=10):
    match = []
    
    i = 0
    attr1, match = garment_matching(attr1,k)
    
    #attr2 = " ".join([lemmatizer.lemmatize(i) for i in attr2.split()])
    
    if match is None :
        return attr1,None, False
    
    if attr2 in match:
        return attr1,attr2,True
    else:
        for el in match:
            if model_SIMCSE.similarity(el,attr2) > 0.9 :
                return attr1,el,True
    
    return None, None, False

def check_image(num, attr1, attr2):
    
    bound = eval(data['boudaries(X,y,Width,Height)'][num])
    if (attr1 in bound.keys()) and (attr2 in bound.keys()):
        x1,y1,x2,y2 = bound[attr1]
        a1,b1,a2,b2 = bound[attr2]
        
        percentage1 = ( ((x2-x1)/6) + ((y2-y1)/6) ) / 2
        percentage2 = ( ((a2-a1)/6) + ((b2-b1)/6) ) / 2
        
        center1 = np.array([ x1 + (x2-x1)/2 , y1 + (y2-y1)/2])
        center2 = np.array([ a1 + (a2-a1)/2 , b1 + (b2-b1)/2])
        
        dist = np.linalg.norm(center1 - center2)
        
#         print(percentage1, percentage2, dist)
#         print(center1, center2)
        
        if percentage1 < 20 or percentage2 < 20:
            return False
        else :
            return True
    else:
        #print("One of the attributes is not found in the image")
        return False
    
def new_extract_image(attr1,attr2, k):
    match = []
    a = 0
    for i,d in enumerate(data['Attributes']):
        l = extract_from_sample(i)
        if (attr1 in l) and (attr2 in l) and check_image(i, attr1, attr2):
            match.append(list(frame[frame['id']==i]['URL'])[0])
            a = a + 1
            if a == k:
                break
    #else :
        #print("No image found")
    
    return match

In [None]:
from ipynb.fs.full import config as keys
#import config as keys
from telegram.ext import *


print("Bot started....")

def start_command(update, context): #Defines what happens when bot is started
    update.message.reply_text('Welcome to fashion advisor \n Please enter your query.')

def end_to_end(update, context):
    
    query = update.message.text
    msg, attr = extraction_pipeline(query)
    
    
    if attr is None:
        update.message.reply_text("An unknown problem occured please contact the support.")
    
    URL = []
    
    if len(attr) == 1 :         # garment matching
        attr0,match = garment_matching(attr[0],5)
        update.message.reply_text(attr[0].capitalize() +' will match with the following attributes: ')
        update.message.reply_text(match)
        update.message.reply_text('\nHere are some images of your item with some good matches:\n')

        for item in match:
            URL = URL + new_extract_image(attr0, item,1)
            URL = list(dict.fromkeys(URL))

        chat_id = update.message.chat_id  

        for i in range(len(URL)):            
            context.bot.sendPhoto(chat_id=chat_id, photo=URL[i])


    elif len(attr) == 2 :         #garment advice
        attr1,attr2,g = garment_advice(attr[0] , attr[1], 10)
        if g == True:
            update.message.reply_text(attr[0].capitalize()+ ' would be a good match with '+ attr[1])
            update.message.reply_text('\nHere are some images of that combo: ')
            URL = new_extract_image(attr1, attr2,5)
            URL = list(dict.fromkeys(URL))
                   
            chat_id = update.message.chat_id  
        
            for i in range(len(URL)):            
                context.bot.sendPhoto(chat_id=chat_id, photo=URL[i])            

                
        elif attr1 is None:
            update.message.reply_text("Those items are not commonly worn together !")
        
        else:
            update.message.reply_text(attr1)
            
            
    elif len(attr) == 0 :
        update.message.reply_text(msg)
        return None
    
    else:
        update.message.reply_text('More than 2 attributes were detected, this version only support 1 attribute for garment matching and 2 for garment advice')

        
        
def help_command(update, context): #To give specific instructions to user
    update.message.reply_text('This bot can give you advice about what to wear with a cloath you give him (garment advice) or can tell you if 2 items are a good fit together (garment matching)  ')
    update.message.reply_text('Here are 2 exemples of the queries: \n  - garment advice: What can I wear with a blue pant ? \n  - garment matching: Can I wear a blue pant with a white shirt ? ')
    
def error(update, context):
    print(f"Update {update} caused error {context.error}")

def main():
    updater = Updater(keys.API_KEY, use_context=True)
    dp = updater.dispatcher

    dp.add_handler(CommandHandler("start", start_command))
    dp.add_handler(CommandHandler("help", help_command))
    dp.add_handler(MessageHandler(Filters.text, end_to_end))
    dp.add_error_handler(error)

    updater.start_polling()
    updater.idle()

main()


06/24/2022 19:02:30 - INFO - apscheduler.scheduler -   Scheduler started


Bot started....


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.65it/s]


2
2


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.88it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.10it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.88it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.05it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.26it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.29it/s]
100%|███████████████████████████████████

2
2


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.08it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.35it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.95it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.85it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.24it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.03it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.03it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.29it/s]
100%|███████████████████████████████████

3


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.57it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.88it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.57it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.00it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.20it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.10it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.50it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.44it/s]
100%|███████████████████████████████████

2
2


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.35it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.65it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.35it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.64it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.65it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.10it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.78it/s]
100%|███████████████████████████████████

2
2


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.22it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.33it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.27it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.99it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.97it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.35it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.95it/s]
100%|███████████████████████████████████