**Members**:
	
  *A.V.N.M.Hemateja*
	
  *S.Abishek Sriram*

**College:**
	
*Vellore Institute of Technology, Chennai*


# Myntra Beauty Product Recommendation System

In [2]:
#connect to Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#Code works without an error when we use tensorflow above 2 version
import tensorflow as tf

In [4]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE

# Loading the dataset
df = pd.read_csv('/content/drive/MyDrive/Myntra_Hackerramp/Beauty_Recommendation/Beauty_Recommendation_Dataset/cosmetics_final.csv')

# if the user picks Moisturizer and user has dry skin
# Filtering for moisturizers
moisturizers = df.query('Label =="Moisturizer"')

# Filtering for dry skin
moisturizers_dry = moisturizers.query('Dry==1')

# Reseting index
moisturizers_dry = moisturizers_dry.reset_index(drop=True)

In [5]:
#This is the Ideal Product which is recommended by experts based on the ingredients required for that particular skin type
#We are adding this to the dataframe so that we can plot this also as a point in the grid to find similar products with these ingredients
moisturizers_dry.loc[len(moisturizers_dry.index)] = ["Required_Ingredients","","",0,5,"Water,Sodium Methyl Cocoyl Taurate,Cocamidopropyl Betaine,Peg-2 Laurate,Glycol Distearate,Propylene Glycol Laurate,Glycerin,Dipropylene Glycol,Polyquaternium-10,Phenoxyethanol",0,1,0,0,0] 

In [6]:
# Initializing dictionary, list, and initial index
ingredient_idx = {}
corpus = []
idx = 0

# Loop to tokenize the ingredients
for i in range(len(moisturizers_dry)):    
    ingredients = moisturizers_dry['Ingredients'][i]
    ingredients_lower = ingredients.lower()
    tokens = ingredients_lower.split(', ')
    corpus.append(tokens)
    for ingredient in tokens:
        if ingredient not in ingredient_idx:
            ingredient_idx[ingredient] = idx
            idx += 1

In [7]:
# Storing the number of items and ingredients in the dataframe
M = moisturizers_dry['Name'].shape[0]
N = len(ingredient_idx)

# Initialize a matrix with zeros for filling them
A = np.zeros((M,N))

In [8]:
# Defining the Encoding function for converting all ingredients to numbers associated using ingredient_idx dictionary
def Encode(tokens):
    x = np.zeros(N)
    for ingredient in tokens:
        # Get the index for each ingredient
        idx = ingredient_idx[ingredient]
        # Put 1 at the corresponding indices
        x[idx] =1 
    return x

In [9]:
# Making the Document Term Matrix (DTM)
i = 0
for tokens in corpus:
    A[i, :] = Encode(tokens)
    i+=1

In [10]:
# Reducing the dimension using TSNE model(T-distributed Stochastic Neighbor Embedding)
# initializing the model with required arguments and also providing metric for distance(euclidean) - sqrt((x2-x1)^2 + (y2-y1)^2)
model = TSNE(n_components= 2, learning_rate=200, random_state=42,metric='euclidean')
#transforming the A matrix to 2D features
tsne_features = model.fit_transform(A)

# Creating X and Y columns to store the X-axis and Y-axis values for getting similar (nearest) points 
moisturizers_dry['X'] = tsne_features[:,0]
moisturizers_dry['Y'] = tsne_features[:,1]

In [12]:
#Defining function for sorting list consisting of tuple (like (var1,var2)) according second variable
def sortSecond(val):
    return val[1]

In [13]:
#importing math library for finding euclidean distance
import math
#Getting the X and Y values of reference product recommeded by expert (Required_Ingredients)
Reference_X_Y = moisturizers_dry[moisturizers_dry['Label'] == "Required_Ingredients"]
Reference_X=Reference_X_Y.X.values[0]
Reference_Y=Reference_X_Y.Y.values[0]
#initializing list called Sorted to store the products in order to their similarity (most similar first)
Sorted = []
#Calculating the euclidean distance
for index,row in moisturizers_dry.iterrows():
  print("X: "+str(row['X'])+" Y: "+str(row['Y']))
  Distance = math.sqrt(math.pow((row['X']-Reference_X),2)+pow((row['Y']-Reference_Y),2))
  print(Distance)
  Sorted.append((row,Distance))

#sorting using defined sortSecond function
Sorted.sort(key = sortSecond)


X: -125.35101318359375 Y: -7.813470363616943
123.61286979812894
X: 10.184649467468262 Y: -7.930853366851807
13.737447358624433
X: -16.75743865966797 Y: 87.28614044189453
89.92721504212393
X: 130.52320861816406 Y: 21.786396026611328
134.44306036095094
X: 96.94706726074219 Y: -49.529319763183594
109.94296199126016
X: 2.619774103164673 Y: 70.5044174194336
72.05255061050569
X: 26.26410484313965 Y: 37.246620178222656
47.82763194451433
X: -11.77316951751709 Y: -22.065658569335938
22.895790474271127
X: -38.95707702636719 Y: -5.123808860778809
37.23881036881226
X: -84.35816192626953 Y: -71.31967163085938
108.10440545149808
X: 1.5356570482254028 Y: 79.21450805664062
80.69382611984528
X: -8.346846580505371 Y: 35.45491027832031
37.41964703443794
X: -45.023887634277344 Y: -45.19704818725586
61.45694556943447
X: 71.14991760253906 Y: 16.310516357421875
75.17177397972328
X: -122.00334167480469 Y: -2.677401065826416
120.10574896919425
X: 67.94290924072266 Y: -26.570524215698242
74.24211079424586
X: -2

In [14]:
Sorted

[(Label                                       Required_Ingredients
  Brand                                                           
  Name                                                            
  Price                                                          0
  Rank                                                           5
  Ingredients    Water,Sodium Methyl Cocoyl Taurate,Cocamidopro...
  Combination                                                    0
  Dry                                                            1
  Normal                                                         0
  Oily                                                           0
  Sensitive                                                      0
  X                                                       -1.90432
  Y                                                       -1.40596
  Name: 166, dtype: object, 0.0),
 (Label                                                Moisturizer
  Brand                     

In [15]:
#installing transformers for sentiment analysis using BERT
!pip install transformers
!pip install tensorflow
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

#initializing the base pretrained classifier
SentimentClassifier = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
#initializing the base pretrained tokenizer 
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
SentimentClassifier.summary()


Collecting transformers
  Downloading transformers-4.12.3-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 8.8 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 46.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.1-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 5.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 42.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 52.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [16]:
#these are the reviews just for testing the classification (this will not be present in the final myntra code)
pred_sentences = ['This was an awesome product. I bought it twice my time using this beautiful product if I have known it was this good','One of the worst product of all time, i am unable to understand why i bought this',"The color looks amazing on all indian skin tones. Me & my mom - we both use the same shade. I'm fair and she is wheatish yet this color suits both of us. Specially when paired with blue & yellow it just rocks. I assure you will stand out of the croud after using this. Iv been in love with the shade for 3 years now ","Wow I loved this shade and like the brand also.... And my first expeiance with myntra was fabulous good packaging and in time delivery.. Thank u so much myntra","In love with the colour. It's a pretty fuchsia pink that would suit almost all skin tones. As the name suggests, it is creamy matte in formula. Happy with the purchase.","I loved the color of lipstick it's super brighter and shiner too...Without any doubt bought it." ,"I loved so much colour,texture & creamy matte formula of lipstick...", "Little damaged from top side","Very nice colour..go for it","It's like red and pink mix Russia I love it","Wow so good nyc" ,"The lipstick is broken ...color was good but bar was broken...I want to change it.","Not good","I ordered this lipstick to gift someone. I opened it just yesterday..that was so in bad condition..it was broken..bad experience", "So worst quality . I didnt expect this cheap quality frm Maybelline like big brand. Its just like local market 20 rs. type lipstick. Literally soooooo HORRIBLE","Old product. Expiry is in next year Apr. 50% shelf life is gone."]


In [17]:
#randomly shuffling the reviews
import random
random.shuffle(pred_sentences)

In [18]:
#taking the sample to test the system from the Sorted list (from TSNE similarity)
sample = Sorted[1:9]

In [19]:
len(sample)
i=0
count=0
#Assigning 2 reviews for each product
while(i+1<(8*2)):
  reviews={"Reviews": [pred_sentences[i],pred_sentences[i+1]]}
  temp=list(sample[count])
  temp.append(reviews)
  sample[count] = temp
  count+=1
  i+=2

In [20]:
#Loading the weights for Sentiment Analysis that we trained and saved
SentimentClassifier.load_weights("/content/drive/MyDrive/Myntra_Hackerramp/Beauty_Recommendation/Sentiment_Analysis_Weights/SentimentAnalysis.hdf5")

In [21]:
import tensorflow as tf
# initializing ScoreGenerated for storing the products in sorted order (High score first)
# Formula:  CriticalScore = ConfidenceScore*10000 + ((Rating*1000)-(Similarity*1000)) 
# Here: ConfidenceScore - Probability of SentimentClassifier Whether it is Positive or Negative
# Here: Rating - Overall Rating of product in the website (between 0 and 5)
# Here: Similarity - Distance between this product and reference product  
ScoreGenerated = []
for k in range(0,len(sample)):
  #Criticals=[]
  CriticalScore=0
  RatingandSimilarity=sample[0][0][4]*1000-sample[k][1]*1000
  tf_batch = tokenizer(sample[k][2]['Reviews'], max_length=128, padding=True, truncation=True, return_tensors='tf')
  tf_outputs = SentimentClassifier(tf_batch)
  tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
  labels = ['Negative','Positive']
  label = tf.argmax(tf_predictions, axis=1)
  label = label.numpy()
  for i in range(len(sample[k][2]['Reviews'])):
    if(labels[label[i]] == "Positive"):
      print("Points: "+str(int(tf_predictions[i][label[i]] * 10000)))
      CriticalScore+=int(tf_predictions[i][label[i]] * 10000)
      print(sample[k][2]['Reviews'][i], ": \n", labels[label[i]],"\n Confidence : ",tf_predictions[i][label[i]])
    else:
      print("Points: "+str((-1)*int(tf_predictions[i][label[i]]*10000)))
      CriticalScore+=(-1)*int(tf_predictions[i][label[i]]*10000)
      print(sample[k][2]['Reviews'][i], ": \n", labels[label[i]],"\n Confidence : ",tf_predictions[i][label[i]])
  ScoreGenerated.append((CriticalScore+RatingandSimilarity, sample[k]))


Points: 6330
Little damaged from top side : 
 Positive 
 Confidence :  tf.Tensor(0.63301265, shape=(), dtype=float32)
Points: 5904
Old product. Expiry is in next year Apr. 50% shelf life is gone. : 
 Positive 
 Confidence :  tf.Tensor(0.59043205, shape=(), dtype=float32)
Points: 9985
I loved the color of lipstick it's super brighter and shiner too...Without any doubt bought it. : 
 Positive 
 Confidence :  tf.Tensor(0.99858034, shape=(), dtype=float32)
Points: -9945
I ordered this lipstick to gift someone. I opened it just yesterday..that was so in bad condition..it was broken..bad experience : 
 Negative 
 Confidence :  tf.Tensor(0.9945831, shape=(), dtype=float32)
Points: 9992
This was an awesome product. I bought it twice my time using this beautiful product if I have known it was this good : 
 Positive 
 Confidence :  tf.Tensor(0.9992362, shape=(), dtype=float32)
Points: 9984
In love with the colour. It's a pretty fuchsia pink that would suit almost all skin tones. As the name sugg

In [22]:
# Defining sorting function for sorting the list using first variable
def sortwithFirst(val):
    return val[0] 
#Sorting using the defined function
ScoreGenerated.sort(key=sortwithFirst,reverse=True)

In [None]:
ScoreGenerated