In [None]:
import spacy
import numpy as np
import pandas as pd

In [None]:
# Download the spaCy English pipeline that is loaded below via spacy.load('en_core_web_lg').
# The download of the smaller en_core_web_sm package is left commented out.
# !python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg

In [None]:
# Load the pre-trained English pipeline (en_core_web_lg); the entities it
# recognises are read from `doc.ents` in the cells below.
nlp = spacy.load('en_core_web_lg')

In [None]:
nlp

<spacy.lang.en.English at 0x7923629fba60>

# Example of named entity recognition

In [None]:
doc = nlp("Donad Trump was President of USA")

In [None]:
doc

Donad Trump was President of USA

In [None]:
type(doc)

spacy.tokens.doc.Doc

In [None]:
doc.ents

(Donad Trump, USA)

In [None]:
doc.ents[0], type(doc.ents[0])

(Donad Trump, spacy.tokens.span.Span)

In [None]:
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

### Loading our own data

In [None]:
df = pd.read_csv('/content/final_df_export.csv')

In [None]:
df = df[['combined_review_corrected']]

In [None]:
df.head()

Unnamed: 0,combined_review_corrected
0,effective never thought something could work u...
1,love malar real sense blends skin make skin su...
2,amazing packs wow using almost days love nut h...
3,heavenly smell smells amazing moisturizing gre...
4,spa like feel lovely face pack quite versatile...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1059 entries, 0 to 1058
Data columns (total 1 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   combined_review_corrected  1050 non-null   object
dtypes: object(1)
memory usage: 8.4+ KB


In [None]:
df.isna().sum()

Unnamed: 0,0
combined_review_corrected,9


In [None]:
# Drop rows whose review text is missing. Reassigning instead of using
# inplace=True avoids mutating a column slice of the loaded frame in place
# (a common trigger for SettingWithCopyWarning) and yields the same rows.
df = df.dropna()

In [None]:
df.isna().sum()

Unnamed: 0,0
combined_review_corrected,0


In [None]:
df.shape

(1050, 1)

In [None]:
# Remove duplicate reviews and renumber the remaining rows 0..n-1 in one call
# (ignore_index=True is equivalent to a following reset_index(drop=True)).
df = df.drop_duplicates(ignore_index=True)

In [None]:
df.shape

(1021, 1)

### Named entity recognition - spaCy built-in model


In [None]:
df['combined_review_corrected'][0]

'effective never thought something could work underarms  product wonders happy results within two weeks'

In [None]:
doc = nlp(df['combined_review_corrected'][678])

In [None]:
doc.ents

(months,)

In [None]:
# Render the entities found in the currently selected review inline in the notebook.
# Note: displacy was already imported in an earlier cell, so this import is
# redundant (but harmless).
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

In [None]:
# def visualize_entities(df, column):
#     for i, review in enumerate(df[column]):
#         doc = nlp(review)
#         print(f"Review {i+1}:")
#         displacy.render(doc, style="ent", jupyter=True)

In [None]:
# # Apply the visualization to the 'combined_review_corrected' column
# visualize_entities(df, 'combined_review_corrected')

### Custom NER annotations

In [None]:
product_category_df = pd.read_csv('/content/prduct_category.csv')

In [None]:
product_category = product_category_df['product_category'].tolist()

In [None]:
len(product_category)

60

In [None]:
# Hand-picked product terms appended to the categories loaded from the CSV.
# "soap" appears twice here; duplicates are removed in a later cell.
product_category.extend([
    "malar", "cream", "shampoo", "soap", "mask", "lepa",
    "malai", "lip butter", "mura", "ubtan", "candle", "oil",
    "multani", "gel", "tikta", "maahu", "conditioner", "salt",
    "balm", "soap", "jal", "datun", "gulal",
])

In [None]:
len(product_category)

83

In [None]:
# Remove duplicate terms while keeping first-seen order. list(set(...)) orders
# strings differently from one kernel session to the next, which made the
# vocabulary order (and anything derived from it) non-reproducible.
product_category = list(dict.fromkeys(product_category))

In [None]:
len(product_category)

82

In [None]:
product_category

In [None]:
# Terms that annotate_entities tags with the BODY_PART label.
body_parts = [
    "face",
    "hair",
    "scalp",
    "skin",
    "eyes",
    "nose",
    "lips",
    "ears",
    "hands",
    "feet",
    "men",
    "underarms",
    "legs",
    "neck",
]

In [None]:
# Brand-name spellings that annotate_entities tags with the ORGANISATION label.
organisation = [
    "nut habit",
    "nat habit",
    "not habit",
]

In [None]:
# Terms that annotate_entities tags with the SEASON label.
# NOTE(review): "autum" looks like a misspelling of "autumn" - confirm whether it is intentional.
seasons = [
    "winter",
    "summer",
    "monsoon",
    "autum",
    "spring",
]

In [None]:
# Terms that annotate_entities tags with the FEATURES label.
features = [
    "soft",
    "smooth",
    "refreshing",
    "smells good",
    "fragrance",
    "dandruff",
    "fall",
    "loss",
    "worth",
    "glow",
    "dryness",
    "spammer",
    "broken",
    "harasser",
    "wonderfull",
    "qaulity",
    "quantity",
]

In [None]:
# Define a function to automatically tag entities in each review
def annotate_entities(text, label_terms=None):
    """Tag known vocabulary terms in ``text`` as character-offset entity spans.

    Each term is matched case-insensitively as a whole word or phrase, so
    "men" is not tagged inside "recommend" and "oil" is not tagged inside
    "spoil". Every occurrence is reported, not only the first one. When two
    matches overlap, the longer one wins (ties go to the label listed first),
    so the returned spans never overlap. Partial terms therefore no longer
    match inside longer words (e.g. "autum" does not match "autumn").

    Args:
        text: The review text to annotate.
        label_terms: Optional mapping, or iterable of ``(label, terms)``
            pairs, giving the terms to search for under each label. When
            omitted, the notebook-level vocabularies are used: ``features``
            (FEATURES), ``seasons`` (SEASON), ``organisation``
            (ORGANISATION), ``product_category`` (PRODUCT) and
            ``body_parts`` (BODY_PART).

    Returns:
        A ``(text, {"entities": [(start, end, label), ...]})`` tuple. Offsets
        index into the unchanged ``text`` and the spans are sorted by start.
    """
    import re  # local import so the function does not depend on another cell

    if label_terms is None:
        label_terms = (
            ("FEATURES", features),
            ("SEASON", seasons),
            ("ORGANISATION", organisation),
            ("PRODUCT", product_category),
            ("BODY_PART", body_parts),
        )
    elif hasattr(label_terms, "items"):
        label_terms = label_terms.items()

    # Collect every whole-word match as (start, end, label, label_rank).
    candidates = []
    for rank, (label, terms) in enumerate(label_terms):
        for term in terms:
            # Skip blanks / non-string placeholders rather than emitting empty spans.
            if not isinstance(term, str) or not term.strip():
                continue
            # Allow any run of whitespace between the words of a multi-word term
            # and require non-word characters (or the text edge) on both sides.
            phrase = r"\s+".join(re.escape(word) for word in term.split())
            pattern = r"(?<!\w)" + phrase + r"(?!\w)"
            # Search the original text (not a lowercased copy) so the offsets
            # are always valid for the string that is returned.
            for match in re.finditer(pattern, text, flags=re.IGNORECASE):
                candidates.append((match.start(), match.end(), label, rank))

    # Resolve overlaps: longest span first, then label priority, then position.
    # Overlapping or duplicated spans are not usable as NER training annotations.
    candidates.sort(key=lambda c: (c[0] - c[1], c[3], c[0]))
    entities = []
    for start, end, label, _rank in candidates:
        if all(end <= kept_start or start >= kept_end
               for kept_start, kept_end, _label in entities):
            entities.append((start, end, label))
    entities.sort()

    return (text, {"entities": entities})

In [None]:
# Apply the annotation function to each review in the DataFrame.
# `df` is rebound to the resulting Series of (text, annotations) tuples, so the
# isinstance guard makes re-running this cell a no-op instead of a KeyError.
if isinstance(df, pd.DataFrame):
    df = df['combined_review_corrected'].apply(annotate_entities)

In [None]:
# Materialise the annotated reviews as a plain Python list for training
train_data = df.tolist()

In [None]:
# Preview only the first few annotated reviews; printing the whole list
# floods the notebook output.
for item in train_data[:10]:
    print(item)

In [None]:
train_data[375]

('awesome  really loved comb dangles pretty well smells good using comb never going back plastic ones ends dry anymore using comb best one ',
 {'entities': [(47, 58, 'FEATURES'), (22, 26, 'PRODUCT')]})