In [1]:
import pandas as pd

# Load the review dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="Dataset-SA.csv")
# A bare trailing expression makes the notebook render the frame as the
# cell's output.
df

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,positive
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,negative
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,neutral
...,...,...,...,...,...,...
205047,cello Pack of 18 Opalware Cello Dazzle Lush Fi...,1299,5,must buy!,good product,positive
205048,cello Pack of 18 Opalware Cello Dazzle Lush Fi...,1299,5,super!,nice,positive
205049,cello Pack of 18 Opalware Cello Dazzle Lush Fi...,1299,3,nice,very nice and fast delivery,positive
205050,cello Pack of 18 Opalware Cello Dazzle Lush Fi...,1299,5,just wow!,awesome product,positive


In [2]:
import pandas as pd
import re
import string
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import Levenshtein as lev
import spacy

# Fetch the NLTK corpora used below: the stopword list plus the WordNet data
# (and its multilingual companion) needed by the lemmatizer.
import nltk
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# spaCy pipeline used by the semantic similarity metric.
nlp = spacy.load("en_core_web_md")

# Shared text-normalisation helpers, built once and reused for every row.
stop_words = set(stopwords.words('english'))
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

# Text preprocessing function
def preprocess_text(text, for_semantic=False):
    """Normalise a raw text field into a space-joined string of tokens.

    Steps, in order: strip HTML-like tags, replace punctuation with spaces,
    lowercase, split on whitespace, drop English stopwords, lemmatize and
    (unless ``for_semantic`` is set) stem each token.

    Relies on the module-level ``stop_words``, ``lemmatizer`` and ``stemmer``.

    Args:
        text: Value to normalise. Anything that is not a ``str`` (e.g. a NaN
            from a missing CSV cell) yields an empty string.
        for_semantic: When True, tokens are only lemmatized so they remain
            real words; when False, they are additionally stemmed.

    Returns:
        The processed tokens joined by single spaces ("" if nothing is left).
    """
    if not isinstance(text, str):
        return ""

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Replace punctuation with spaces instead of deleting it: deleting fuses
    # neighbouring words ("Room/Personal" -> "roompersonal"), which produces
    # tokens that can never match the other text.
    punctuation_to_space = str.maketrans(
        string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(punctuation_to_space)
    # Convert to lowercase (the stopword check below is case-sensitive)
    text = text.lower()
    # Tokenize on whitespace and remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]

    # Lemmatization is applied in both modes.
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    if not for_semantic:
        # Stemming is only added for the non-semantic (token/character based)
        # similarities.
        tokens = [stemmer.stem(word) for word in tokens]

    return " ".join(tokens)

# Build two normalised variants of each text column: a stemmed one for the
# token/character based metrics and a lemma-only one for the semantic metric.
raw_names = df['product_name']
raw_summaries = df['Summary']
df['product_name_processed'] = raw_names.apply(preprocess_text)
df['summary_processed'] = raw_summaries.apply(preprocess_text)
# Extra keyword arguments given to Series.apply are forwarded to the function.
df['product_name_semantic'] = raw_names.apply(preprocess_text, for_semantic=True)
df['summary_semantic'] = raw_summaries.apply(preprocess_text, for_semantic=True)



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/skillissue/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/skillissue/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/skillissue/nltk_data...


In [3]:
def jaccard_similarity(str1, str2):
    """Jaccard index of the whitespace-separated token sets of two strings.

    Returns the size of the shared token set divided by the size of the
    combined token set, or 0 when neither string contains any token.
    """
    tokens_a, tokens_b = set(str1.split()), set(str2.split())
    combined = tokens_a.union(tokens_b)
    if not combined:
        # Both inputs are empty: avoid dividing by zero.
        return 0
    shared = tokens_a.intersection(tokens_b)
    return len(shared) / len(combined)

def cosine_sim(text1, text2):
    """Cosine similarity between the token-count vectors of two texts.

    Both texts are tokenised on whitespace and vectorised together with
    ``CountVectorizer``; the cosine of the two resulting count vectors is
    returned.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine similarity, or 0.0 when either input is not a string or
        contains no tokens (there is no meaningful vector to compare).
    """
    # Handle the degenerate inputs explicitly rather than through a blanket
    # exception handler. A bare ``except:`` would also trap KeyboardInterrupt
    # (making a long row-wise run impossible to stop) and would silently turn
    # genuine errors into a 0.0 score.
    if not isinstance(text1, str) or not isinstance(text2, str):
        return 0.0
    if not text1.split() or not text2.split():
        return 0.0

    vectorizer = CountVectorizer(tokenizer=lambda x: x.split(), token_pattern=None)
    corpus = [text1, text2]
    try:
        vectors = vectorizer.fit_transform(corpus)
    except ValueError:
        # Defensive: scikit-learn reports an empty vocabulary as ValueError.
        # Not expected after the guards above, but kept as best-effort.
        return 0.0
    return cosine_similarity(vectors[0], vectors[1])[0][0]

def levenshtein_similarity(text1, text2):
    """Edit-distance based similarity of two strings.

    Computed as one minus the Levenshtein distance divided by the length of
    the longer string. Two empty strings are treated as identical (1.0).
    """
    longest = len(text1) if len(text1) >= len(text2) else len(text2)
    if longest == 0:
        # Nothing to compare and nothing to divide by.
        return 1.0
    return 1 - lev.distance(text1, text2) / longest

def semantic_similarity(text1, text2):
    """Vector-based similarity of two texts using the module-level spaCy ``nlp``.

    Args:
        text1: First text (already preprocessed).
        text2: Second text (already preprocessed).

    Returns:
        ``doc1.similarity(doc2)`` for the two parsed texts, or 0.0 when either
        text is blank or its document has a zero-norm vector.
    """
    # Blank input cannot carry a vector; skip the spaCy pipeline entirely.
    if not text1.strip() or not text2.strip():
        return 0.0

    doc1 = nlp(text1)
    doc2 = nlp(text2)
    # Calling similarity() on a document without a usable vector is the
    # likely source of the warning in this notebook's captured output.
    # NOTE(review): spaCy is understood to return 0.0 in that case, so the
    # score is unchanged -- confirm against the Doc.similarity docs.
    if not doc1.vector_norm or not doc2.vector_norm:
        return 0.0
    return doc1.similarity(doc2)



In [None]:
# Score every (product name, summary) pair row by row with each metric.
df['lexical_similarity'] = df.apply(
    lambda row: jaccard_similarity(row['product_name_processed'], row['summary_processed']),
    axis=1,
)

# The lexical score in this notebook *is* the Jaccard score, so the column is
# simply duplicated under its second name.
df['jaccard_similarity'] = df['lexical_similarity']

# Remaining metrics: (target column, metric function, left input, right input).
# The semantic metric reads the lemma-only columns; the others use the
# stemmed ones.
for column, metric, left, right in (
    ('cosine_similarity', cosine_sim, 'product_name_processed', 'summary_processed'),
    ('levenshtein_similarity', levenshtein_similarity, 'product_name_processed', 'summary_processed'),
    ('semantic_similarity', semantic_similarity, 'product_name_semantic', 'summary_semantic'),
):
    df[column] = df.apply(lambda row: metric(row[left], row[right]), axis=1)



  return doc1.similarity(doc2)


In [None]:
# Show the first rows of the original texts next to every similarity score.
result_columns = [
    'product_name', 'Summary',
    'lexical_similarity', 'semantic_similarity',
    'jaccard_similarity', 'cosine_similarity',
    'levenshtein_similarity',
]
print(df[result_columns].head())