In [1]:
%%writefile ../src/aspects.py
"""
aspects.py
- Classifies reviews by simple aspects using keywords and assigns a polarity per rating (heuristic).
"""

# comment to test overwriting
from __future__ import annotations
import pandas as pd
import numpy as np
from typing import Dict, List

# Built-in aspect vocabulary: aspect name -> keywords (English and Spanish).
# tag_aspects looks these up as substrings of the lowercased chunk text.
DEFAULT_ASPECTS: Dict[str, List[str]] = {
    "battery": [
        "battery", "batería", "bateria", "charge", "duración", "dura", "power",
    ],
    "price": [
        "price", "precio", "cost", "value", "vale", "caro", "barato",
    ],
    "quality": [
        "quality", "calidad", "build", "material", "acabado",
    ],
    "durability": [
        "durable", "durabilidad", "last", "romp", "months", "años",
    ],
    "shipping": [
        "shipping", "envío", "enviado", "paquetería", "llegó", "entrega",
    ],
}

def tag_aspects(corpus: pd.DataFrame, aspects: Dict[str, List[str]] = None) -> pd.DataFrame:
    """Return a copy of ``corpus`` with one boolean column per aspect.

    A chunk is tagged with an aspect when any of that aspect's keywords occurs
    as a substring of the chunk text. Matching is case-insensitive: both the
    chunk text and the keywords are lowercased before comparing.

    Parameters
    ----------
    corpus : pd.DataFrame
        Must contain a ``chunk`` column with the review text. Missing or
        non-string values are treated as empty text and are never tagged.
    aspects : dict of str -> list of str, optional
        Aspect name -> keywords. ``None`` (the default) uses ``DEFAULT_ASPECTS``.

    Returns
    -------
    pd.DataFrame
        Copy of ``corpus`` plus one ``bool`` column per aspect name; an
        existing column with the same name is overwritten in the copy. The
        input frame is not modified.

    Raises
    ------
    KeyError
        If ``corpus`` has no ``chunk`` column.
    """
    if aspects is None:
        aspects = DEFAULT_ASPECTS
    df = corpus.copy()

    # Normalise once, outside the aspect loop. Going through plain Python
    # strings (instead of the .str accessor) also copes with an all-NaN column,
    # which has float dtype and therefore no .str accessor at all.
    texts = [c.lower() if isinstance(c, str) else "" for c in df["chunk"]]

    for name, keywords in aspects.items():
        # The text is lowercased, so the keywords must be too; otherwise a
        # caller-supplied keyword such as "Battery" could never match.
        needles = [kw.lower() for kw in keywords]
        # Explicit bool dtype keeps the column usable as a mask even when the
        # frame is empty.
        df[name] = np.array(
            [any(needle in text for needle in needles) for text in texts],
            dtype=bool,
        )
    return df

def sentiment_from_rating(rating: float) -> int:
    """Turn a star rating into a polarity: +1 for 4 or more, -1 for 2 or less, otherwise 0.

    Values that ``float()`` cannot convert, as well as NaN, count as neutral (0).
    """
    try:
        score = float(rating)
    except Exception:
        return 0
    # At most one comparison can hold; NaN fails both and so stays neutral.
    return int(score >= 4) - int(score <= 2)

# Summarise, per aspect, how many chunks mention it and their average polarity.
def aggregate_aspects(
    df_tagged: pd.DataFrame,
    product: str = None,
    aspects: Dict[str, List[str]] = None,
) -> pd.DataFrame:
    """Compute coverage and mean rating-based polarity for every tagged aspect.

    Parameters
    ----------
    df_tagged : pd.DataFrame
        Frame with a ``rating`` column and one boolean column per aspect
        (the output of ``tag_aspects``). A ``product`` column is required only
        when ``product`` is given.
    product : str, optional
        If given, only rows whose ``product`` column equals this value are used.
    aspects : dict of str -> list of str, optional
        The aspect mapping used for tagging; only its keys are consulted, so
        any container of aspect names works. ``None`` (the default) uses
        ``DEFAULT_ASPECTS``. Pass the same mapping given to ``tag_aspects``,
        otherwise custom aspect columns are not recognised.

    Returns
    -------
    pd.DataFrame
        One row per aspect column present in ``df_tagged``, with columns
        ``aspect``, ``coverage`` (percentage of the selected chunks that
        mention the aspect, 2 decimals), ``mean_polarity`` (mean of
        ``sentiment_from_rating`` over those chunks, 3 decimals; 0.0 when there
        are none) and ``n`` (number of mentioning chunks), sorted by
        ``coverage`` in descending order. The frame is empty, but still has
        these four columns, when no aspect column is present.
    """
    if aspects is None:
        aspects = DEFAULT_ASPECTS

    # Filter first and keep polarity in its own Series: adding a column to a
    # filtered slice can raise SettingWithCopyWarning, and this also avoids
    # copying the whole input frame.
    df = df_tagged if product is None else df_tagged[df_tagged["product"] == product]
    polarity = df["rating"].apply(sentiment_from_rating)
    total = len(df)

    rows = []
    for a in (c for c in df.columns if c in aspects):
        mentioned = polarity[df[a]]  # polarity of the chunks where the aspect is True
        n = len(mentioned)
        if n == 0:
            rows.append({"aspect": a, "coverage": 0.0, "mean_polarity": 0.0, "n": 0})
        else:
            rows.append({
                "aspect": a,
                "coverage": round(n / total * 100, 2),        # total >= n >= 1 here
                "mean_polarity": round(mentioned.mean(), 3),
                "n": n,
            })

    # Explicit columns keep the "coverage" sort key available even when no
    # aspect column was found (an empty list would otherwise yield a frame
    # without columns and sort_values would raise KeyError).
    summary = pd.DataFrame(rows, columns=["aspect", "coverage", "mean_polarity", "n"])
    return summary.sort_values(by="coverage", ascending=False).reset_index(drop=True)


Overwriting ../src/aspects.py


Testing code

In [11]:
import importlib
import sys

import pandas as pd

# %%writefile only saves aspects.py to disk; it does not define its functions
# in the kernel. Import the module here so the cells below also work after
# Restart Kernel -> Run All.
SRC_DIR = "../src"
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)
import aspects

importlib.reload(aspects)  # pick up the version just (re)written by %%writefile
from aspects import aggregate_aspects, sentiment_from_rating, tag_aspects

# Small hand-made sample: two products, five Spanish review chunks with ratings.
df = pd.DataFrame({
    "product": ["A","A","B","B","A"],
    "rating":  [5,   2,   4,   3,   1],
    "chunk": [
        "La batería dura mucho y la calidad es muy buena.",
        "Precio caro y la batería se descargó rápido.",
        "El envío llegó al día siguiente, excelente entrega.",
        "No sé, la calidad del material es regular.",
        "Paquetería tardó y el empaque llegó roto; mala entrega."
    ]
})
df.head()

Unnamed: 0,product,rating,chunk
0,A,5,La batería dura mucho y la calidad es muy buena.
1,A,2,Precio caro y la batería se descargó rápido.
2,B,4,"El envío llegó al día siguiente, excelente ent..."
3,B,3,"No sé, la calidad del material es regular."
4,A,1,Paquetería tardó y el empaque llegó roto; mala...


In [12]:
# Tag aspects and inspect the result
def_tag= tag_aspects(df)
def_tag

Unnamed: 0,product,rating,chunk,battery,price,quality,durability,shipping
0,A,5,La batería dura mucho y la calidad es muy buena.,True,False,True,False,False
1,A,2,Precio caro y la batería se descargó rápido.,True,True,False,False,False
2,B,4,"El envío llegó al día siguiente, excelente ent...",False,False,False,False,True
3,B,3,"No sé, la calidad del material es regular.",False,False,True,False,False
4,A,1,Paquetería tardó y el empaque llegó roto; mala...,False,False,False,False,True


In [49]:
# Sentiment: map every rating to its polarity and print the pairs.

rating = def_tag["rating"].values
print(type(rating))
print(rating.shape)
results = list(map(sentiment_from_rating, rating))
for i, r in enumerate(rating, start=1):
    res = results[i - 1]
    print(f"Index: {i}, Rating: {r}, Result from sentiment_from_rating: {res}")

<class 'numpy.ndarray'>
(5,)
Index: 1, Rating: 5, Result from sentiment_from_rating: 1
Index: 2, Rating: 2, Result from sentiment_from_rating: -1
Index: 3, Rating: 4, Result from sentiment_from_rating: 1
Index: 4, Rating: 3, Result from sentiment_from_rating: 0
Index: 5, Rating: 1, Result from sentiment_from_rating: -1


In [13]:
# Aggregate the tagged chunks: once globally and once for product "A".
summary_all = aggregate_aspects(def_tag)
summary_prd = aggregate_aspects(def_tag, product="A")

for heading, table in (
    ("Global summary:", summary_all),
    ("\nSummary product A:", summary_prd),
):
    print(heading)
    display(table)

Global summary:


Unnamed: 0,aspect,coverage,mean_polarity,n
0,battery,40.0,0.0,2
1,quality,40.0,0.5,2
2,shipping,40.0,0.0,2
3,price,20.0,-1.0,1
4,durability,0.0,0.0,0



Summary product A:


Unnamed: 0,aspect,coverage,mean_polarity,n
0,battery,66.67,0.0,2
1,price,33.33,-1.0,1
2,quality,33.33,1.0,1
3,shipping,33.33,-1.0,1
4,durability,0.0,0.0,0
