<a href="https://www.kaggle.com/code/handandegerli/sorting-reviews?scriptVersionId=180890496" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
############################################
# SORTING REVIEWS
############################################

import pandas as pd
import math
import scipy.stats as st

# Notebook-wide pandas display settings:
# - show every column instead of eliding the middle ones,
# - keep wide frames on a single line instead of wrapping them,
# - render floats with exactly five decimal places.
pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.float_format", lambda value: "%.5f" % value)



In [2]:
###################################################
# Up-Down Diff Score = (up ratings) − (down ratings)
###################################################

# Review 1: 600 up 400 down total 1000
# Review 2: 5500 up 4500 down total 10000

def score_up_down_diff(up, down):
    """
    Net-vote score: the number of up ratings minus the number of down ratings.

    Parameters
    ----------
    up: int
        up count
    down: int
        down count

    Returns
    -------
    The difference ``up - down`` (negative when down ratings outnumber up ratings).
    """
    net_votes = up - down
    return net_votes

# Review 1 score: 600 - 400 = 200
score_up_down_diff(600, 400)

# Review 2 score: 5500 - 4500 = 1000.
# The raw difference ranks Review 2 higher even though its share of up
# ratings (55%) is lower than Review 1's (60%).
# Only this last expression is displayed as the cell output.
score_up_down_diff(5500, 4500)

1000

In [3]:
###################################################
# Score = Average rating = (up ratings) / (all ratings)
###################################################

def score_average_rating(up, down):
    """
    Average-rating score: the share of up ratings among all ratings.

    Parameters
    ----------
    up: int
        up count
    down: int
        down count

    Returns
    -------
    ``up / (up + down)``, or 0 when there are no ratings at all.
    """
    total = up + down
    if total != 0:
        return up / total
    # No ratings: avoid dividing by zero and score the item 0.
    return 0

# Same two reviews as in the up-down difference example:
# 600 up / 400 down -> 0.6 and 5500 up / 4500 down -> 0.55.
score_average_rating(600, 400)
score_average_rating(5500, 4500)

# Review 1: 2 up 0 down total 2
# Review 2: 100 up 1 down total 101

# The plain ratio ignores how many ratings there are: 2/2 scores 1.0 and
# outranks 100/101 (~0.990). Only the last expression is displayed.
score_average_rating(2, 0)
score_average_rating(100, 1)


0.9900990099009901

In [4]:
###################################################
# Wilson Lower Bound Score
###################################################

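# Example: 600 up / 400 down
# observed up-vote rate: 0.6
# illustrative confidence interval around that rate: 0.5 - 0.7
# lower bound of the interval, used as the score: 0.5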
# WLB can be used in any measurement problem that arises from binary (two-outcome) interactions.
# Bernoulli gives the probability that one of two possible outcomes occurs.
# With 95% confidence and a 5% margin of error we know the confidence interval; we take its lowest point as the reference score.
def wilson_lower_bound(up, down, confidence=0.95):
    """
    Compute the Wilson Lower Bound (WLB) score.

    - The lower bound of the confidence interval computed for the Bernoulli
      parameter p is taken as the WLB score.
    - The resulting score is used for ranking products.
    - Note:
    If scores are on a 1-5 scale, 1-3 can be marked as negative and 4-5 as
    positive to make them fit a Bernoulli setting. This brings some problems
    of its own, which is why a Bayesian average rating should be used there.

    Parameters
    ----------
    up: int
        up count
    down: int
        down count
    confidence: float
        confidence level of the two-sided interval (0.95 by default)

    Returns
    -------
    wilson score: float
        0 is returned when there are no ratings at all.

    """
    total_votes = up + down
    if total_votes == 0:
        # Nothing to estimate from; score the item 0.
        return 0

    # Two-sided critical value of the standard normal distribution.
    tail = (1 - confidence) / 2
    z_score = st.norm.ppf(1 - tail)
    z_squared = z_score * z_score

    # Observed share of up ratings.
    observed_rate = 1.0 * up / total_votes

    # The three pieces of the lower bound, kept in the original operation order.
    centre = observed_rate + z_squared / (2 * total_votes)
    margin = z_score * math.sqrt(
        (observed_rate * (1 - observed_rate) + z_squared / (4 * total_votes)) / total_votes
    )
    denominator = 1 + z_squared / total_votes

    return (centre - margin) / denominator


# The same review pairs used in the earlier cells, now scored with the
# Wilson lower bound at the default 95% confidence.
wilson_lower_bound(600, 400)
wilson_lower_bound(5500, 4500)

# Unlike the plain ratio, the sample size matters here: 2 up / 0 down scores
# far lower (~0.34) than 100 up / 1 down (~0.946).
# Only the last expression is displayed as the cell output.
wilson_lower_bound(2, 0)
wilson_lower_bound(100, 1)


0.9460328420055449

In [5]:
# When we need to compute a rating, we can take a plain average, but it can be made more refined.
# For example, it can be refined by weighting on time, on user quality, or on user behaviour.
# When we have five-star ratings, we can compute the score with the Bayesian average rating.
# We can also build a hybrid system.

# We looked at the sorting methods and saw that several factors matter, so they need to be weighted.
# There can also be business-specific methods, such as the one IMDB uses, but that too is essentially a form of rating.
# In general, the underlying task is to rate first and then sort; the Bayesian approach, for instance, computes an average from the probability distribution of the ratings,
# and that average can be used as the final average. Here too, factors such as purchase counts can be included in a hybrid approach:
# taking into account the BAR (Bayesian average rating) score derived from the rating distribution, the purchase counts, the plain ratings and the comment counts, we produced a hybrid solution.
# Finally, we solved the sorting of reviews with the Wilson lower bound (WLB).

###################################################
# Case Study
###################################################

# Up and down vote counts of 22 comments; position i in both lists refers to
# the same comment.
up = [15, 70, 14, 4, 2, 5, 8, 37, 21, 52, 28, 147, 61, 30, 23, 40, 37, 61, 54, 18, 12, 68]
down = [0, 2, 2, 2, 15, 2, 6, 5, 23, 8, 12, 2, 1, 1, 5, 1, 2, 6, 2, 0, 2, 2]
comments = pd.DataFrame({"up": up, "down": down})

# One score column per ranking method, added in this order.
scoring_methods = {
    "score_pos_neg_diff": score_up_down_diff,
    "score_average_rating": score_average_rating,
    "wilson_lower_bound": wilson_lower_bound,
}

for column_name, scoring_fn in scoring_methods.items():
    # Row-wise application: each row supplies its own up/down counts.
    # The default argument binds the current scorer to the lambda.
    comments[column_name] = comments.apply(
        lambda row, fn=scoring_fn: fn(row["up"], row["down"]), axis=1
    )

# Rank the comments by Wilson lower bound, best first (displayed as cell output).
comments.sort_values("wilson_lower_bound", ascending=False)

Unnamed: 0,up,down,score_pos_neg_diff,score_average_rating,wilson_lower_bound
11,147,2,145,0.98658,0.95238
12,61,1,60,0.98387,0.91413
1,70,2,68,0.97222,0.90426
21,68,2,66,0.97143,0.90168
18,54,2,52,0.96429,0.87881
15,40,1,39,0.97561,0.87405
13,30,1,29,0.96774,0.83806
16,37,2,35,0.94872,0.83114
19,18,0,18,1.0,0.82412
17,61,6,55,0.91045,0.81807
