# Pearson-Korrelation

In [2]:
import pandas as pd
from math import sqrt

<img src="img/pearson algorithm.png" alt="Drawing" style="width: 600px;"/>


In [3]:
def get_pearson(vector_x, vector_y):
    """Return the Pearson correlation coefficient of two vectors, rounded to 3 decimals.

    Uses the computational form::

        (sum(x*y) - n*mean(x)*mean(y))
        / sqrt((sum(x^2) - n*mean(x)^2) * (sum(y^2) - n*mean(y)^2))

    Args:
        vector_x: Sized iterable of numbers (list, tuple, pandas Series, ...).
        vector_y: Sized iterable of numbers with the same length as ``vector_x``.

    Returns:
        The coefficient rounded to three decimals, or NaN when it is undefined
        (empty input, or a vector without any variance).

    Raises:
        ValueError: If the two vectors differ in length.
    """
    # Compare the lengths by value. `is not` compares object identity, which
    # is only accidentally true for equal ints and fails for larger lengths.
    if len(vector_x) != len(vector_y):
        raise ValueError("vectors must have same length")

    # Materialise both inputs as lists so elements are paired strictly by
    # position; indexing with x[i] would be a label lookup on objects such as
    # a pandas Series with a non-integer index.
    x = list(vector_x)
    y = list(vector_y)
    length = len(x)
    if length == 0:
        return float("nan")

    x_mean = sum(x) / length
    y_mean = sum(y) / length

    sum_xy = sum(a * b for a, b in zip(x, y))
    sum_x_sqr = sum(a ** 2 for a in x)
    sum_y_sqr = sum(b ** 2 for b in y)

    top = sum_xy - length * x_mean * y_mean
    radicand = (sum_x_sqr - length * x_mean ** 2) * (sum_y_sqr - length * y_mean ** 2)
    # A non-positive radicand means at least one vector has no variance (or
    # rounding pushed the product below zero); the correlation is undefined.
    if radicand <= 0:
        return float("nan")
    return round(top / sqrt(radicand), 3)


<img src="img/pearson simplified.png" alt="Drawing" style="width: 600px;"/>

In [4]:
# Sanity check: y is an exact linear function of x (y = 2x - 1), so the
# coefficient must be exactly 1.
test = pd.DataFrame(
    {"feat1": [3, 5], "feat2": [2, 3], "feat3": [4, 7]},
    index=pd.Index(["x", "y"], name="Name"),
)
x = test.loc["x"].tolist()
y = test.loc["y"].tolist()
get_pearson(x, y) == 1

True

In [5]:
# One row per character, one column per trait score.
fighters = pd.DataFrame(
    [
        [5, 1, 4, 4, 5, 4],
        [3, 5, 5, 3, 3, 2],
        [2, 4, 4, 5, 2, 2],
        [5, 3, 3, 0, 5, 1],
    ],
    index=pd.Index(["Aragorn", "Robin Hood", "Delenn", "Black Widow"], name="Name"),
    columns=["Nahkampf", "Fernkampf", "Gut", "General", "Einzelkämpfer", "hochgeboren"],
)
fighters


Unnamed: 0_level_0,Nahkampf,Fernkampf,Gut,General,Einzelkämpfer,hochgeboren
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aragorn,5,1,4,4,5,4
Robin Hood,3,5,5,3,3,2
Delenn,2,4,4,5,2,2
Black Widow,5,3,3,0,5,1


Wer soll nebeneinander sitzen, damit gute Gespräche entstehen?

In [6]:
# Pairwise Pearson correlation between all fighters: one "Name" column plus
# one column of coefficients per fighter.
names = list(fighters.index)
match = {"Name": names}
for name in names:
    fighter = fighters.loc[name]
    match[name] = [get_pearson(fighter, fighters.loc[other_name]) for other_name in names]
pd.DataFrame(match)

Unnamed: 0,Name,Aragorn,Robin Hood,Delenn,Black Widow
0,Aragorn,1.0,-0.61,-0.494,0.255
1,Robin Hood,-0.61,1.0,0.553,0.2
2,Delenn,-0.494,0.553,1.0,-0.577
3,Black Widow,0.255,0.2,-0.577,1.0


# Kosinus-Ähnlichkeit
<img src="img/pearsonKosinus.png" alt="Drawing" style="width: 600px;"/>

In [71]:
def get_cosinus(vector_u, vector_v):
    """Return the cosine similarity of two vectors, ignoring missing entries.

    The vectors are paired by position. A position is used only when both
    values are present (not NaN).

    Args:
        vector_u: Iterable of numbers; NaN marks a missing value.
        vector_v: Iterable of numbers; NaN marks a missing value.

    Returns:
        ``sum(u*v) / sqrt(sum(u^2) * sum(v^2))`` over the shared positions, or
        NaN when the similarity is undefined (no shared positions, or one of
        the remaining vectors contains only zeros).
    """
    u = []
    v = []
    for element_u, element_v in zip(vector_u, vector_v):
        # NaN is the only value that is not equal to itself, so this keeps
        # exactly the positions where both entries are present.
        # https://stackoverflow.com/a/44154660/14551419
        if element_u == element_u and element_v == element_v:
            u.append(element_u)
            v.append(element_v)

    top = sum(a * b for a, b in zip(u, v))
    squared_norms = sum(a ** 2 for a in u) * sum(b ** 2 for b in v)
    # Without overlap, or with a zero vector, the denominator is 0 and the
    # similarity is undefined; report it as missing instead of dividing.
    if squared_norms <= 0:
        return float("nan")
    return top / sqrt(squared_norms)



In [24]:
# Ratings per child (rows) and toy (columns); None marks a missing rating.
toys = pd.DataFrame(
    {
        "Klötze": [4, 0, 1, 4, 2, 1],
        "Barbie": [None, 1, 3, None, 5, 3],
        "Comic": [None, 2, 5, 0, 3, None],
        "Frisbee": [3, 4, 0, None, None, 5],
        "Münze": [0, 2, 3, 0, None, 3],
    },
    index=pd.Index(
        ["Anna", "Bertram", "Cristin", "Dietmar", "Erika", "Friedrich"],
        name="Name",
    ),
)
toys


Unnamed: 0_level_0,Klötze,Barbie,Comic,Frisbee,Münze
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Anna,4,,,3.0,0.0
Bertram,0,1.0,2.0,4.0,2.0
Cristin,1,3.0,5.0,0.0,3.0
Dietmar,4,,0.0,,0.0
Erika,2,5.0,3.0,,
Friedrich,1,3.0,,5.0,3.0


In [57]:
def normalize(df):
    """Centre every row of ``df`` on its own mean, rounded to 3 decimals.

    Args:
        df: DataFrame of numeric values.

    Returns:
        A DataFrame in which each row has its row mean subtracted and the
        result rounded to three decimals.
    """

    def center_row(row):
        # Subtract this row's mean from each of its entries.
        centered = row - row.mean()
        return round(centered, 3)

    return df.apply(center_row, axis=1)


In [58]:
# Mean-centre each child's ratings before comparing toys.
toys_normalized = normalize(df=toys)
toys_normalized

Unnamed: 0_level_0,Klötze,Barbie,Comic,Frisbee,Münze
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Anna,1.667,,,0.667,-2.333
Bertram,-1.8,-0.8,0.2,2.2,0.2
Cristin,-1.4,0.6,2.6,-2.4,0.6
Dietmar,2.667,,-1.333,,-1.333
Erika,-1.333,1.667,-0.333,,
Friedrich,-2.0,0.0,,2.0,0.0


In [78]:
def get_cosinus_similarity(df):
    """Build the column-by-column cosine similarity matrix of ``df``.

    Every column of ``df`` is compared with every column (including itself)
    using ``get_cosinus``; each similarity is rounded to 3 decimals.

    Args:
        df: DataFrame whose columns are the vectors to compare.

    Returns:
        A square DataFrame with the columns of ``df`` as column labels and as
        index; the index is named "toy".
    """
    # Work on the DataFrame that was passed in, not on a notebook global, so
    # the function gives the right answer for any input.
    columns = list(df.columns)
    vectors = {column: list(df[column]) for column in columns}

    similarities = {
        column: [
            round(get_cosinus(vectors[column], vectors[other]), 3)
            for other in columns
        ]
        for column in columns
    }
    # Attach the labels as a named index directly; routing them through a
    # helper "toy" column would clash with a data column of the same name.
    return pd.DataFrame(similarities, index=pd.Index(columns, name="toy"))


In [85]:
# Toy-to-toy similarity matrix; show how every toy relates to "Barbie".
similarity = get_cosinus_similarity(df=toys_normalized)
similarity.loc["Barbie"]

Klötze    -0.252
Barbie     1.000
Comic      0.165
Frisbee   -0.837
Münze      0.316
Name: Barbie, dtype: float64

<img src="img/pearson3.png" alt="Drawing" style="width: 600px;"/>

In [93]:
def prediction(person_vector, similarity_vector):
    """Predict a value as the similarity-weighted average of known values.

    The two vectors are paired by position. A pair is used only when both the
    value and its similarity weight are present (not NaN).

    Args:
        person_vector: Iterable of known values; NaN marks a missing value.
        similarity_vector: Iterable of similarity weights, position-aligned
            with ``person_vector``.

    Returns:
        ``sum(p * s) / sum(abs(s))`` over the usable pairs, or NaN when no
        prediction is possible (no usable pair, or all weights are 0).
    """
    top = 0
    bot = 0
    for p, s in zip(person_vector, similarity_vector):
        # NaN is not equal to itself: skip pairs with a missing side.
        if p == p and s == s:
            top += p * s
            bot += abs(s)

    # A zero total weight leaves nothing to base a prediction on; report it
    # as missing instead of dividing by zero.
    if bot == 0:
        return float("nan")
    return top / bot

In [94]:
# Estimate Anna's missing "Barbie" rating from her ratings of the other toys.
prediction(
    person_vector=toys.loc["Anna"],
    similarity_vector=similarity.loc["Barbie"],
)

In [89]:
# Anna's raw ratings, for reference.
toys.loc["Anna"]

Klötze     4.0
Barbie     NaN
Comic      NaN
Frisbee    3.0
Münze      0.0
Name: Anna, dtype: float64