In [None]:
import os
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from statsmodels.distributions.empirical_distribution import ECDF
from sklearn.metrics import cohen_kappa_score
from scipy.stats import skew
from scipy.stats import kurtosis
from statistics import variance

In [None]:
# Load the train / test / validation splits and stack them into one frame.
path = os.path.join("data", "sample")
train = pd.read_excel(os.path.join(path, "train.xlsx"))
test = pd.read_excel(os.path.join(path, "test.xlsx"))
validation = pd.read_excel(os.path.join(path, "valid.xlsx"))

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# every split keeps its own row labels, so the same label can occur once per
# split and label-based lookups/alignment on `df` become ambiguous.
df = pd.concat([train, test, validation], ignore_index=True)

In [None]:
# Columns analysed in this notebook: eight emotions plus valence and arousal,
# first for person X (suffix "_x"), then in the same order for person Y ("_y").
emotions = [
    "joy_x",
    "trust_x",
    "anticipation_x",
    "surprise_x",
    "fear_x",
    "sadness_x",
    "disgust_x",
    "anger_x",
    "valence_x",
    "arousal_x",
    "joy_y",
    "trust_y",
    "anticipation_y",
    "surprise_y",
    "fear_y",
    "sadness_y",
    "disgust_y",
    "anger_y",
    "valence_y",
    "arousal_y",
]

# A cell only renders its LAST expression, so bare `df.columns` / `df.min()`
# lines in the middle of a cell show nothing. Print the column index
# explicitly and end the cell with the min/max table of the emotion columns.
print(df.columns)
df[emotions].agg(["min", "max"])

### Functions

#### histogram

In [None]:
def create_histogram(sample, which_col, person):
    """Plot a 40-bin histogram of one column with its mean and median marked.

    sample - one column (must provide ``mean()`` and ``median()``)
    which_col - name shown in the title: {joy, trust, anticipation, ...}
    person - rater shown in the title: {X, Y}

    Shows the figure and returns nothing.
    """
    mean_value = sample.mean()
    median_value = sample.median()

    # A fresh figure per call, so repeated calls never draw into a shared one.
    fig, ax = plt.subplots()
    ax.hist(sample, bins=40, color="mediumseagreen", edgecolor="k", alpha=0.65)
    # Different line styles keep the two markers apart when they are close.
    ax.axvline(mean_value, color="k", linestyle="dashed", linewidth=1)
    ax.axvline(median_value, color="k", linestyle="dotted", linewidth=1)

    x_min, x_max = ax.get_xlim()
    _, y_max = ax.get_ylim()
    # Shift the labels right by a fixed fraction of the axis width. Scaling
    # the position itself (value * 1.1) would put the label on top of the line
    # for a value of 0 and to the left of the line for a negative value.
    label_offset = 0.02 * (x_max - x_min)
    ax.text(
        mean_value + label_offset,
        y_max * 0.9,
        "Mean: {:.2f}".format(mean_value),
    )
    ax.text(
        median_value + label_offset,
        y_max * 0.8,
        "Median: {:.2f}".format(median_value),
    )

    ax.grid(True)
    ax.set_title(
        "Histogram of {which_col} for a person {person}".format(
            which_col=which_col, person=person
        ),
        loc="center",
    )
    ax.set_xlabel("Value of opinion")
    ax.set_ylabel("Frequency")
    plt.show()
    # Release the figure so calling this in a loop does not pile up figures.
    plt.close(fig)

#### ecdf

In [None]:
def create_ecdf(sample, which_col, person):
    """Plot the empirical cumulative distribution function of one column.

    sample - one column
    which_col - name shown in the title: {joy, trust, anticipation, ...}
    person - rater shown in the title: {X, Y}

    Shows the figure and returns nothing.
    """
    ecdf = ECDF(sample)

    # A fresh figure per call, so repeated calls never draw into a shared one.
    fig, ax = plt.subplots()
    # An ECDF is a step function: flat between observed values, jumping at
    # each of them. A plain line plot joins the points with sloped segments,
    # which suggests probability mass between distinct values; "post" steps
    # hold each level until the next observed value instead.
    ax.step(ecdf.x, ecdf.y, where="post")

    ax.grid(True)
    ax.set_title(
        "Cumulative distribution function of {which_col} for a person {person}".format(
            which_col=which_col, person=person
        ),
        loc="center",
    )
    ax.set_xlabel("Sample")
    ax.set_ylabel("Probability")
    plt.show()
    # Release the figure so calling this in a loop does not pile up figures.
    plt.close(fig)

In [None]:
def normalization(df, emotions):
    """Min-max normalize the selected columns to the range [0, 1].

    df - frame containing the columns to normalize
    emotions - list of column names to normalize

    Returns a new frame holding only the ``emotions`` columns, each rescaled
    as ``(value - min) / (max - min)``. A constant column (max == min) is
    mapped to 0.0 instead of NaN.
    """
    # Normalization has to be min-max rather than (x - mean) / std, because
    # the columns contain negative values.
    selected = df[emotions]
    col_min = selected.min()
    col_range = selected.max() - col_min
    # A zero range would produce 0 / 0 = NaN for the whole column; dividing
    # by 1 instead leaves the (all-zero) numerator untouched.
    safe_range = col_range.where(col_range != 0, 1)
    normalized_df = (selected - col_min) / safe_range
    return normalized_df

In [None]:
def create_boxplot(normalized_df, emotions):
    """Draw one box per listed column of a min-max normalized frame.

    normalized_df - frame holding the normalized columns
    emotions - names of the columns to draw, one box each
    """
    # Create the wide figure and take its axes; the title and the boxes both
    # go onto this single axes object.
    axes = plt.figure(figsize=(16, 5)).gca()
    axes.set_title("Boxplot of min-max normalized datasets", fontsize=14)
    normalized_df.boxplot(column=emotions, rot=45, fontsize=12, ax=axes)

#### heat map

In [None]:
def create_heatmap(df, person):
    """Draw an annotated heat map of the pairwise correlations in ``df``.

    df - frame whose numeric columns are correlated with each other
    person - label shown in the title, e.g. X, Y or mean(X,Y)
    """
    # Restrict to numeric columns before correlating: pandas >= 2.0 no longer
    # drops non-numeric columns in corr() by default and raises on them.
    corr = df.select_dtypes(include="number").corr()

    fig, ax = plt.subplots(figsize=(18, 12))
    sns.heatmap(
        corr,
        xticklabels=corr.columns.values,
        yticklabels=corr.columns.values,
        linewidths=0.5,
        annot=True,
        ax=ax,  # draw on the axes created above, not on an implicit current one
    ).set(title="Heat map for a person {person}".format(person=person))

#### Analysis for person X

In [None]:
# Columns rated by person X: the first half of `emotions`.
x_columns = emotions[: len(emotions) // 2]

for col_name in x_columns:
    emotion_name = col_name[:-2]  # drop the "_x" suffix for the plot titles
    create_histogram(df[col_name], emotion_name, "X")
    create_ecdf(df[col_name], emotion_name, "X")

# Correlate only X's columns. Passing the whole frame would produce the same
# all-column heat map as for person Y, under a title that names person X.
create_heatmap(df[x_columns], "X")

#### Analysis for person Y

In [None]:
# Columns rated by person Y: the second half of `emotions`.
y_columns = emotions[len(emotions) // 2 :]

for col_name in y_columns:
    emotion_name = col_name[:-2]  # drop the "_y" suffix for the plot titles
    create_histogram(df[col_name], emotion_name, "Y")
    create_ecdf(df[col_name], emotion_name, "Y")

# Correlate only Y's columns. Passing the whole frame would produce the same
# all-column heat map as for person X, under a title that names person Y.
create_heatmap(df[y_columns], "Y")

#### Comparison of X and Y; mean opinion value
Calculate, for each emotion, the mean of the ratings given by X and Y.

In [None]:
# Emotion names without the person suffix ("joy", "trust", ...), taken from
# the person-X half of `emotions`.
names = [col_name[:-2] for col_name in emotions[: len(emotions) // 2]]

# Mean of X's and Y's rating, one column per emotion. The two columns are
# paired by name ("<emotion>_x" / "<emotion>_y") rather than by a hard-coded
# list offset, and the frame is built in one step instead of being grown
# with pd.concat inside a loop.
mean_opinion = pd.DataFrame(
    {name: (df[name + "_x"] + df[name + "_y"]) / 2 for name in names}
)

### Analysis

In [None]:
# Histogram and ECDF for every averaged emotion column, followed by one heat
# map over all averaged columns.
mean_label = "mean(X,Y)"
for emotion_name in names:
    averaged = mean_opinion[emotion_name]
    create_histogram(averaged, emotion_name, mean_label)
    create_ecdf(averaged, emotion_name, mean_label)
create_heatmap(mean_opinion, mean_label)

In [None]:
# Overlay the joy ratings of both persons as semi-transparent histograms.
for joy_column in ("joy_x", "joy_y"):
    plt.hist(df[joy_column], bins=10, alpha=0.5, label=joy_column)
plt.xlabel("Data", size=14)
plt.legend(loc="upper right")

### Statistics

#### Skewness

In [None]:
# Print each column name and, on the next line, the skewness of its values.
for emotion in emotions:
    print(emotion, skew(df[emotion]), sep="\n")

#### Kurtosis

In [None]:
# Print each column name and, on the next line, the kurtosis of its values.
for emotion in emotions:
    print(emotion, kurtosis(df[emotion]), sep="\n")

#### Variance

In [None]:
# Print each column name and, on the next line, the variance of its values
# as computed by statistics.variance.
for emotion in emotions:
    print(emotion, variance(df[emotion]), sep="\n")

In [None]:
# Columns of person X and the matching columns of person Y, in the same order.
x_columns = [
    "joy_x",
    "trust_x",
    "anticipation_x",
    "surprise_x",
    "fear_x",
    "sadness_x",
    "disgust_x",
    "anger_x",
    "valence_x",
    "arousal_x",
]
y_columns = [
    "joy_y",
    "trust_y",
    "anticipation_y",
    "surprise_y",
    "fear_y",
    "sadness_y",
    "disgust_y",
    "anger_y",
    "valence_y",
    "arousal_y",
]

# One (x, y) column pair per emotion.
combinations = list(zip(x_columns, y_columns))

# Print every pair together with the Cohen's kappa score of its two columns.
for pair in combinations:
    col_x, col_y = pair
    print(pair, cohen_kappa_score(df[col_x], df[col_y]))