<a href="https://colab.research.google.com/github/GregoryG3/Thesis/blob/main/Scene_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries and loading Google drive

In [None]:
# !pip install pingouin

Collecting pingouin
  Downloading pingouin-0.5.4-py2.py3-none-any.whl (198 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.9/198.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pandas-flavor (from pingouin)
  Downloading pandas_flavor-0.6.0-py3-none-any.whl (7.2 kB)
Installing collected packages: pandas-flavor, pingouin
Successfully installed pandas-flavor-0.6.0 pingouin-0.5.4


In [1]:
import pandas as pd
import numpy as np
from os.path import join


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
PROJECT_DIR = "/content/drive/MyDrive/Thesis/Dataset"
PROCESSED_DIR = join(PROJECT_DIR, "processed")

# Load data

In [81]:
ratings = pd.read_csv(join(PROCESSED_DIR, "ratings.csv"), low_memory=False)

In [5]:
ratings.head(3)

Unnamed: 0,index,scene_id,duration,rating,sessionID
0,0,01_MS_C_344,3679.0,3.0,6bbf3fed-46ab-4221-930b-4ac3fa0acff8
1,1,01_CP_C_1226,2835.0,3.0,6bbf3fed-46ab-4221-930b-4ac3fa0acff8
2,2,01_SE_C_54,6589.0,1.0,6bbf3fed-46ab-4221-930b-4ac3fa0acff8


In [6]:
ratings.describe()

Unnamed: 0,index,duration,rating
count,468370.0,468370.0,468370.0
mean,14.24767,7318.973,1.987277
std,21.349735,635557.4,0.945056
min,0.0,0.0,0.0
25%,5.0,2442.0,1.0
50%,11.0,4002.0,2.0
75%,18.0,6500.0,3.0
max,664.0,421131900.0,3.0


# Sample selection
Let's choose the scene id which occurs in dataset most often. I wanna choose scene which are scored from different perspective by at least 250 people

In [82]:
ratings[['Symbol1', 'Experiment', 'Kamera', 'number_sym']] = ratings['scene_id'].str.split('_', expand=True)
ratings.drop(columns=['Symbol1','index'], inplace=True)

In [83]:
ratings.head()

Unnamed: 0,scene_id,duration,rating,sessionID,Experiment,Kamera,number_sym
0,01_MS_C_344,3679.0,3.0,6bbf3fed-46ab-4221-930b-4ac3fa0acff8,MS,C,344
1,01_CP_C_1226,2835.0,3.0,6bbf3fed-46ab-4221-930b-4ac3fa0acff8,CP,C,1226
2,01_SE_C_54,6589.0,1.0,6bbf3fed-46ab-4221-930b-4ac3fa0acff8,SE,C,54
3,01_SE_C_26,2811.0,2.0,6bbf3fed-46ab-4221-930b-4ac3fa0acff8,SE,C,26
4,01_MS_C_290,6401.0,1.0,6bbf3fed-46ab-4221-930b-4ac3fa0acff8,MS,C,290


In [84]:
reshape_df = ratings.groupby(['scene_id', 'Experiment','Kamera','number_sym']).size().reset_index(name='count')
reshape_df

Unnamed: 0,scene_id,Experiment,Kamera,number_sym,count
0,01_CP_A_136,CP,A,136,90
1,01_CP_A_137,CP,A,137,111
2,01_CP_A_138,CP,A,138,80
3,01_CP_A_139,CP,A,139,75
4,01_CP_A_140,CP,A,140,78
...,...,...,...,...,...
3013,01_SE_C_86,SE,C,86,491
3014,01_SE_C_87,SE,C,87,486
3015,01_SE_C_88,SE,C,88,474
3016,01_SE_C_89,SE,C,89,495


# Scenes with N>250

We have 471 scenes which have more than 250 reviewers (total was 3018 unique scenes)

In [85]:
selected_rows = reshape_df[reshape_df['count'] > 250]
selected_rows

Unnamed: 0,scene_id,Experiment,Kamera,number_sym,count
156,01_CP_C_1046,CP,C,1046,271
164,01_CP_C_1054,CP,C,1054,353
172,01_CP_C_1062,CP,C,1062,294
223,01_CP_C_1112,CP,C,1112,280
242,01_CP_C_1135,CP,C,1135,289
...,...,...,...,...,...
3013,01_SE_C_86,SE,C,86,491
3014,01_SE_C_87,SE,C,87,486
3015,01_SE_C_88,SE,C,88,474
3016,01_SE_C_89,SE,C,89,495


Now we can calculate the rating score per each scene and Coefficient of variation for ratings (CV)


In [86]:
# Grouping the DataFrame by 'scene_id'
grouped = ratings.loc[ratings['scene_id'].isin(selected_rows['scene_id'].tolist())].groupby('scene_id')

# Aggregating statistics for each group
statistics = grouped.agg(
    count=('rating', 'count'),  # Counting the number of ratings in each group
    rating_mean=('rating', 'mean'),  # Counting the mean for ratings
    rating_cv=('rating', lambda x: np.std(x) / np.mean(x) if np.mean(x) != 0 else np.nan)  # Calculating the coefficient of variation for ratings
)

# Calculating the count of each rating value
ratings_counts = ['rating_0_count', 'rating_1_count', 'rating_2_count', 'rating_3_count']
for i, rating_count in enumerate(ratings_counts):
    statistics[rating_count] = grouped['rating'].apply(lambda x: (x == i).sum())

# Calculating the percentage share for each rating value
for i, rating_count in enumerate(ratings_counts):
    statistics[f'rating_{i}'] = (statistics[rating_count] / statistics['count'] * 100).round(2)

# Calculating the 'safe_index' as the sum of the percentages of ratings 2 and 3
statistics['safe_index'] = statistics['rating_3'] + statistics['rating_2']

ratings_summary = statistics.sort_values(by = 'safe_index', ascending = False)
ratings_summary=ratings_summary.reset_index('scene_id')
ratings_summary[['Symbol1', 'Experiment', 'Kamera', 'number_sym']] = ratings_summary['scene_id'].str.split('_', expand=True)

ratings_summary.head()

Unnamed: 0,scene_id,count,rating_mean,rating_cv,rating_0_count,rating_1_count,rating_2_count,rating_3_count,rating_0,rating_1,rating_2,rating_3,safe_index,Symbol1,Experiment,Kamera,number_sym
0,01_CP_P_790,368,2.690217,0.184525,0,6,102,260,0.0,1.63,27.72,70.65,98.37,1,CP,P,790
1,01_SE_C_10,479,2.906054,0.134126,4,4,25,446,0.84,0.84,5.22,93.11,98.33,1,SE,C,10
2,01_CP_C_1054,353,2.671388,0.195093,1,6,101,245,0.28,1.7,28.61,69.41,98.02,1,CP,C,1054
3,01_CP_C_1046,271,2.690037,0.194207,1,5,71,194,0.37,1.85,26.2,71.59,97.79,1,CP,C,1046
4,01_CP_P_876,359,2.682451,0.198515,1,9,93,256,0.28,2.51,25.91,71.31,97.22,1,CP,P,876


## Scenes with two perspective and N>250

In [87]:
# Select rows where 'number_sym' is not unique and 'count' > 250, then sort by ['number_sym', 'Experiment']:
non_unique_rows = reshape_df[reshape_df['count'] > 250].groupby(['number_sym','Experiment']).filter(lambda x: len(x) > 1).sort_values(by='number_sym')

non_unique_rows

Unnamed: 0,scene_id,Experiment,Kamera,number_sym,count
1446,01_MS_A_1,MS,A,1,286
2874,01_SE_A_1,SE,A,1,315
2150,01_MS_C_1,MS,C,1,386
2940,01_SE_C_1,SE,C,1,444
2942,01_SE_C_11,SE,C,11,485
...,...,...,...,...,...
2937,01_SE_A_87,SE,A,87,337
2938,01_SE_A_88,SE,A,88,312
3015,01_SE_C_88,SE,C,88,474
2939,01_SE_A_89,SE,A,89,304


We got 152 entities, only MS and SE type. We can add this information to our ratings_summary data frame:

In [88]:
matching_indices = ratings_summary.index.intersection(non_unique_rows['scene_id'])
ratings_summary['two_perspectives'] = 0  # Dodajemy kolumnę z domyślną wartością 0
ratings_summary.loc[matching_indices, 'two_perspectives'] = 1

In [89]:
ratings_summary.drop('Symbol1', axis=1, inplace=True)

In [90]:
ratings_summary

Unnamed: 0,scene_id,count,rating_mean,rating_cv,rating_0_count,rating_1_count,rating_2_count,rating_3_count,rating_0,rating_1,rating_2,rating_3,safe_index,Experiment,Kamera,number_sym,two_perspectives
0,01_CP_P_790,368,2.690217,0.184525,0,6,102,260,0.00,1.63,27.72,70.65,98.37,CP,P,790,0
1,01_SE_C_10,479,2.906054,0.134126,4,4,25,446,0.84,0.84,5.22,93.11,98.33,SE,C,10,0
2,01_CP_C_1054,353,2.671388,0.195093,1,6,101,245,0.28,1.70,28.61,69.41,98.02,CP,C,1054,0
3,01_CP_C_1046,271,2.690037,0.194207,1,5,71,194,0.37,1.85,26.20,71.59,97.79,CP,C,1046,0
4,01_CP_P_876,359,2.682451,0.198515,1,9,93,256,0.28,2.51,25.91,71.31,97.22,CP,P,876,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
466,01_SE_A_79,306,0.552288,1.432991,180,98,13,15,58.82,32.03,4.25,4.90,9.15,SE,A,79,0
467,01_SE_C_66,486,0.477366,1.494167,307,136,33,10,63.17,27.98,6.79,2.06,8.85,SE,C,66,0
468,01_MS_C_6,387,0.485788,1.429674,238,116,27,6,61.50,29.97,6.98,1.55,8.53,MS,C,6,0
469,01_MS_C_589,401,0.413965,1.650866,270,106,15,10,67.33,26.43,3.74,2.49,6.23,MS,C,589,0


In [91]:
ratings_summary.to_csv(join(PROCESSED_DIR, "ratings_selected_scenes.csv"), index=False)

# USUN

 Lets save the scene_id of these entities

In [12]:
scene_id_MS = non_unique_rows.loc[non_unique_rows['Experiment'] == 'MS', 'scene_id'].tolist()
scene_id_SE = non_unique_rows.loc[non_unique_rows['Experiment'] == 'SE', 'scene_id'].tolist()

print("scene_id dla Experiment MS:", scene_id_MS)
print("scene_id dla Experiment SE:", scene_id_SE)

scene_id dla Experiment MS: ['01_MS_A_1', '01_MS_C_1', '01_MS_C_2', '01_MS_A_2', '01_MS_A_4', '01_MS_C_4', '01_MS_C_5', '01_MS_A_5', '01_MS_A_585', '01_MS_C_585', '01_MS_C_586', '01_MS_A_586', '01_MS_A_588', '01_MS_C_588', '01_MS_A_589', '01_MS_C_589', '01_MS_A_590', '01_MS_C_590', '01_MS_A_592', '01_MS_C_592', '01_MS_A_6', '01_MS_C_6', '01_MS_C_7', '01_MS_A_7', '01_MS_A_8', '01_MS_C_8']
scene_id dla Experiment SE: ['01_SE_A_1', '01_SE_C_1', '01_SE_C_11', '01_SE_A_11', '01_SE_A_12', '01_SE_C_12', '01_SE_C_13', '01_SE_A_13', '01_SE_C_14', '01_SE_A_14', '01_SE_A_16', '01_SE_C_16', '01_SE_A_17', '01_SE_C_17', '01_SE_C_18', '01_SE_A_18', '01_SE_C_19', '01_SE_A_19', '01_SE_A_2', '01_SE_C_2', '01_SE_A_21', '01_SE_C_21', '01_SE_C_22', '01_SE_A_22', '01_SE_C_23', '01_SE_A_23', '01_SE_C_24', '01_SE_A_24', '01_SE_C_3', '01_SE_A_3', '01_SE_A_31', '01_SE_C_31', '01_SE_C_32', '01_SE_A_32', '01_SE_A_33', '01_SE_C_33', '01_SE_C_34', '01_SE_A_34', '01_SE_C_36', '01_SE_A_36', '01_SE_C_37', '01_SE_A_37'

In [13]:
ms_df = ratings.loc[ratings['scene_id'].isin(scene_id_MS), ['scene_id', 'rating', 'sessionID']]
se_df = ratings.loc[ratings['scene_id'].isin(scene_id_SE), ['scene_id', 'rating', 'sessionID']]

## MS type

In [16]:
# Grouping the DataFrame by 'scene_id'
grouped = ms_df.groupby('scene_id')

# Aggregating statistics for each group
statistics = grouped.agg(
    count=('rating', 'count'),  # Counting the number of ratings in each group
    rating_mean=('rating', 'mean'),  # Counting the mean for ratings
    rating_cv=('rating', lambda x: np.std(x) / np.mean(x) if np.mean(x) != 0 else np.nan)  # Calculating the coefficient of variation for ratings
)

# Calculating the count of each rating value
ratings_counts = ['rating_0_count', 'rating_1_count', 'rating_2_count', 'rating_3_count']
for i, rating_count in enumerate(ratings_counts):
    statistics[rating_count] = grouped['rating'].apply(lambda x: (x == i).sum())

# Calculating the percentage share for each rating value
for i, rating_count in enumerate(ratings_counts):
    statistics[f'rating_{i}'] = (statistics[rating_count] / statistics['count'] * 100).round(2)

# Calculating the 'safe_index' as the sum of the percentages of ratings 2 and 3
statistics['safe_index'] = statistics['rating_3'] + statistics['rating_2']

ms_df_summary = statistics
ms_df_summary.head()

Unnamed: 0_level_0,count,rating_cv,rating_0_count,rating_1_count,rating_2_count,rating_3_count,rating_0,rating_1,rating_2,rating_3,safe_index
scene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
01_MS_A_1,286,0.82223,79,118,60,29,27.62,41.26,20.98,10.14,31.12
01_MS_A_2,258,0.710686,49,133,52,24,18.99,51.55,20.16,9.3,29.46
01_MS_A_4,268,0.820677,75,110,59,24,27.99,41.04,22.01,8.96,30.97
01_MS_A_5,259,0.949458,92,121,30,16,35.52,46.72,11.58,6.18,17.76
01_MS_A_585,266,1.063659,111,100,32,23,41.73,37.59,12.03,8.65,20.68


## SE Type

In [17]:
# Grouping the DataFrame by 'scene_id'
grouped = se_df.groupby('scene_id')

# Aggregating statistics for each group
statistics = grouped.agg(
    count=('rating', 'count'),  # Counting the number of ratings in each group
    rating_mean=('rating', 'mean'),  # Counting the mean for ratings
    rating_cv=('rating', lambda x: np.std(x) / np.mean(x) if np.mean(x) != 0 else np.nan)  # Calculating the coefficient of variation for ratings
)

# Calculating the count of each rating value
ratings_counts = ['rating_0_count', 'rating_1_count', 'rating_2_count', 'rating_3_count']
for i, rating_count in enumerate(ratings_counts):
    statistics[rating_count] = grouped['rating'].apply(lambda x: (x == i).sum())

# Calculating the percentage share for each rating value
for i, rating_count in enumerate(ratings_counts):
    statistics[f'rating_{i}'] = (statistics[rating_count] / statistics['count'] * 100).round(2)

# Calculating the 'safe_index' as the sum of the percentages of ratings 2 and 3
statistics['safe_index'] = statistics['rating_3'] + statistics['rating_2']

se_df_summary = statistics
se_df_summary.head()

Unnamed: 0_level_0,count,rating_cv,rating_0_count,rating_1_count,rating_2_count,rating_3_count,rating_0,rating_1,rating_2,rating_3,safe_index
scene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
01_SE_A_1,315,0.715844,68,114,92,41,21.59,36.19,29.21,13.02,42.23
01_SE_A_11,314,0.725388,67,148,74,25,21.34,47.13,23.57,7.96,31.53
01_SE_A_12,283,0.636821,43,125,84,31,15.19,44.17,29.68,10.95,40.63
01_SE_A_13,335,0.622182,46,141,99,49,13.73,42.09,29.55,14.63,44.18
01_SE_A_14,338,0.725376,71,144,84,39,21.01,42.6,24.85,11.54,36.39


# Check HOW many reviewers score the same scenes
if reviewer_1 rates e.g. scenes 1 and 2, how many recent reviewers are there who also rated both of these scenes?

In [81]:
from collections import defaultdict
import itertools


In [87]:
df = ratings

# Tworzenie słownika, gdzie klucze to scene_id, a wartości to zestawy osób, które oceniły daną scenę
scene_reviewers = defaultdict(set)
for scene, reviewer in zip(df['scene_id'], df['sessionID']):
    scene_reviewers[scene].add(reviewer)

# Inicjowanie list do przechowywania wyników
result = []

# Obliczenie liczby wspólnych osób i tworzenie wynikowej tabeli
for scene1, scene2 in itertools.combinations(scene_reviewers.keys(), 2):
    common_reviewers_count = len(scene_reviewers[scene1] & scene_reviewers[scene2])
    total_reviewers_scene1 = len(scene_reviewers[scene1])
    total_reviewers_scene2 = len(scene_reviewers[scene2])
    total_reviewers_both_scenes = len(scene_reviewers[scene1] | scene_reviewers[scene2])
    result.append([scene1, scene2, common_reviewers_count, total_reviewers_scene1, total_reviewers_scene2, total_reviewers_both_scenes])

# Tworzenie ramki danych z wynikami
result_df = pd.DataFrame(result, columns=['Scene1', 'Scene2', 'Common_Reviewers', 'Total_Reviewers_Scene1', 'Total_Reviewers_Scene2', 'Total_Reviewers_Both_Scenes'])

In [84]:
result_df.sort_values(by = 'Common_Reviewers')

Unnamed: 0,Scene1,Scene2,Common_Reviewers,Total_Reviewers_Scene1,Total_Reviewers_Scene2,Total_Reviewers_Both_Scenes
3013952,01_MS_A_931,01_CP_C_810,0,91,109,200
3885597,01_MS_C_1112,01_MS_C_530,0,98,118,216
1382171,01_CP_C_878,01_MS_A_569,0,103,102,205
2829145,01_CP_P_884,01_CP_C_514,0,413,141,554
1382169,01_CP_C_878,01_CP_C_94,0,103,125,228
...,...,...,...,...,...,...
179365,01_SE_C_13,01_SE_C_18,35,448,504,917
4547756,01_CP_C_626,01_CP_C_540,35,374,340,679
4551660,01_CP_C_540,01_CP_C_529,36,340,248,552
4514878,01_MS_C_1335,01_CP_C_626,39,415,374,750
