In [1]:
%load_ext autoreload
%autoreload

# Statistics

This notebook contains plots and other code to explore guesses of the players of the game.

### Table of Content:

* ['Guesses' stats](#guess-stats)
* [Ground truth related stats](#ground-truth-stats)


In [2]:
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np
%run ./prepare_data.ipynb






## <a class="anchor" id="guess-stats">Guesses stats</a>

### Most popular and least votes

In [3]:
# trim the long url string for readibility
def trim_url(df, df_idx, str_lst):
    df_simple = df.copy()
    for s in str_lst:
        df_simple[df_idx] = df_simple[df_idx].str.replace(s, '')
    return df_simple

#def highlight_rows(df,v):
#    if df.col > v:
#        return ['background-color: yellow']*df.shape[1]
#    else:
#        return ['background-color: white']*df.shape[1]
#df_guess_simple.style.apply(highlight_rows(0), axis=1)

In [4]:
yt_str = ['^[^_]*=','https://youtu.be/']

#print all votes
if ((df_guess['Current URL']).apply(len).max(axis=0) > 10):
    df_guess_simple = trim_url(df_guess,'Current URL',yt_str)
else:
    df_guess_simple = df_guess
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # printing all, more options can be specified also
    print(df_guess_simple)



    Current URL         Guess  votes
0   T5tGEv9cegY          Фуад      6
1   nszMBbCZKo4  Ali Huseynov      6
2   H9Z3_ifFheQ       Kamusik      6
3   yawlrsRDabA          Фуад      6
4   AxLH0lXEGAY         Girey      4
5   T8b2p9NWjdM         Aslan      4
6   9pBmkOB1P8U         Aslan      4
7   J3eeh15GXj0       Alesker      4
8   GipD4SbJaD0          Zaur      4
9   90cyDLcU-3g       Kamusik      4
10  T8b2p9NWjdM          Zaur      3
11  J3eeh15GXj0          Emin      3
12  GipD4SbJaD0          Emin      3
13  6ybd5rbQ5rU         Aslan      3
14  yawlrsRDabA         Girey      3
15  6ybd5rbQ5rU       Alesker      3
16  9pBmkOB1P8U       Alesker      3
17  6ybd5rbQ5rU          Emin      3
18  nszMBbCZKo4          Фуад      2
19  90cyDLcU-3g         Ирада      2
20  GipD4SbJaD0  Ali Huseynov      2
21  J3eeh15GXj0          Zaur      2
22  AxLH0lXEGAY          Zaur      2
23  H9Z3_ifFheQ        Rena A      2
24  9pBmkOB1P8U        Rena A      2
25  90cyDLcU-3g        Rena A      2
2

## <a class="anchor" id="ground-truth-stats">Ground truth related stats </a>

First, we compute the intersection between players/url column of ground truth df and guesses/url column of guess df, for all URLs.
Note that one guess is inherently incorrect (by the rules of the game).



In [5]:
# Correct guesses
correct_guess_df = pd.merge(df_raw,df2_raw, how='inner', left_on=['Current URL', 'Guess'], right_on=['URL','Player'])
#del correct_guess_df['Player_y'], correct_guess_df['URL']
correct_guess_df = correct_guess_df.drop(['Player_y', 'URL'], axis = 1)
correct_guess_df.rename(columns = {'Player_x':'Player','Current URL':'URL'}, inplace = True)

correct_guess_simple = trim_url(correct_guess_df, 'URL', yt_str)
print(correct_guess_simple)


           URL        Player    Guess
0  AxLH0lXEGAY          Zaur     Emin
1  H9Z3_ifFheQ         Girey  Alesker
2  6ybd5rbQ5rU         Girey  Rena B.
3  T8b2p9NWjdM         Girey   Rena A
4  nszMBbCZKo4          Emin     Zaur
5  9pBmkOB1P8U         Ирада    Girey
6  90cyDLcU-3g          Zaur  Kamusik
7  90cyDLcU-3g  Ali Huseynov  Kamusik
8  90cyDLcU-3g         Girey  Kamusik
9  90cyDLcU-3g         Ирада  Kamusik


In [6]:
# Correct guesses per player
#TODO: display this better

corr_guess_stats = correct_guess_df.groupby(by=['Player'])['Guess'].count()
corr_guess_stats = pd.merge(df2_raw, corr_guess_stats, on='Player',how='left').fillna(0)
del corr_guess_stats['URL']
print(corr_guess_stats)

          Player  Guess
0        Rena B.    0.0
1          Aslan    0.0
2   Ali Huseynov    1.0
3          Ирада    2.0
4          Girey    4.0
5           Фуад    0.0
6         Rena A    0.0
7        Alesker    0.0
8           Zaur    2.0
9           Emin    1.0
10       Kamusik    0.0


In [7]:
# Best and worst guessed players
#TODO: display this better

corr_player_stats = correct_guess_df.groupby(by=['URL'])['Guess'].count()
corr_player_stats = pd.merge(df2_raw, corr_player_stats, on='URL',how='left').fillna(0)
del corr_player_stats['URL']
#corr_player_stats['Guess'] = pd.to_numeric(corr_player_stats['Guess'])
corr_player_stats['Guess'] = corr_player_stats['Guess'].values.astype(int)
print(corr_player_stats)

          Player  Guess
0        Rena B.      1
1          Aslan      0
2   Ali Huseynov      0
3          Ирада      0
4          Girey      1
5           Фуад      0
6         Rena A      1
7        Alesker      1
8           Zaur      1
9           Emin      1
10       Kamusik      4


In [8]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # printing all, more options can be specified also
    print(total_votes)


          Player  votes
0        Alesker     14
1           Фуад     14
2        Kamusik     13
3           Zaur     13
4          Aslan     13
5          Girey     12
6           Emin     11
7   Ali Huseynov      9
8         Rena A      8
9          Ирада      7
10       Rena B.      7


In [9]:
def subtract(df1,df2,col_idx,col_minuend,col_subth):
    wrong_votes = pd.merge(df1,df2, on=[col_idx])
    wrong_votes["delta"] = (wrong_votes[col_minuend]) - wrong_votes[col_subth]
    wrong_votes = wrong_votes.drop([col_minuend, col_subth], axis = 1)
    wrong_votes.rename(columns = {'delta':col_minuend}, inplace = True)
    return wrong_votes

wrong_votes = subtract(total_votes,corr_player_stats,'Player','votes','Guess')
print(wrong_votes)

          Player  votes
0        Alesker     13
1           Фуад     14
2        Kamusik      9
3           Zaur     12
4          Aslan     13
5          Girey     11
6           Emin     10
7   Ali Huseynov      9
8         Rena A      7
9          Ирада      7
10       Rena B.      6
