In [None]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import pandas as pd
import os
import plotly.graph_objs as go

In [None]:
# Plotly display configuration passed to every fig.show() call below:
# controls what the mode-bar "download plot" button exports.
config = dict(
    toImageButtonOptions=dict(
        format='png',
        filename='custom_image',
        height=500,
        width=700,
        scale=6,
    ),
)

In [None]:
# Avg NDCG scores
#
# Loads one per-query evaluation CSV per collection percentage, averages every
# score column over the queries, and plots one line per model.

path = '/content/drive/MyDrive/Msc Data Science/Information retrieval/Plot_project/evaluation_results'
times = [0.1,1,2,5,10,20]

# Build every file name from its percentage (files are named
# evaluation_results_<percentage>.csv). Pairing os.listdir() results with
# `times` by position is unsafe: listdir returns entries in arbitrary order,
# so a frame could end up stored under the wrong percentage.
all_files = [os.path.join(path, f'evaluation_results_{time}.csv') for time in times]
dfs = {time: pd.read_csv(file) for time, file in zip(times, all_files)}

print(dfs[0.1].mean())

# Replace each per-query frame by a single-row frame of column means.
# list(...) snapshots the items so the dict can be reassigned while looping.
for key, raw_df in list(dfs.items()):
  mean_df = pd.DataFrame(raw_df.dropna().mean()).transpose()
  # 'bm25_impl_one' is excluded from the plot
  dfs[key] = mean_df.drop('bm25_impl_one',axis=1)

# Stack the one-row frames in ascending percentage order
sorted_keys = list(sorted(dfs.keys()))
sorted_values = [dfs[key] for key in sorted_keys]
df_melted = pd.concat(sorted_values)
df = df_melted.drop('qid',axis=1)
# Label the rows with the keys in the exact order they were concatenated
df["Percentage of Documents from Collection"] = sorted_keys

# Long format: one row per (percentage, model) pair, as px.line expects
df = df.melt(id_vars=['Percentage of Documents from Collection'], var_name='Model evaluation', value_name='Average NDCG Score over Queries')

fig = px.line(df, x='Percentage of Documents from Collection', y='Average NDCG Score over Queries', color='Model evaluation', title='Line Chart of the NDCG Scores of Models on Percentage of Documents')

fig.write_html('NDCG_models_times.html')
fig.show(config=config)

qid              832142.400000
ql_ndcg               0.707919
bm25_ndcg             0.709441
cos_sim_ndcg          0.619865
bm25Okapi             0.699273
bm25_impl_one         0.709441
dtype: float64


In [None]:
# Average NDCG per model for every collection percentage, read file by file.
times = [0.1,1,2,5,10,20]
model_columns = ['ql_ndcg', 'bm25_ndcg', 'cos_sim_ndcg', 'bm25Okapi']
times_df = pd.DataFrame(columns=['percentage_of_col', *model_columns])

for percentage in times:
  # Per-query scores for this percentage, with incomplete rows removed
  per_query = pd.read_csv(f'/content/drive/MyDrive/Msc Data Science/Information retrieval/Plot_project/evaluation_results/evaluation_results_{percentage}.csv').dropna()

  # Mean score of each model over all remaining queries
  df = per_query[model_columns].mean()

  # Append one row: the percentage followed by the four model means
  next_row = len(times_df.index)
  times_df.loc[next_row] = [percentage, *df.values]

print(times_df)

# Long format: one row per (percentage, model) pair, as px.line expects
df = times_df.melt(
    id_vars=['percentage_of_col'],
    var_name='Model evaluation',
    value_name='Average NDCG Score over Queries',
)
fig = px.line(
    df,
    x='percentage_of_col',
    y='Average NDCG Score over Queries',
    color='Model evaluation',
    title='Line Chart of the NDCG Scores of Models on Percentage of Documents',
)

fig.update_layout(
    yaxis_title='Average NDCG Score over Queries',
    xaxis_title='Percentage of Collection',
)
fig.write_html('NDCG_models_times.html')
fig.show(config=config)

   percentage_of_col   ql_ndcg  bm25_ndcg  cos_sim_ndcg  bm25Okapi
0                0.1  0.857346   0.861779      0.854007   0.861603
1                1.0  0.707919   0.709441      0.619865   0.699273
2                2.0  0.738637   0.783330      0.621311   0.785998
3                5.0  0.512768   0.563077      0.453591   0.553059
4               10.0  0.493970   0.518828      0.435294   0.514815
5               20.0  0.400302   0.458959      0.374228   0.448176


In [None]:
# Index creation time

df = pd.read_csv('/content/drive/MyDrive/Msc Data Science/Information retrieval/Plot_project/index_results.csv')
# One figure that holds a line for every timing column
fig = make_subplots()

# (CSV column, legend label) for each plotted series.
# Column spellings are kept exactly as used to index the frame.
timing_series = [
    # ('preprocessing_time', 'Preprocessing Time'),
    ('inverted_index_time', 'Inverted Index Time'),
    ('vercor_index_time', 'Vector Index Time'),
    ('existing_implemention_time', 'Existing Implementation Time'),
]
for column, label in timing_series:
    trace = go.Scatter(x=df['percentage_of_col'], y=df[column], mode='lines', name=label)
    fig.add_trace(trace)

fig.update_layout(
    height=800,
    width=600,
    title_text="Line Graphs of Percentage of Collection vs Processing Time",
    yaxis_title='Processing Time (sec)',
    xaxis_title='Percentage of Collection',
)

fig.write_html('percentage_col_time.html')

fig.show(config=config)


In [None]:
# Avg retrieval times

retrieval_times = pd.read_csv('/content/drive/MyDrive/Msc Data Science/Information retrieval/Plot_project/Times_models.csv')
# Drop the two columns that are not plotted, then reshape to long format:
# one row per (percentage, model) pair.
df = (
    retrieval_times
    .drop(columns=["Unnamed: 0", "BM25 initial implementation"])
    .melt(id_vars=['percentage_of_col'], var_name='Model', value_name='Processing Time')
)

fig = px.line(
    df,
    x='percentage_of_col',
    y='Processing Time',
    color='Model',
    title='Line Chart of Average Retrieval Times vs Percentage of Collection for Models',
)
fig.update_layout(
    yaxis_title='Retrieval Time (sec)',
    xaxis_title='Percentage of Collection',
)

fig.write_html('time_model.html')
fig.show(config=config)

In [None]:
# Index sizes

index_sizes = pd.read_csv('/content/drive/MyDrive/Msc Data Science/Information retrieval/Plot_project/sizes.csv')
# Long format: every column other than the percentage becomes a 'Model' series
df = index_sizes.melt(
    id_vars=['percentage_of_col'],
    var_name='Model',
    value_name='Index Size',
)

fig = px.line(
    df,
    x='percentage_of_col',
    y='Index Size',
    color='Model',
    title='Line Chart of Index size vs Percentage of Collection for Models',
)
fig.update_layout(
    yaxis_title='Index Size (MB)',
    xaxis_title='Percentage of Collection',
)

fig.write_html('index_size_model.html')
fig.show(config=config)

In [None]:
# Wilcoxon signed-rank test: load the per-query scores from
# evaluation_results_20.csv that the following cells compare pairwise.

import csv
import itertools

import numpy as np
import scipy
from scipy.stats import wilcoxon

evaluation_20 = pd.read_csv('/content/drive/MyDrive/Msc Data Science/Information retrieval/Plot_project/evaluation_results/evaluation_results_20.csv')
# Keep only rows where every column has a value
df = evaluation_20.dropna()
print(df.head())


       qid   ql_ndcg  bm25_ndcg  cos_sim_ndcg  bm25Okapi  bm25_impl_one
0  1048583  0.430677   0.430677      0.430677   0.430677       0.430677
1  1048585  1.000000   1.000000      0.386853   1.000000       1.000000
2   262156  0.177184   0.386853      0.170707   0.270238       0.386853
3        2  0.386853   0.430677      0.430677   0.430677       0.430677
4   524341  0.211541   0.297572      0.261725   0.299021       0.297572


In [None]:
# Copy each model's score column out of df into its own NumPy array
ql_ndcg, bm25_ndcg, cos_sim_ndcg, bm25Okapi = (
    np.array(df[column])
    for column in ("ql_ndcg", "bm25_ndcg", "cos_sim_ndcg", "bm25Okapi")
)

In [None]:
# (label, score array) for every model taking part in the comparison
scores = [
    ("ql_ndcg", ql_ndcg),
    ("bm25_ndcg", bm25_ndcg),
    ("cos_sim_ndcg", cos_sim_ndcg),
    ("bm25Okapi", bm25Okapi),
]

# Run a Wilcoxon test on every unordered pair of models and store
# (statistic, p-value) under the pair of labels.
tests = {}
for (name_a, values_a), (name_b, values_b) in itertools.combinations(scores, 2):
    outcome = wilcoxon(values_a, values_b, zero_method="zsplit")
    tests[(name_a, name_b)] = (outcome.statistic, outcome.pvalue)

print(tests)

{('ql_ndcg', 'bm25_ndcg'): (1006.0, 1.617986866879552e-07), ('ql_ndcg', 'cos_sim_ndcg'): (2086.5, 0.13118333736546933), ('ql_ndcg', 'bm25Okapi'): (1170.0, 2.9458778258043378e-06), ('bm25_ndcg', 'cos_sim_ndcg'): (1470.5, 0.0002835074473423671), ('bm25_ndcg', 'bm25Okapi'): (1899.0, 0.027771969422036455), ('cos_sim_ndcg', 'bm25Okapi'): (1512.5, 0.0004924247988884668)}


Code for the grouped bar chart of the NDCG scores (kept for reference, not executed)

```
import pandas as pd
import os

path = '/content/drive/MyDrive/Msc Data Science/Information retrieval/Plot_project/evaluation_results'
all_files = [os.path.join(path, f) for f in os.listdir(path) if f.endswith('.csv')]
dfs = {}
times = ["0.1","1","2","5","10"]
for i, file in enumerate(all_files):
    dfs[times[i]] = pd.read_csv(file)

for key, df in dfs.items():
  df = df.dropna()
  mean_df = pd.DataFrame(df.mean()).transpose()
  mean_df = mean_df.drop('bm25_impl_one',axis=1)
  df_melted = mean_df.melt(id_vars=['qid'], var_name=f'Model evaluation@{key}', value_name='Mean over queries')
  dfs[key] = mean_df

sorted_keys = list(sorted(dfs.keys()))
sorted_values = [dfs[key] for key in sorted_keys]
df_melted = pd.concat(sorted_values)
df = df_melted.drop('qid',axis=1)
df["Percentage of Documents from Collection"] = times

df = df.melt(id_vars=['Percentage of Documents from Collection'], var_name=f'Model evaluation', value_name='Average NDCG Score over Queries')

# create the bar chart
fig = px.bar(df, x='Percentage of Documents from Collection', y='Average NDCG Score over Queries', color='Model evaluation', title='Bar Chart of the NDCG Scores of Models on Percentage of Documents',barmode="group")

fig.write_html('NDCG_models_times.html')
fig.show(config=config)
```

