In [3]:
import psycopg2
import pandas as pd
from django.conf import settings

In [4]:
connection = psycopg2.connect(user = settings.DATABASES["default"]["USER"],
                                  password = settings.DATABASES["default"]["PASSWORD"],
                                  host = settings.DATABASES["default"]["HOST"],
                                  port = settings.DATABASES["default"]["PORT"],
                                  database = settings.DATABASES["default"]["NAME"])
connection.autocommit=True
cursor = connection.cursor()

In [6]:
create_table_query = '''CREATE TABLE EDM2020_2020_02_19
    (EXPERIMENT_ID SERIAL PRIMARY KEY,
    DATASET TEXT NOT NULL,
    Q_MATRIX REAL ARRAY NOT NULL,
    X REAL ARRAY NULL,
    SP REAL ARRAY NOT NULL,
    X_TEST REAL ARRAY NULL,
    SP_TEST REAL ARRAY NOT NULL,
    SP_HAT REAL ARRAY NOT NULL,
    SK_HAT REAL ARRAY NOT NULL,
    Q_MATRIX_HAT REAL ARRAY NOT NULL,
    MU REAL NULL,
    CONCEPTS INTEGER NOT NULL,
    ATTEMPTS_TRAIN INTEGER NOT NULL,
    METHOD TEXT NOT NULL,
    Q_MATRIX_ERROR REAL NOT NULL,
    Q_MATRIX_RMSE REAL NOT NULL,
    RECONSTRUCTION_ERROR REAL NOT NULL,
    TRAIN_ERROR REAL NOT NULL,
    TEST_ERROR REAL NOT NULL,
    TRAIN_AUC REAL NULL,
    SECONDS INTEGER NOT NULL)
    '''

cursor.execute(create_table_query)
connection.commit()

In [5]:
create_table_query = '''CREATE TABLE EXPERIMENTS_2020_04_15
    (EXPERIMENT_ID SERIAL PRIMARY KEY,
    DATASET TEXT NOT NULL,
    X REAL ARRAY NOT NULL,
    Y REAL ARRAY NOT NULL,
    VECTORIZER TEXT NOT NULL,
    IS_BINARY BOOLEAN NOT NULL,
    MIN_DF REAL NOT NULL,
    NGRAMS INTEGER ARRAY NOT NULL,
    TOKEN INTEGER NOT NULL,
    DISTANCE TEXT NOT NULL,
    K INTEGER NOT NULL,
    METHOD TEXT NOT NULL,
    MODEL  bytea NOT NULL,
    CLUSTERING_TIME INTEGER NOT NULL,
    COHERENCE_SAMPLES5 REAL ARRAY NOT NULL,
    COHERENCE_SAMPLES10 REAL ARRAY NOT NULL,
    COHERENCE_SAMPLES15 REAL ARRAY NOT NULL,
    COHERENCE_MED5 REAL NOT NULL,
    COHERENCE_MED10 REAL NOT NULL,
    COHERENCE_MED15 REAL NOT NULL,
    COHERENCE_STD5 REAL NOT NULL,
    COHERENCE_STD10 REAL NOT NULL,
    COHERENCE_STD15 REAL NOT NULL,
    COHERENCE_TIME REAL NOT NULL,
    COHERENCE_K5 INTEGER NOT NULL,
    COHERENCE_K10 INTEGER NOT NULL,
    COHERENCE_K15 INTEGER NOT NULL)
    '''

cursor.execute(create_table_query)
connection.commit()

In [54]:
create_table_query = '''CREATE TABLE EXPERIMENTS_STABILITY_STATISTICS
    (STABILITY_ID SERIAL PRIMARY KEY,
    VECTORIZER TEXT NOT NULL,
    IS_BINARY BOOLEAN NOT NULL,
    MIN_DF REAL NOT NULL,
    DISTANCE TEXT NOT NULL,
    K INTEGER NOT NULL,
    METHOD TEXT NOT NULL,
    SAMPLES REAL ARRAY NOT NULL,
    MEAN REAL NOT NULL,
    STD REAL NOT NULL,
    MEDIAN REAL NOT NULL,
    IQR REAL NOT NULL
    )
    '''

cursor.execute(create_table_query)
connection.commit()

In [4]:
cols = ['dataset', 'k', 'min_df', 'is_binary', 'vectorizer', 'method', 'distance', 'clustering_time', 'coherence_time']
select_query = '''SELECT %s FROM EXPERIMENTS_STABILITY''' % ", ".join(cols)
cursor.execute(select_query)
r = cursor.fetchall()
# print(r)

In [5]:
df = pd.DataFrame(r, columns=cols)

In [6]:
df['total_time'] = df.sum(axis=1)

In [7]:
df[(df['dataset'].str.contains('dataset'))].sort_values(['k','min_df', 'is_binary','vectorizer'])
#    & (df['vectorizer'] == 'TfidfVectorizer') & (df['is_binary'] == True) & (df['distance'] == 'euclidean')
# df

Unnamed: 0,dataset,k,min_df,is_binary,vectorizer,method,distance,clustering_time,coherence_time,total_time
29,dataset0,2,0.05,False,CountVectorizer,nmf,euclidean,0,0.023457,2.073457
30,dataset0,2,0.05,False,CountVectorizer,lda,euclidean,0,0.012981,2.062981
1587,dataset1,2,0.05,False,CountVectorizer,nmf,euclidean,0,0.033381,2.083381
1599,dataset1,2,0.05,False,CountVectorizer,lda,euclidean,0,0.013175,2.063175
3270,dataset2,2,0.05,False,CountVectorizer,nmf,euclidean,0,0.033350,2.083350
3282,dataset2,2,0.05,False,CountVectorizer,lda,euclidean,0,0.011714,2.061714
5125,dataset3,2,0.05,False,CountVectorizer,nmf,euclidean,0,0.032648,2.082648
5126,dataset3,2,0.05,False,CountVectorizer,lda,euclidean,0,0.013141,2.063141
6648,dataset4,2,0.05,False,CountVectorizer,nmf,euclidean,0,0.031143,2.081143
6812,dataset4,2,0.05,False,CountVectorizer,lda,euclidean,0,0.013102,2.063102


In [293]:
# c5_df[['min_df', 'is_binary', 'vectorizer', 'method', 'distance', 'k']].drop_duplicates()

Unnamed: 0,min_df,is_binary,vectorizer,method,distance,k
0,0.05,True,TfidfVectorizer,lda,euclidean,13
1,0.05,True,TfidfVectorizer,lda,euclidean,11
2,0.35,True,NCutVectorizer,nmf,euclidean,7
3,0.40,True,CountVectorizer,nmf,euclidean,13
4,0.50,False,NCutVectorizer,nmf,euclidean,12
5,0.45,False,NCutVectorizer,nmf,euclidean,15
6,0.50,False,CountVectorizer,nmf,euclidean,11
7,0.05,True,CountVectorizer,nmf,euclidean,15
8,0.20,False,CountVectorizer,nmf,euclidean,15
9,0.05,True,CountVectorizer,lda,euclidean,13


In [3]:
drop_query = '''DROP TABLE EXPERIMENTS_2020_01_27'''
cursor.execute(drop_query)
connection.commit()

In [295]:
create_results_table = '''CREATE TABLE EXPERIMENTS_SOLUTION_RESULT
    (EXPERIMENT_ID INTEGER REFERENCES EXPERIMENTS(EXPERIMENT_ID),
    GAP_K INTEGER NOT NULL,
    GAP_K2 INTEGER NOT NULL,
    SILHOUETTE_K INTEGER NOT NULL,
    SILHOUETTE_K2 INTEGER NOT NULL,
    COHERENCE_K5 INTEGER NOT NULL,
    COHERENCE_K2_5 INTEGER NOT NULL,
    COHERENCE_K10 INTEGER NOT NULL,
    COHERENCE_K2_10 INTEGER NOT NULL,
    COHERENCE_K15 INTEGER NOT NULL,
    COHERENCE_K2_15 INTEGER NOT NULL)
    '''

cursor.execute(create_results_table)
connection.commit()

In [296]:
alter_results_table = """ALTER TABLE EXPERIMENTS_SOLUTION_RESULT
 ADD CONSTRAINT UC_SOLUTION_RESULT UNIQUE (EXPERIMENT_ID)"""

cursor.execute(alter_results_table)
connection.commit()

In [30]:
# cols_result = ["gap_k", "gap_k2", "silhouette_k", "silhouette_k2", "coherence_k", "coherence_k2"]
# cols_result_sql = ["experiments_result.%s" % item for item in cols_result]
cols_experiment = ['experiment_id', 'k', 'dataset', 'min_df', 'is_binary', 'vectorizer', 'method', 'distance', 'coherence_med5', 'coherence_med10','coherence_med15']
cols_experiment_sql = ["EXPERIMENTS_STABILITY.%s" % item for item in cols_experiment]

select_query = '''SELECT %s FROM EXPERIMENTS_STABILITY where dataset like 'dataset%%' order by coherence_med5 desc, coherence_med15 desc, coherence_med10 desc  '''  % ", ".join(cols_experiment_sql)
# select_query = '''SELECT %s FROM EXPERIMENTS_STABILITY where dataset like 'dataset%%' order by coherence_med10 desc, coherence_med5 desc, coherence_med15 desc  '''  % ", ".join(cols_experiment_sql)
# select_query = '''SELECT %s FROM EXPERIMENTS_STABILITY where dataset like 'dataset%%' order by coherence_med15 desc, coherence_med5 desc, coherence_med10 desc  '''  % ", ".join(cols_experiment_sql)
cursor.execute(select_query)
r = cursor.fetchall()

In [31]:
c5_df = pd.DataFrame(r, columns=cols_experiment).dropna()

In [32]:
c5_df

Unnamed: 0,experiment_id,k,dataset,min_df,is_binary,vectorizer,method,distance,coherence_med5,coherence_med10,coherence_med15
11,6178,10,dataset3,0.05,True,NCutVectorizer,lda,euclidean,0.009950,-0.130145,-0.235746
12,7950,14,dataset4,0.10,False,NCutVectorizer,lda,euclidean,0.009950,-0.254308,-0.672025
13,14572,7,dataset8,0.05,True,NCutVectorizer,lda,euclidean,0.009950,-0.122830,-1.308660
14,6212,13,dataset3,0.05,False,NCutVectorizer,lda,euclidean,0.007469,0.007469,-0.331439
15,11362,12,dataset6,0.15,False,NCutVectorizer,lda,euclidean,0.006639,-0.047629,-0.663792
16,9534,8,dataset5,0.05,True,NCutVectorizer,lda,euclidean,0.006639,-0.307062,-1.048600
17,7778,12,dataset4,0.45,False,TfidfVectorizer,lda,euclidean,0.005974,-0.364337,-1.638310
18,8288,15,dataset4,0.40,False,NCutVectorizer,lda,euclidean,0.005808,-0.046629,-0.182594
19,16324,15,dataset9,0.10,True,NCutVectorizer,lda,euclidean,0.005689,-0.093200,-0.480662
20,6238,12,dataset3,0.10,True,NCutVectorizer,lda,euclidean,0.005689,-0.093200,-0.645488


In [29]:
c10_df

Unnamed: 0,experiment_id,k,dataset,min_df,is_binary,vectorizer,method,distance,coherence_med5,coherence_med10,coherence_med15
11,6178,10,dataset3,0.05,True,NCutVectorizer,lda,euclidean,0.009950,-0.130145,-0.235746
12,7950,14,dataset4,0.10,False,NCutVectorizer,lda,euclidean,0.009950,-0.254308,-0.672025
13,14572,7,dataset8,0.05,True,NCutVectorizer,lda,euclidean,0.009950,-0.122830,-1.308660
14,6212,13,dataset3,0.05,False,NCutVectorizer,lda,euclidean,0.007469,0.007469,-0.331439
15,11362,12,dataset6,0.15,False,NCutVectorizer,lda,euclidean,0.006639,-0.047629,-0.663792
16,9534,8,dataset5,0.05,True,NCutVectorizer,lda,euclidean,0.006639,-0.307062,-1.048600
17,7778,12,dataset4,0.45,False,TfidfVectorizer,lda,euclidean,0.005974,-0.364337,-1.638310
18,8288,15,dataset4,0.40,False,NCutVectorizer,lda,euclidean,0.005808,-0.046629,-0.182594
19,16324,15,dataset9,0.10,True,NCutVectorizer,lda,euclidean,0.005689,-0.093200,-0.480662
20,6238,12,dataset3,0.10,True,NCutVectorizer,lda,euclidean,0.005689,-0.093200,-0.645488


In [21]:
c15_df

Unnamed: 0,experiment_id,k,dataset,min_df,is_binary,vectorizer,method,distance,coherence_med5,coherence_med10,coherence_med15
0,15144,13,dataset9,0.05,True,CountVectorizer,lda,euclidean,0.004988,0.004988,0.004988
1,8424,13,dataset5,0.05,True,CountVectorizer,lda,euclidean,0.003328,0.003328,0.003328
2,11778,10,dataset7,0.05,True,CountVectorizer,lda,euclidean,0.002912,0.002912,0.002912
3,3440,13,dataset2,0.10,True,CountVectorizer,lda,euclidean,0.002497,0.002497,0.002497
4,14,8,dataset0,0.05,True,CountVectorizer,lda,euclidean,0.003328,0.003328,0.001665
5,10102,12,dataset6,0.05,True,CountVectorizer,lda,euclidean,0.001665,0.001665,-0.010445
6,8418,10,dataset5,0.05,True,CountVectorizer,lda,euclidean,0.002081,0.001332,-0.013532
7,12058,10,dataset7,0.30,True,CountVectorizer,lda,euclidean,0.001832,0.001457,-0.014226
8,6744,13,dataset4,0.05,True,CountVectorizer,lda,euclidean,0.002497,0.002497,-0.015455
9,6747,15,dataset4,0.05,True,CountVectorizer,nmf,euclidean,0.003328,0.003328,-0.016571


In [43]:
c5_df[c5_df['experiment_id'] == 8424]

Unnamed: 0,experiment_id,k,dataset,min_df,is_binary,vectorizer,method,distance,coherence_med5,coherence_med10,coherence_med15
153,8424,13,dataset5,0.05,True,CountVectorizer,lda,euclidean,0.003328,0.003328,0.003328


In [65]:
import math
from itertools import combinations
TOP = 0.01
total = c5_df.shape[0]
rows = math.ceil(total * TOP)
print(rows)
ids = {}
# ids['sil'] = set(sil_df['experiment_id'].head(rows).values.tolist())
ids['c5'] = set(c5_df['experiment_id'].head(rows).values.tolist())
ids['c10'] = set(c10_df['experiment_id'].head(rows).values.tolist())
ids['c15'] = set(c15_df['experiment_id'].head(rows).values.tolist())
# print(len(ids['sil']))
combs = combinations(ids.keys(), 2)
for comb in combs:
    print("Merge set for %s and %s" % comb)
    merge_ids = ids[comb[0]] & ids[comb[1]]
    print("Size: %d" % len(merge_ids))
    print("Ids: %s" % merge_ids)
    print()
    
combs = combinations(ids.keys(), 3)
for comb in combs:
    print("Merge set for %s, %s and %s" % comb)
    merge_ids = ids[comb[0]] & ids[comb[1]] & ids[comb[2]]
    print("Size: %d" % len(merge_ids))
    print("Ids: %s" % merge_ids)
    print()

168
Merge set for c5 and c10
Size: 168
Ids: {6657, 6659, 11272, 11787, 9741, 14, 9744, 11280, 12311, 27, 1564, 10270, 13855, 6178, 9763, 6180, 6182, 551, 2095, 11311, 6711, 6713, 13377, 6210, 13379, 6212, 16461, 1614, 1617, 1619, 6747, 4701, 6238, 8288, 11362, 7778, 3172, 16483, 7271, 12906, 8815, 4723, 13431, 13433, 6271, 1671, 1673, 12940, 8337, 8339, 13467, 11421, 1182, 13991, 1707, 3246, 15535, 11443, 7860, 2231, 7866, 188, 3775, 12991, 8391, 8393, 11470, 1231, 15057, 15059, 14036, 3297, 3299, 8424, 9962, 8427, 14572, 6381, 14576, 13046, 8951, 10495, 2816, 6403, 2822, 15111, 4872, 15113, 2826, 7950, 7951, 14100, 3351, 3353, 14620, 10017, 10019, 15142, 15144, 15146, 15147, 13101, 14640, 15671, 3384, 3387, 2876, 1341, 9534, 13123, 3911, 9544, 9548, 5455, 14671, 1363, 10071, 10073, 16732, 2911, 16737, 16739, 868, 9580, 4977, 4979, 16246, 10107, 8061, 9598, 16260, 10631, 4494, 12175, 8083, 13206, 16791, 16793, 415, 9631, 5031, 5033, 11697, 4530, 11699, 16310, 16314, 14781, 16324, 4552,

In [73]:
c5_df[(c5_df['min_df'] == 0.05) & (c5_df['k'] == 12)]

Unnamed: 0,experiment_id,k,dataset,min_df,is_binary,vectorizer,method,distance,coherence_med5,coherence_med10,coherence_med15
58,2822,12,dataset1,0.05,True,NCutVectorizer,lda,euclidean,0.004988,-0.161613,-0.198245
66,6182,12,dataset3,0.05,True,NCutVectorizer,lda,euclidean,0.004988,0.004988,-0.242389
69,6210,12,dataset3,0.05,False,NCutVectorizer,lda,euclidean,0.004988,-0.208818,-0.301111
70,4530,12,dataset2,0.05,False,NCutVectorizer,lda,euclidean,0.004988,0.004988,-0.373820
104,11222,12,dataset6,0.05,True,NCutVectorizer,lda,euclidean,0.004988,-0.120204,-1.510240
148,5090,12,dataset3,0.05,False,CountVectorizer,lda,euclidean,0.003742,-0.072387,-0.141878
167,15142,12,dataset9,0.05,True,CountVectorizer,lda,euclidean,0.003328,-0.038266,-0.109791
190,12902,12,dataset7,0.05,True,NCutVectorizer,lda,euclidean,0.003328,0.003328,-0.187307
394,9542,12,dataset5,0.05,True,NCutVectorizer,lda,euclidean,0.002598,-0.055691,-0.488447
414,5062,12,dataset3,0.05,True,CountVectorizer,lda,euclidean,0.002497,0.001998,-0.016571


In [1]:
## Cleaning database
last_id = 132
problems = Problem.objects.filter(id__gt=last_id)
# solutions_obj = Solution.objects.filter(problem__in=problems).update(ignore=True)
print("Problems to be ignored: %d" % problems.count())

problems = Problem.objects.filter(id__lte=last_id)
# problems = Problem.objects.all()
print("Problems to be used: %d" % problems.count())

solutions_obj = Solution.objects.filter(problem__in=problems, ignore=False).order_by('id')
# solutions_obj = Solution.objects.all().order_by('id')
print("Solutions to be used: %d" % solutions_obj.count())

docs_id = []
questions = []
solutions = []

# Fill separated structures
for sol in solutions_obj:
    docs_id.append(sol.id)
    questions.append(sol.problem.content)
    solutions.append(sol.content)

print("Got %d documents" %(solutions_obj.count()))

Problems to be ignored: 591
Problems to be used: 132
Solutions to be used: 54
Got 54 documents


In [29]:
def get_where_items(exp_id):
    cols = ["vectorizer", "min_df", "is_binary", "distance", "method", "dataset", "k", "model", "X"]
    query = "SELECT %s from EXPERIMENTS_STABILITY where experiment_id = %s" % (", ".join(cols), exp_id) 
    cursor.execute(query)
    where_items = cursor.fetchall()
    return where_items

def analyze(solutions, where_items, exp_id):
    v = eval(where_items[0][0])
    m = where_items[0][1]
    b = where_items[0][2]
    dist = where_items[0][3]
    method = where_items[0][4]
    k = where_items[0][6]
    model_db = pickle.loads(base64.b64decode(where_items[0][7]))
    X = np.asarray(where_items[0][8])

    train_data_features, vectorizer, feature_names = create_bag_of_words(solutions, v, binary=b, min_df=m)
    clustering = Clustering(train_data_features, k, metric=dist)
    clustering.seed = model_db.random_state
    
    model, document_topic, word_topic = getattr(clustering, method)()
    
#     savefig='problems_20190101_%s_%s_exp_%s_%s_document_topic_distribution' % (method, clustering.seed, 
#                                                                                    exp_id, clustering.k)
#     clustering.plot_topic_distribution(title="Topic distribution per document", ylabel="Document ID", savefig=savefig, cmap='Blues')

#     print("Count per class:")
#     clusters = clustering.document_topic.argmax(axis=1)
#     counts = np.unique(clusters, return_counts=True)
#     print(counts)
    return clustering, method, feature_names, model

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import base64
import numpy as np
from tokenizer import create_bag_of_words
from clustering import Clustering


# Get experiment conditions
exp_id = 15142
where_items = get_where_items(exp_id)
print("Conditions")
print(where_items[0][0:7])

v = where_items[0][0]
m = where_items[0][1]
b = where_items[0][2]
dist = where_items[0][3]
method = where_items[0][4]
k = where_items[0][6]

query = "SELECT experiment_id from EXPERIMENTS_STABILITY where "
where_clause = [("vectorizer", v), ("min_df", m), ("is_binary", b), 
                ("distance", dist), ("method", method), ("k", k)]
query += " AND ".join(["%s = '%s'" % item for item in where_clause])
cursor.execute(query)
r = cursor.fetchall()

clusters = []

for exp_id in r:
    where_items = get_where_items(exp_id[0])
    print("Conditions")
    print(where_items[0][0:7])


    clustering, method, feature_names, model = analyze(solutions, where_items, exp_id)
    clusters.append(clustering.document_topic.argmax(axis=1))

Conditions
('CountVectorizer', 0.05, True, 'euclidean', 'lda', 'dataset9', 12)
Conditions
('CountVectorizer', 0.05, True, 'euclidean', 'lda', 'dataset0', 12)
Conditions
('CountVectorizer', 0.05, True, 'euclidean', 'lda', 'dataset1', 12)
Conditions
('CountVectorizer', 0.05, True, 'euclidean', 'lda', 'dataset2', 12)
Conditions
('CountVectorizer', 0.05, True, 'euclidean', 'lda', 'dataset3', 12)
Conditions
('CountVectorizer', 0.05, True, 'euclidean', 'lda', 'dataset4', 12)
Conditions
('CountVectorizer', 0.05, True, 'euclidean', 'lda', 'dataset5', 12)
Conditions
('CountVectorizer', 0.05, True, 'euclidean', 'lda', 'dataset6', 12)
Conditions
('CountVectorizer', 0.05, True, 'euclidean', 'lda', 'dataset7', 12)
Conditions
('CountVectorizer', 0.05, True, 'euclidean', 'lda', 'dataset8', 12)
Conditions
('CountVectorizer', 0.05, True, 'euclidean', 'lda', 'dataset9', 12)


In [31]:
clusters

[array([ 5,  5, 10,  5,  7,  5,  5,  1,  1,  1,  5,  1,  5,  0,  0,  1,  5,
         4,  7,  2,  4,  7,  7,  5,  7,  7, 10, 10,  4,  1,  4,  2,  0,  1,
         9,  1,  2,  0,  1,  5,  5,  5,  1,  1,  5,  5,  4,  9, 11, 11,  5,
         4,  2,  5]),
 array([ 0,  8,  2,  0,  0,  0,  0,  3,  3,  1,  2,  9, 11,  8,  7,  1,  2,
        11,  7,  0,  0,  7,  7,  2,  7,  7,  2,  2, 11,  9, 11, 11, 11,  1,
        10,  9, 10,  1,  1,  8,  8,  8,  1,  1, 11, 11, 10, 10,  4, 11,  8,
         8, 11,  0]),
 array([ 7, 10,  2,  7,  7,  7,  7, 11, 11,  7,  2,  7,  7, 11, 11,  7,  2,
        11,  8,  7,  7,  8,  8,  2,  8,  8,  2,  2, 11,  7, 11,  7,  7,  7,
         7,  7, 11,  7,  7, 10, 10, 11,  7,  0,  2,  2,  1,  1,  1, 10, 11,
        11,  2,  7]),
 array([10, 10,  3, 10, 10, 10, 10,  6,  6,  0,  3,  0, 10,  3,  6,  0,  7,
         3,  3,  9,  9,  2,  2,  3,  3,  2,  3,  3,  6,  0,  6,  9,  3,  0,
         0,  0,  9,  9,  0,  8,  8,  8,  0,  4,  1,  1,  0,  0,  9,  5,  8,
         9,  9, 10]),


In [50]:
from itertools import combinations
from sklearn.metrics.cluster import normalized_mutual_info_score


scores = []
ij = combinations(range(len(clusters)), 2)
for i, j in ij:
    scores.append(normalized_mutual_info_score(clusters[i], clusters[j], average_method='arithmetic'))

In [41]:
from scipy import stats
stats.describe(np.asarray(scores))

DescribeResult(nobs=45, minmax=(0.36006417231978466, 0.6798696013086902), mean=0.5361073319921684, variance=0.006867014084198572, skewness=-0.4436793753151618, kurtosis=-0.7683020197444734)

In [51]:
stats.describe(np.asarray(scores))

DescribeResult(nobs=45, minmax=(0.35750932735287044, 0.6786866711797224), mean=0.5345179878257761, variance=0.0068491350893785715, skewness=-0.4394407907629835, kurtosis=-0.7510573297358971)

In [52]:
scores = np.asarray(scores)
mean = scores.mean()
std = scores.std()
median = np.median(scores)
q75, q25 = np.percentile(scores, [75, 25])
iqr = q75 - q25

print(mean)
print(std)
print(median)
print(iqr)

0.5345179878257761
0.0818347853140239
0.5509299593872623
0.10702881637853245
