In [1]:
import pandas as pd
import sqlite3

In [2]:
try:
    connection = sqlite3.connect("../data/checking-logs.sqlite")
    print("Successfully connected to database")
except FileNotFoundError as e:
    print("File not found")
    raise e

Successfully connected to database


In [3]:
qwery = """
    SELECT
        uid,
        MIN((d.deadlines - STRFTIME('%s', ts.first_commit_ts)) / 3600.0) as min_delta_hours
    FROM test ts
    JOIN deadlines d ON ts.labname = d.labs
    WHERE ts.labname != 'project1'
    GROUP BY uid
    ORDER BY min_delta_hours ASC
    LIMIT 1
        """
df_min = pd.read_sql(qwery, connection)

print("Minimal deltahours")
print(df_min)

Minimal deltahours
       uid  min_delta_hours
0  user_25           2.8675


In [4]:
qwery = """
    SELECT
        uid,
        MAX((d.deadlines - STRFTIME('%s', ts.first_commit_ts)) / 3600.0) as max_delta_hours
    FROM test ts
    JOIN deadlines d ON ts.labname = d.labs
    WHERE ts.labname != 'project1'
    GROUP BY uid
    ORDER BY max_delta_hours ASC
    LIMIT 1
        """
df_max = pd.read_sql(qwery, connection)

print("Maximum delta hours")
print(df_max)

Maximum delta hours
       uid  max_delta_hours
0  user_18        10.973611


In [5]:
query_avg = """
SELECT AVG((julianday(deadline_dt) - julianday(first_commit_ts)) * 24) as avg_delta_hours
FROM (
    SELECT
        t.uid,
        t.labname,
        t.first_commit_ts,
        datetime(d.deadlines, 'unixepoch') as deadline_dt
    FROM test t
    JOIN deadlines d ON t.labname = d.labs
    WHERE t.labname != 'project1'
)
"""

df_avg = pd.read_sql_query(query_avg, connection)
print(f"\nAverage delta (df_avg):")
print(df_avg)


Average delta (df_avg):
   avg_delta_hours
0        89.687686


In [6]:
query_correlation = """
SELECT
    t.uid,
    AVG((julianday(deadline_dt) - julianday(first_commit_ts)) * 24) as avg_diff,
    COUNT(CASE WHEN t.first_view_ts IS NOT NULL THEN 1 END) as pageviews
FROM (
    SELECT
        t.uid,
        t.labname,
        t.first_commit_ts,
        t.first_view_ts,
        datetime(d.deadlines, 'unixepoch') as deadline_dt
    FROM test t
    JOIN deadlines d ON t.labname = d.labs
    WHERE t.labname != 'project1'
) t
GROUP BY t.uid
"""

views_diff = pd.read_sql_query(query_correlation, connection)
print(f"\nviews_diff data:")
print(views_diff.head())
print(f"Shape: {views_diff.shape}")


views_diff data:
       uid    avg_diff  pageviews
0   user_1   65.119644          5
1  user_10   75.242310          5
2  user_14  159.568696          3
3  user_17   62.207514          5
4  user_18    6.367907          3
Shape: (11, 3)


In [7]:
correlation = views_diff[['pageviews', 'avg_diff']].corr()
correlation_coefficient = correlation.loc['pageviews', 'avg_diff']

print(f"\nCorrelation coefficient between pageviews and avg_diff: {correlation_coefficient:.4f}")
print(f"\nFull correlation matrix:")
print(correlation)

# Interpretation of the correlation
if correlation_coefficient < 0:
    print(f"\nThere is a negative correlation ({correlation_coefficient:.4f})")
elif correlation_coefficient > 0:
    print(f"\nThere is a positive correlation ({correlation_coefficient:.4f})")
else:
    print(f"\nThere is no correlation ({correlation_coefficient:.4f})")
    print("This suggests no relationship between pageviews and time differences.")


Correlation coefficient between pageviews and avg_diff: -0.1184

Full correlation matrix:
           pageviews  avg_diff
pageviews   1.000000 -0.118374
avg_diff   -0.118374  1.000000

There is a negative correlation (-0.1184)


In [8]:
connection.close()
print(f"Database connection closed successfully\n")

Database connection closed successfully

