In [251]:
import pandas as pd
import sqlite3

## create a connection to the database using the library sqlite3

In [264]:
db_connection = sqlite3.connect('../data/checking-logs.sqlite')

## get the schema of the table test

In [265]:
pd.io.sql.read_sql('PRAGMA table_info(test);', db_connection)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,uid,TEXT,0,,0
2,2,labname,TEXT,0,,0
3,3,checker_ts,TIMESTAMP,0,,0
4,4,first_view_ts,TIMESTAMP,0,,0


## get only 10 first rows of the table test to check how the table looks like

In [266]:
pd.io.sql.read_sql('select * from test limit 10', db_connection)

Unnamed: 0,index,uid,labname,checker_ts,first_view_ts
0,0,user_1,laba04,2020-04-26 17:06:18.462708,2020-04-26 21:53:59.624136
1,1,user_1,laba04s,2020-04-26 17:12:11.843671,2020-04-26 21:53:59.624136
2,2,user_1,laba05,2020-05-02 19:15:18.540185,2020-04-26 21:53:59.624136
3,3,user_1,laba06,2020-05-17 16:26:35.268534,2020-04-26 21:53:59.624136
4,4,user_1,laba06s,2020-05-20 12:23:37.289724,2020-04-26 21:53:59.624136
5,5,user_1,project1,2020-05-14 20:56:08.898880,2020-04-26 21:53:59.624136
6,6,user_10,laba04,2020-04-25 08:24:52.696624,2020-04-18 12:19:50.182714
7,7,user_10,laba04s,2020-04-25 08:37:54.604222,2020-04-18 12:19:50.182714
8,8,user_10,laba05,2020-05-01 19:27:26.063245,2020-04-18 12:19:50.182714
9,9,user_10,laba06,2020-05-19 11:39:28.885637,2020-04-18 12:19:50.182714


## find the minimum value of the difference between the first commit of the user and the deadline of the corresponding lab using only one query
- do it by joining the table with the table deadlines
- the difference should be displayed in hours
- do not take into account the lab 'project1', it has longer deadlines and will be an outlier
- the value should be stored in the dataframe df_min with the corresponding uid


In [284]:
query = '''
select
    uid,
    min(cast((JulianDay(test.checker_ts) - (JulianDay(deadlines.deadlines, 'unixepoch')))* 24 as Integer))  as diff
from test
left join deadlines
    on test.labname = deadlines.labs
where
    labname != 'project1'
'''

In [285]:
df_min = pd.io.sql.read_sql(query, db_connection)

In [286]:
df_min

Unnamed: 0,uid,diff
0,user_30,-202


## do the same thing but with the maximum using only one query, the dataframe name is df_max

In [288]:
query = '''
select
    uid,
    max(cast((JulianDay(test.checker_ts) - (JulianDay(deadlines.deadlines, 'unixepoch')))* 24 as Integer))  as diff
from test
left join deadlines
    on test.labname = deadlines.labs
where
    labname != 'project1'
'''

In [289]:
df_max = pd.io.sql.read_sql(query, db_connection)

In [290]:
df_max

Unnamed: 0,uid,diff
0,user_25,-2


## do the same thing but with the average using only one query, this time your dataframe should not include the uid column, the dataframe name is df_avg

In [303]:
query = '''
select
    avg(cast((JulianDay(test.checker_ts) - (JulianDay(deadlines.deadlines, 'unixepoch')))* 24 as Integer))  as diff
from test
left join deadlines
    on test.labname = deadlines.labs
where
    labname != 'project1'
'''

In [304]:
df_avg = pd.io.sql.read_sql(query, db_connection)

In [305]:
df_avg

Unnamed: 0,diff
0,-89.125


## we want to test the hypothesis that the users who visited Newsfeed just a few times have the lower difference between the first commit and the deadline, to do this you need to calculate the correlation coefficient between the number of the pageviews and the difference
- using only one query create a table with the columns: uid, avg_diff, pageviews
- uid is the uids that exist in the test
- avg_diff is the average difference between the first commit and the lab deadline per user
- pageviews is the number of Newsfeed visits per user
- do not take into account the lab 'project1'
- store it to the dataframe views_diff
- use the Pandas method corr() to calculate the correlation coefficient between the number of the pageviews and the difference


In [307]:
pd.io.sql.read_sql('select * from pageviews where uid like "user%"', db_connection)

Unnamed: 0,index,uid,datetime
0,12,user_30,2020-04-17 22:46:26.785035
1,14,user_14,2020-04-18 10:53:52.623447
2,15,user_17,2020-04-18 10:56:55.833899
3,16,user_14,2020-04-18 10:57:37.331258
4,18,user_17,2020-04-18 12:05:48.200144
...,...,...,...
982,1073,user_28,2020-05-21 18:45:20.441142
983,1074,user_19,2020-05-21 23:03:06.457819
984,1075,user_25,2020-05-21 23:23:49.995349
985,1078,user_5,2020-05-22 11:30:18.368990


In [351]:
query = '''

with t2 as (select
    test.uid,
    avg(cast((JulianDay(test.checker_ts) - (JulianDay(deadlines.deadlines, 'unixepoch')))* 24 as Integer))  as avg_dif
from test
inner join deadlines
    on deadlines.labs = test.labname
group by test.uid)
select
    t2.uid as uid,
    t2.avg_dif,
    count(pageviews.datetime) as pageviews
from t2
inner join pageviews
    on t2.uid = pageviews.uid
group by t2.uid
'''

In [352]:
views_df = pd.io.sql.read_sql(query, db_connection)

In [353]:
views_df.corr()

Unnamed: 0,avg_dif,pageviews
avg_dif,1.0,-0.069302
pageviews,-0.069302,1.0


## close the connection

In [250]:
db_connection.close()