In [1]:
import pandas as pd
import sqlite3

In [2]:
# Location of the SQLite database file, relative to this notebook.
DB_PATH = '../../datasets/checking-logs.sqlite'
# Single shared connection used by every cell below.
db = sqlite3.connect(DB_PATH)

## Datamart table

In [3]:
# Build the `datamart` table inside the SQLite file.
# Each row is a (uid, labname, commit timestamp) taken from `checker` rows with
# status 'ready' and numTrials = 1 for the listed labs, joined (LEFT JOIN, so
# users without any pageviews are kept with NULL) to the earliest
# `pageviews.datetime` of the same uid.
# NOTE: in SQL LIKE, `_` matches any single character, so 'user_%' also matches
# uids such as 'userX1'; the pattern is kept as-is to preserve the results.

# The table persists in the database file, so drop any previous copy first;
# otherwise re-running this cell (or Restart & Run All) fails with
# "table datamart already exists".
db.execute("DROP TABLE IF EXISTS datamart")
db.execute(""" CREATE TABLE datamart AS
SELECT c.uid, c.labname, c.timestamp AS first_commit_ts, MIN(p.datetime) as first_view_ts FROM checker c
LEFT JOIN pageviews p ON c.uid = p.uid
WHERE c.status = 'ready' AND c.numTrials = 1 AND c.labname IN ('laba04', 'laba04s', 'laba05', 'laba06', 'laba06s', 'project1') AND c.uid LIKE 'user_%'
GROUP BY c.uid, c.labname, c.timestamp """)
db.commit()

In [4]:
# Columns that hold timestamps and must end up as datetime64 in the frame.
TS_COLUMNS = ['first_commit_ts', 'first_view_ts']

# Pull the freshly built table back into pandas, asking read_sql to parse the
# timestamp columns on the way in.
datamart = pd.read_sql('SELECT * FROM datamart', db, parse_dates=TS_COLUMNS)

# Explicit conversion pass over the same columns (a no-op when parsing above
# already produced datetimes).
for ts_col in TS_COLUMNS:
    datamart[ts_col] = pd.to_datetime(datamart[ts_col])

# Summary of dtypes and non-null counts.
datamart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   uid              140 non-null    object        
 1   labname          140 non-null    object        
 2   first_commit_ts  140 non-null    datetime64[ns]
 3   first_view_ts    59 non-null     datetime64[ns]
dtypes: datetime64[ns](2), object(2)
memory usage: 4.5+ KB


## Test and control

In [5]:
# Split the datamart by whether a first page view exists:
#   test    - rows with a first_view_ts
#   control - rows without one (first_view_ts is NaT)
# `.copy()` makes each group an independent frame rather than a slice of
# `datamart`, so modifying a group later neither raises
# SettingWithCopyWarning nor depends on view/copy semantics.
test = datamart[datamart['first_view_ts'].notna()].copy()
control = datamart[datamart['first_view_ts'].isna()].copy()

In [6]:
# Mean first-view timestamp across the test group; used as the stand-in value
# for control rows, which have no page view of their own.
avg_view = test['first_view_ts'].mean()

# Build a new frame with the gaps filled instead of assigning through `.loc`
# on a boolean-mask slice of `datamart` (that pattern triggers
# SettingWithCopyWarning). Re-running the cell is harmless: an already filled
# column has nothing left to fill.
control = control.assign(first_view_ts=control['first_view_ts'].fillna(avg_view))

# Persist both groups as tables in the same database, replacing earlier copies.
test.to_sql('test', db, if_exists = 'replace', index = False)
control.to_sql('control', db, if_exists = 'replace', index = False)

81

In [7]:
# Inspect the test group: dtypes and non-null counts per column.
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59 entries, 0 to 114
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   uid              59 non-null     object        
 1   labname          59 non-null     object        
 2   first_commit_ts  59 non-null     datetime64[ns]
 3   first_view_ts    59 non-null     datetime64[ns]
dtypes: datetime64[ns](2), object(2)
memory usage: 2.3+ KB


In [8]:
# Inspect the control group: first_view_ts should have no nulls left after the fill.
control.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81 entries, 12 to 139
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   uid              81 non-null     object        
 1   labname          81 non-null     object        
 2   first_commit_ts  81 non-null     datetime64[ns]
 3   first_view_ts    81 non-null     datetime64[ns]
dtypes: datetime64[ns](2), object(2)
memory usage: 3.2+ KB


In [9]:
# Release the SQLite connection now that all reads and writes are done.
db.close()