In [661]:
import pandas as pd
import sqlite3

## Создаем подключение к базе данных с помощью библиотеки sqlite3

In [662]:
conn = sqlite3.connect('../data/checking-logs.sqlite', check_same_thread=False)

## Создать новую таблицу хранилища данных в базе данных, объединив таблицы просмотров страниц и проверки, используя только один запрос

In [663]:
query = """
SELECT
    CASE 
        WHEN checker.uid LIKE 'user_%' THEN checker.uid
        ELSE NULL
    END AS uid,
    CASE 
        WHEN checker.uid LIKE 'user_%' THEN checker.labname
        ELSE NULL
    END AS labname,
    
    MIN(CASE WHEN checker.uid LIKE 'user_%' THEN checker.timestamp END) AS first_commit_ts,
    MIN(CASE WHEN checker.uid LIKE 'user_%' THEN p.datetime END) AS first_view_ts
FROM checker
LEFT JOIN pageviews p ON checker.uid = p.uid
WHERE
    checker.status = 'ready'
    AND checker.numTrials = 1
    AND checker.labname IN ('laba04', 'laba04s', 'laba05', 'laba06', 'laba06s', 'project1')
   
GROUP BY checker.uid, checker.labname;
"""
datamark = pd.read_sql(query, conn, parse_dates=['first_commit_ts', 'first_view_ts'])

datamark.info()
datamark.head(15)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   uid              140 non-null    object        
 1   labname          140 non-null    object        
 2   first_commit_ts  140 non-null    datetime64[ns]
 3   first_view_ts    59 non-null     datetime64[ns]
dtypes: datetime64[ns](2), object(2)
memory usage: 4.7+ KB


Unnamed: 0,uid,labname,first_commit_ts,first_view_ts
0,,,NaT,NaT
1,,,NaT,NaT
2,,,NaT,NaT
3,,,NaT,NaT
4,,,NaT,NaT
5,,,NaT,NaT
6,user_1,laba04,2020-04-26 17:06:18.462708,2020-04-26 21:53:59.624136
7,user_1,laba04s,2020-04-26 17:12:11.843671,2020-04-26 21:53:59.624136
8,user_1,laba05,2020-05-02 19:15:18.540185,2020-04-26 21:53:59.624136
9,user_1,laba06,2020-05-17 16:26:35.268534,2020-04-26 21:53:59.624136


## Используя методы Pandas, создайте два фрейма данных: тестовый и контрольный.

In [664]:
test = datamark[datamark['first_view_ts'].notna()].copy()
control = datamark[datamark['first_view_ts'].isna()].copy()
avg_ts = test['first_view_ts'].mean()
control['first_view_ts'] = avg_ts
control = control.dropna(subset=['uid', 'labname', 'first_commit_ts'])

In [665]:
test.info()
test.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 59 entries, 6 to 120
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   uid              59 non-null     object        
 1   labname          59 non-null     object        
 2   first_commit_ts  59 non-null     datetime64[ns]
 3   first_view_ts    59 non-null     datetime64[ns]
dtypes: datetime64[ns](2), object(2)
memory usage: 2.3+ KB


Unnamed: 0,uid,labname,first_commit_ts,first_view_ts
6,user_1,laba04,2020-04-26 17:06:18.462708,2020-04-26 21:53:59.624136
7,user_1,laba04s,2020-04-26 17:12:11.843671,2020-04-26 21:53:59.624136
8,user_1,laba05,2020-05-02 19:15:18.540185,2020-04-26 21:53:59.624136
9,user_1,laba06,2020-05-17 16:26:35.268534,2020-04-26 21:53:59.624136
10,user_1,laba06s,2020-05-20 12:23:37.289724,2020-04-26 21:53:59.624136
11,user_1,project1,2020-05-14 20:56:08.898880,2020-04-26 21:53:59.624136
12,user_10,laba04,2020-04-25 08:24:52.696624,2020-04-18 12:19:50.182714
13,user_10,laba04s,2020-04-25 08:37:54.604222,2020-04-18 12:19:50.182714
14,user_10,laba05,2020-05-01 19:27:26.063245,2020-04-18 12:19:50.182714
15,user_10,laba06,2020-05-19 11:39:28.885637,2020-04-18 12:19:50.182714


In [666]:
control.info()
control.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 81 entries, 18 to 145
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   uid              81 non-null     object        
 1   labname          81 non-null     object        
 2   first_commit_ts  81 non-null     datetime64[ns]
 3   first_view_ts    81 non-null     datetime64[ns]
dtypes: datetime64[ns](2), object(2)
memory usage: 3.2+ KB


Unnamed: 0,uid,labname,first_commit_ts,first_view_ts
18,user_11,laba05,2020-05-03 21:06:55.970293,2020-04-27 00:40:05.761783552
19,user_11,project1,2020-05-03 23:45:33.673409,2020-04-27 00:40:05.761783552
20,user_12,laba04,2020-04-18 17:07:51.767358,2020-04-27 00:40:05.761783552
21,user_12,laba04s,2020-04-26 15:42:38.070593,2020-04-27 00:40:05.761783552
22,user_12,laba05,2020-05-03 08:39:25.174316,2020-04-27 00:40:05.761783552
23,user_12,laba06,2020-05-19 08:41:51.082304,2020-04-27 00:40:05.761783552
24,user_12,project1,2020-05-14 18:17:40.702024,2020-04-27 00:40:05.761783552
25,user_13,laba04,2020-04-25 17:56:16.919886,2020-04-27 00:40:05.761783552
26,user_13,laba04s,2020-04-26 22:21:04.093297,2020-04-27 00:40:05.761783552
27,user_13,laba05,2020-05-02 19:31:35.398386,2020-04-27 00:40:05.761783552


In [667]:
#сохранение
test.to_sql('test', conn, if_exists='replace', index=False)  # Замена таблицы, если она существует
control.to_sql('control', conn, if_exists='replace', index=False)

81

## Закрыть соединение

In [668]:
conn.close()