In [101]:
import pandas as pd
import sqlite3

#### Подключение к базе данных

In [102]:
# Open the SQLite database that every later cell reads from and writes to.
# NOTE: sqlite3.connect() creates a new, empty database file when the target
# file is missing (only a missing/unwritable directory raises), so a wrong
# file name will surface later as "no such table", not here.
try:
    conn = sqlite3.connect('../data/checking-logs.sqlite')
    print("Успешное подключение к базе данных.")
except sqlite3.Error as e:
    print(f"Ошибка при подключении к базе данных: {e}")
    # Re-raise instead of calling exit(): exit() is an interactive-shell helper
    # that, inside a notebook, asks the kernel to shut down and hides the
    # traceback. Raising stops "Run All" at this cell with the real error.
    raise

Успешное подключение к базе данных.


#### Создание новой таблицы datamart

In [103]:
# Build the `datamart` table: one row per (uid, labname) for first-attempt,
# 'ready' checker records of the listed labs, together with the user's earliest
# page view. The LEFT JOIN keeps users without any page views; their
# first_view_ts is NULL (NaT after loading into pandas).
# checker.timestamp is a bare (non-aggregated) column in a grouped query, so
# this assumes at most one checker row per (uid, labname) passes the filters
# -- TODO confirm against the data.
# NOTE(review): '_' in LIKE 'user_%' is itself a single-character wildcard, so
# the pattern matches any uid starting with "user" plus at least one character.
query = """
    CREATE TABLE datamart AS
    SELECT
        checker.uid AS uid,
        checker.labname AS labname,
        checker.timestamp AS first_commit_ts,
        MIN(pageviews.datetime) AS first_view_ts
    FROM checker 
    LEFT JOIN pageviews ON checker.uid = pageviews.uid
    WHERE checker.status = 'ready'
        AND checker.numTrials = 1
        AND checker.labname IN ('laba04', 'laba04s', 'laba05', 'laba06', 'laba06s', 'project1')
        AND checker.uid LIKE 'user_%'
    GROUP BY
        checker.uid, checker.labname
"""
cursor = conn.cursor()
# Drop any table left over from a previous run before rebuilding it. With
# CREATE TABLE IF NOT EXISTS a stale datamart would be silently reused while
# the message below still claimed it had just been created and filled.
cursor.execute("DROP TABLE IF EXISTS datamart")
cursor.execute(query)
conn.commit()
print("Таблица 'datamart' создана и заполнена.")


# Load the freshly built table into pandas, parsing both timestamp columns
# as datetimes so that missing views become NaT.
query_select = "SELECT * FROM datamart"
datamart = pd.read_sql(query_select, conn, parse_dates=['first_commit_ts', 'first_view_ts'])
datamart


Таблица 'datamart' создана и заполнена.


Unnamed: 0,uid,labname,first_commit_ts,first_view_ts
0,user_1,laba04,2020-04-26 17:06:18.462708,2020-04-26 21:53:59.624136
1,user_1,laba04s,2020-04-26 17:12:11.843671,2020-04-26 21:53:59.624136
2,user_1,laba05,2020-05-02 19:15:18.540185,2020-04-26 21:53:59.624136
3,user_1,laba06,2020-05-17 16:26:35.268534,2020-04-26 21:53:59.624136
4,user_1,laba06s,2020-05-20 12:23:37.289724,2020-04-26 21:53:59.624136
...,...,...,...,...
135,user_8,laba04s,2020-04-19 10:22:35.761944,NaT
136,user_8,laba05,2020-05-02 13:28:07.705193,NaT
137,user_8,laba06,2020-05-16 17:56:15.755553,NaT
138,user_8,laba06s,2020-05-16 20:01:07.900727,NaT


#### Создание двух фреймов данных: тестового и контрольного

In [104]:
# Test group: datamart rows that have a first_view_ts value.
has_first_view = datamart['first_view_ts'].notna()
test = datamart.loc[has_first_view].copy()
print(test)

         uid   labname            first_commit_ts              first_view_ts
0     user_1    laba04 2020-04-26 17:06:18.462708 2020-04-26 21:53:59.624136
1     user_1   laba04s 2020-04-26 17:12:11.843671 2020-04-26 21:53:59.624136
2     user_1    laba05 2020-05-02 19:15:18.540185 2020-04-26 21:53:59.624136
3     user_1    laba06 2020-05-17 16:26:35.268534 2020-04-26 21:53:59.624136
4     user_1   laba06s 2020-05-20 12:23:37.289724 2020-04-26 21:53:59.624136
5     user_1  project1 2020-05-14 20:56:08.898880 2020-04-26 21:53:59.624136
6    user_10    laba04 2020-04-25 08:24:52.696624 2020-04-18 12:19:50.182714
7    user_10   laba04s 2020-04-25 08:37:54.604222 2020-04-18 12:19:50.182714
8    user_10    laba05 2020-05-01 19:27:26.063245 2020-04-18 12:19:50.182714
9    user_10    laba06 2020-05-19 11:39:28.885637 2020-04-18 12:19:50.182714
10   user_10   laba06s 2020-05-20 07:37:31.175817 2020-04-18 12:19:50.182714
11   user_10  project1 2020-05-12 20:12:28.056618 2020-04-18 12:19:50.182714

In [105]:
# Control group: datamart rows whose first_view_ts is missing (NaT).
lacks_first_view = datamart['first_view_ts'].isna()
control = datamart.loc[lacks_first_view].copy()
print(control)

         uid   labname            first_commit_ts first_view_ts
12   user_11    laba05 2020-05-03 21:06:55.970293           NaT
13   user_11  project1 2020-05-03 23:45:33.673409           NaT
14   user_12    laba04 2020-04-18 17:07:51.767358           NaT
15   user_12   laba04s 2020-04-26 15:42:38.070593           NaT
16   user_12    laba05 2020-05-03 08:39:25.174316           NaT
..       ...       ...                        ...           ...
135   user_8   laba04s 2020-04-19 10:22:35.761944           NaT
136   user_8    laba05 2020-05-02 13:28:07.705193           NaT
137   user_8    laba06 2020-05-16 17:56:15.755553           NaT
138   user_8   laba06s 2020-05-16 20:01:07.900727           NaT
139   user_8  project1 2020-05-14 15:42:04.002981           NaT

[81 rows x 4 columns]


In [106]:
# Impute the control group's missing first_view_ts with the mean first_view_ts
# taken over the rows of the test group.
first_view_col = 'first_view_ts'
mean_value = test[first_view_col].mean()
filled_first_view = control[first_view_col].fillna(mean_value)
control[first_view_col] = filled_first_view
print(control)

         uid   labname            first_commit_ts  \
12   user_11    laba05 2020-05-03 21:06:55.970293   
13   user_11  project1 2020-05-03 23:45:33.673409   
14   user_12    laba04 2020-04-18 17:07:51.767358   
15   user_12   laba04s 2020-04-26 15:42:38.070593   
16   user_12    laba05 2020-05-03 08:39:25.174316   
..       ...       ...                        ...   
135   user_8   laba04s 2020-04-19 10:22:35.761944   
136   user_8    laba05 2020-05-02 13:28:07.705193   
137   user_8    laba06 2020-05-16 17:56:15.755553   
138   user_8   laba06s 2020-05-16 20:01:07.900727   
139   user_8  project1 2020-05-14 15:42:04.002981   

                    first_view_ts  
12  2020-04-27 00:40:05.761783552  
13  2020-04-27 00:40:05.761783552  
14  2020-04-27 00:40:05.761783552  
15  2020-04-27 00:40:05.761783552  
16  2020-04-27 00:40:05.761783552  
..                            ...  
135 2020-04-27 00:40:05.761783552  
136 2020-04-27 00:40:05.761783552  
137 2020-04-27 00:40:05.761783552  
138

#### Сохранение таблиц в базе данных

In [107]:
# Persist both groups as the `test` and `control` tables, replacing any
# existing tables with those names; the DataFrame index is not stored.
try:
    test.to_sql('test', conn, if_exists='replace', index=False)
    control.to_sql('control', conn, if_exists='replace', index=False)

    print("DataFrames 'test' и 'control' сохранены в базу данных.")
except Exception as e:
    print(f"Ошибка при сохранении dataframes в базу данных: {e}")
    # Re-raise after reporting: swallowing the error would let the notebook
    # continue (and close the connection) as if both tables had been written,
    # even when only one of them -- or neither -- was saved.
    raise

DataFrames 'test' и 'control' сохранены в базу данных.


#### Закрытие соединения

In [108]:
# Release the SQLite connection opened in the connection cell of this notebook.
conn.close()