<a href="https://colab.research.google.com/github/Nemczek/checkio_database/blob/main/pyCheckio_class_database.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project pyCheckio

This project downloads data about our class from [CheckiO](https://checkio.org) and stores it in an SQLite database.

## Setting up

In [29]:
# importing libraries
import os
from collections import Counter

import pandas as pd
import requests

In [30]:
# static variables
# Base endpoints of the CheckiO group API used by the cells below.
BASE_URL = 'https://py.checkio.org/api/group-details/'
GROUP_PROGRESS_API_BASE = 'https://py.checkio.org/api/group-progress/'
GROUP_ACTIVITY_API_BASE = 'https://py.checkio.org/api/group-activity/'

# Personal group token. Set the CHECKIO_GROUP_TOKEN environment variable to use
# your own token without editing the notebook; the literal below is only the
# fallback used when that variable is not set.
# NOTE(review): the fallback looks like a live credential stored in the
# notebook -- consider rotating it and removing the default.
TOKEN = '?token=' + os.environ.get('CHECKIO_GROUP_TOKEN',
                                   '277869ee829d44609019622889900651')

# The token is appended as the first query-string parameter of every endpoint.
URL_WITH_TOKEN = BASE_URL + TOKEN
PROGRESS_API_WITH_TOKEN = GROUP_PROGRESS_API_BASE + TOKEN
ACTIVITY_API_WITH_TOKEN = GROUP_ACTIVITY_API_BASE + TOKEN

In [31]:
# Show every class returned by the group-details endpoint
groups_response = requests.get(URL_WITH_TOKEN)
groups_response.json()['objects']

[{'name': '2021-2022 Jezyki programowania 1',
  'slug': 'jezyki-programowania-1',
  'default_language': 'en',
  'description_text': '',
  'created_at': '2021-10-06',
  'membership_admission': 'Anyone but the approval is required',
  'is_auto_following_enabled': True,
  'is_email_review_activated': True,
  'is_group_default_filter': False,
  'is_full_access': False,
  'is_leaderboard_enabled': True,
  'is_monthly_leaderboard_default': False,
  'is_revisions_enabled': True,
  'is_member_profiles_visible_for_site': False,
  'is_member_solutions_visible_for_site': False,
  'is_site_solutions_visible_for_members': True,
  'members_count': 25,
  'last_activity': '2023-03-24 12:41',
  'course': 'CheckiO [Easy] (no points for votes)',
  'is_current_group': False},
 {'name': 'stara Języki programowania 1 grupa 2',
  'slug': 'jezyki-programowania-1-grupa-2',
  'default_language': 'en',
  'description_text': '',
  'created_at': '2021-02-26',
  'membership_admission': 'Only by invitation via email

In [32]:
def get_slug(url, index=4):
  """
  Returns the slug parameter of one of our classes

  Parameters:
  url (str): URL to our class details
  index (int): position of the class inside the 'objects' list of the
    response; defaults to 4, the index of our current class

  Returns:
  slug (str): the slug parameter

  Raises:
  requests.HTTPError: if the API answers with an error status code
  IndexError: if the response holds no class at the given position
  """
  response = requests.get(url)
  # Fail on 4xx/5xx right away instead of a confusing JSON/KeyError further down
  response.raise_for_status()
  classes = response.json()['objects']

  if not -len(classes) <= index < len(classes):
    raise IndexError(
        f"class index {index} is out of range: the API returned {len(classes)} classes")
  return classes[index]['slug']
# Echo the slug so the cell output shows which class was picked
current_class_slug = get_slug(URL_WITH_TOKEN)
print(current_class_slug)

michal-wojcik-2022-2023


## Activity API

In [33]:
# The activity API gives us access to the latest activity of the users
class_slug = get_slug(URL_WITH_TOKEN)
activity_url_with_slug = "{}&slug={}".format(ACTIVITY_API_WITH_TOKEN, class_slug)

activity_response = requests.get(activity_url_with_slug)
resp = activity_response.json()['objects']
resp[2]  # preview a single activity record

{'username': '126086',
 'createdAt': '2023-01-09',
 'data': {'task': {'imageUrl': 'https://d17mnqrx9pmt3e.cloudfront.net/media/logos/task/normal/password-enabled.png',
   'shortText': 'Verify password by condition\n',
   'subject': 'Acceptable Password I',
   'type': 'task',
   'url': '/mission/acceptable-password-i/'},
  'type': 'implementation',
  'user': {'avatarUrl': 'https://www.gravatar.com/avatar/c13a3f1c3609338a3cc532e0eacc70eb?s=80',
   'level': 4,
   'username': '126086',
   'group': {'name': 'Michal Wojcik 2022-2023',
    'url': '/class/michal-wojcik-2022-2023/',
    'owner': 'MichalRyszardWojcik',
    'slug': 'michal-wojcik-2022-2023'},
   'type': 'user',
   'url': '/user/126086/'},
  'url': '/class/michal-wojcik-2022-2023/solution-history/3510415/'}}

## Progress API

This is the main target of this project, since all the interesting data is here.

In [34]:
# Download the progress records of our class
progress_url_with_slug = "{}&slug={}".format(PROGRESS_API_WITH_TOKEN, class_slug)
progress_response = requests.get(progress_url_with_slug)
progress_data = progress_response.json()['objects']

In [35]:
progress_data[0] # Peek at the first record to see how the data is structured

{'title': 'Multiply (Intro)',
 'slug': 'multiply-intro',
 'data': [{'username': 'karol2202',
   'openedAt': '2022-11-19 14:11',
   'startedAt': '2022-11-19 14:12',
   'solvedAt': '2022-11-19 14:13',
   'status': 'published',
   'solutions': []},
  {'username': 'Antoni_Wojcik',
   'openedAt': '2022-10-06 21:53',
   'startedAt': '2022-10-06 21:54',
   'solvedAt': '2022-10-23 19:30',
   'status': 'published',
   'solutions': [{'name': 'Multiply',
     'url': 'https://py.checkio.org/mission/multiply-intro/publications/Antoni_Wojcik/python-3/first/',
     'createdAt': '2022-11-21 14:02',
     'votes': 9,
     'comments': 0,
     'isRead': False}]},
  {'username': '117374',
   'openedAt': '2022-10-06 21:33',
   'startedAt': '2022-10-09 12:58',
   'solvedAt': '2022-10-09 12:59',
   'status': 'published',
   'solutions': [{'name': 'First attempt',
     'url': 'https://py.checkio.org/mission/multiply-intro/publications/117374/python-3/first-attempt/',
     'createdAt': '2022-11-21 15:31',
     

## Quest dataset

In [36]:
# Collect the status of every (task, user) record, then reduce to the unique values
statuses = [
  member['status']
  for task in progress_data
  for member in task['data']
]
set(statuses)

{'new', 'opened', 'published', 'tried'}

In [37]:
# Summarise every task as one row: title, total votes and comments over all
# solutions, and the number of students in each status
list_of_tasks = []
for task in progress_data:
  entries = task['data']
  status_counts = Counter(entry['status'] for entry in entries)
  solutions = [solution for entry in entries for solution in entry['solutions']]

  list_of_tasks.append([
      task['title'],
      sum(solution['votes'] for solution in solutions),
      sum(solution['comments'] for solution in solutions),
      status_counts['opened'],
      status_counts['published'],
      status_counts['tried'],
      status_counts['new'],
  ])
list_of_tasks

[['Multiply (Intro)', 63, 14, 0, 42, 2, 5],
 ['Acceptable Password I', 46, 37, 0, 43, 0, 6],
 ['Is Even', 79, 16, 0, 43, 0, 6],
 ['First Word (simplified)', 63, 14, 0, 41, 0, 8],
 ['Number Length', 60, 15, 0, 41, 0, 8],
 ['Backward String', 84, 9, 0, 40, 0, 9],
 ['First Word', 30, 5, 0, 40, 0, 9],
 ['Three Words', 55, 5, 0, 40, 0, 9],
 ['Beginning Zeros', 69, 4, 0, 39, 0, 10],
 ['Between Markers (simplified)', 59, 3, 1, 37, 2, 9],
 ['Max Digit', 59, 3, 0, 39, 0, 10],
 ['Correct Sentence', 21, 0, 0, 38, 0, 11],
 ['Easy Unpack', 32, 5, 0, 38, 0, 11],
 ['End Zeros', 46, 8, 0, 38, 0, 11],
 ['Sum Numbers', 49, 5, 3, 38, 0, 8],
 ['Acceptable Password II', 38, 6, 0, 37, 0, 12],
 ['Acceptable Password III', 36, 7, 0, 37, 0, 12],
 ['Acceptable Password IV', 33, 2, 0, 37, 0, 12],
 ['Acceptable Password V', 28, 1, 0, 37, 0, 12],
 ['All the Same', 41, 7, 0, 37, 0, 12],
 ['All Upper I', 57, 6, 0, 37, 0, 12],
 ['Duplicate Zeros', 46, 6, 2, 35, 2, 10],
 ['Even the Last', 48, 2, 0, 37, 0, 12],
 ['Righ

In [38]:
# Turn the per-task rows into a pandas DataFrame with named columns
task_columns = ['Task', 'Votes', 'Comments', 'Opened', 'Published', 'Tried', 'New']
task_data = pd.DataFrame(list_of_tasks, columns=task_columns)
task_data

Unnamed: 0,Task,Votes,Comments,Opened,Published,Tried,New
0,Multiply (Intro),63,14,0,42,2,5
1,Acceptable Password I,46,37,0,43,0,6
2,Is Even,79,16,0,43,0,6
3,First Word (simplified),63,14,0,41,0,8
4,Number Length,60,15,0,41,0,8
...,...,...,...,...,...,...,...
243,Weak Point,0,0,1,1,0,47
244,Working Hours Calculator,0,0,3,1,0,45
245,Work Schedule Generator,0,0,2,0,1,46
246,Xs and Os Champion,0,0,6,0,1,42


## Every user attempt dataset

In [39]:
# Build one row per (task, student) record describing that student's attempt
list_of_entries = []

for task in progress_data:
  task_name = task['title']

  for entry in task['data']:
    row_start = [entry['username'], entry['status'], task_name]
    solutions = entry['solutions']

    if len(solutions) > 0:
      # Only the first listed solution is recorded
      first_solution = solutions[0]
      link = first_solution['url']
      details = [first_solution[key] for key in ('createdAt', 'votes', 'comments')]
    else:
      # No solution: every solution field holds the literal string "None"
      link = "None"
      details = ["None", "None", "None"]

    list_of_entries.append([*row_start, *details, link])


In [40]:
# Turn the per-attempt rows into a pandas DataFrame with named columns
entry_columns = ['username', 'status', 'task_name', 'createdAt',
                 'votes', 'comments', 'url']
entry_df = pd.DataFrame(list_of_entries, columns=entry_columns)
entry_df

Unnamed: 0,username,status,task_name,createdAt,votes,comments,url
0,karol2202,published,Multiply (Intro),,,,
1,Antoni_Wojcik,published,Multiply (Intro),2022-11-21 14:02,9,0,https://py.checkio.org/mission/multiply-intro/...
2,117374,published,Multiply (Intro),2022-11-21 15:31,0,0,https://py.checkio.org/mission/multiply-intro/...
3,126212,published,Multiply (Intro),,,,
4,Karolina_Zadura,published,Multiply (Intro),2022-11-21 15:39,0,0,https://py.checkio.org/mission/multiply-intro/...
...,...,...,...,...,...,...,...
12147,115128,new,YAML. More Types,,,,
12148,126089,new,YAML. More Types,,,,
12149,AlicjaKraska,new,YAML. More Types,,,,
12150,Mariia_Salganik,new,YAML. More Types,,,,


## Exporting data to SQL

In [41]:
%%capture
db_name = "checkio_class.db"

%load_ext sql
%sql sqlite:///{db_name}

import sqlalchemy as db
engine = db.create_engine(f'sqlite:///{db_name}')

entry_df.to_sql('entry_df', engine, index=False)
task_data.to_sql('task_data', engine, index=False)
# This code whill throw an error if database alredy exists.

In [42]:
# Read the exported table back through a plain sqlite3 connection to check the export

import sqlite3 as sq
connection = sq.connect(db_name)
cursor = connection.cursor()

query = "SELECT * from task_data;"
result = cursor.execute(query)
rows = result.fetchall()

# The first item of every description tuple is the column name
column_names = [column[0] for column in result.description]
pd.DataFrame(rows, columns=column_names)

Unnamed: 0,Task,Votes,Comments,Opened,Published,Tried,New
0,Multiply (Intro),63,14,0,42,2,5
1,Acceptable Password I,46,37,0,43,0,6
2,Is Even,79,16,0,43,0,6
3,First Word (simplified),63,14,0,41,0,8
4,Number Length,60,15,0,41,0,8
...,...,...,...,...,...,...,...
243,Weak Point,0,0,1,1,0,47
244,Working Hours Calculator,0,0,3,1,0,45
245,Work Schedule Generator,0,0,2,0,1,46
246,Xs and Os Champion,0,0,6,0,1,42


In [43]:
# Same read-back check for the per-attempt table
query2 = 'SELECT * FROM entry_df'
result2 = cursor.execute(query2)
rows2 = result2.fetchall()

column_names2 = [column[0] for column in result2.description]
pd.DataFrame(rows2, columns=column_names2)

Unnamed: 0,username,status,task_name,createdAt,votes,comments,url
0,karol2202,published,Multiply (Intro),,,,
1,Antoni_Wojcik,published,Multiply (Intro),2022-11-21 14:02,9,0,https://py.checkio.org/mission/multiply-intro/...
2,117374,published,Multiply (Intro),2022-11-21 15:31,0,0,https://py.checkio.org/mission/multiply-intro/...
3,126212,published,Multiply (Intro),,,,
4,Karolina_Zadura,published,Multiply (Intro),2022-11-21 15:39,0,0,https://py.checkio.org/mission/multiply-intro/...
...,...,...,...,...,...,...,...
12147,115128,new,YAML. More Types,,,,
12148,126089,new,YAML. More Types,,,,
12149,AlicjaKraska,new,YAML. More Types,,,,
12150,Mariia_Salganik,new,YAML. More Types,,,,


In [44]:
# Rows of one user, skipping those whose votes field holds the 'None' placeholder
query3 = "SELECT * FROM entry_df WHERE NOT votes = 'None' AND username = 'Antoni_Wojcik'"
result3 = cursor.execute(query3)
rows3 = result3.fetchall()

column_names3 = [column[0] for column in result3.description]
pd.DataFrame(rows3, columns=column_names3)

Unnamed: 0,username,status,task_name,createdAt,votes,comments,url
0,Antoni_Wojcik,published,Multiply (Intro),2022-11-21 14:02,9,0,https://py.checkio.org/mission/multiply-intro/...
1,Antoni_Wojcik,published,Acceptable Password I,2022-11-22 15:35,4,1,https://py.checkio.org/mission/acceptable-pass...
2,Antoni_Wojcik,published,Is Even,2022-11-21 14:05,15,1,https://py.checkio.org/mission/is-even/publica...
3,Antoni_Wojcik,published,First Word (simplified),2022-11-21 14:05,10,4,https://py.checkio.org/mission/first-word-simp...
4,Antoni_Wojcik,published,Number Length,2022-11-22 15:35,10,1,https://py.checkio.org/mission/number-length/p...
...,...,...,...,...,...,...,...
92,Antoni_Wojcik,published,House Password,2023-03-23 13:19,1,2,https://py.checkio.org/mission/house-password/...
93,Antoni_Wojcik,published,Double Substring,2023-03-23 01:51,0,0,https://py.checkio.org/mission/double-substrin...
94,Antoni_Wojcik,published,Morse Encoder,2023-03-23 01:43,0,0,https://py.checkio.org/mission/morse-encoder/p...
95,Antoni_Wojcik,published,The End of Other,2023-01-14 14:39,0,0,https://py.checkio.org/mission/end-of-other/pu...


****

# Upgrading database (proper design etc.)

In [45]:
# Start from a clean slate so this cell can be re-run: without the DROP a second
# run fails because the users table already exists.
cursor.execute("DROP TABLE IF EXISTS users")

# One row per student; the generated id becomes the key referenced from entry_df
query_create_username_table = """
CREATE TABLE users(
  	id INTEGER PRIMARY KEY AUTOINCREMENT,
  	name TEXT NOT NULL
)
"""
cursor.execute(query_create_username_table)

<sqlite3.Cursor at 0x7c49a5140ec0>

In [46]:
# fill users with data from entry_df database
query_fill_users = """
INSERT INTO users (name)
SELECT DISTINCT username
FROM entry_df
"""
cursor.execute(query_fill_users)

<sqlite3.Cursor at 0x7c49a5140ec0>

In [47]:
# Create username id in entry_df
query_add_id_column = """
ALTER TABLE entry_df
ADD COLUMN user_id INTEGER REFERENCES users (id) ON DELETE CASCADE
"""
cursor.execute(query_add_id_column)

<sqlite3.Cursor at 0x7c49a5140ec0>

In [48]:
# Add corresponding id's to entry_df
query_add_ids = """
UPDATE entry_df
SET user_id = (
  SELECT id
  FROM users
  WHERE name = username
)
"""
cursor.execute(query_add_ids)

<sqlite3.Cursor at 0x7c49a5140ec0>

In [53]:
# Drop username column from entry_df as it's no longer needed
query_drop_username = """
ALTER TABLE entry_df
DROP COLUMN username
"""
cursor.execute(query_drop_username)

<sqlite3.Cursor at 0x7c49a5140ec0>

***

In [49]:
# Sanity check -> list the tables recorded in the schema table
queryt = 'SELECT * FROM sqlite_schema'
resultt = cursor.execute(queryt)
rowst = resultt.fetchall()

schema_columns = [column[0] for column in resultt.description]
pd.DataFrame(rowst, columns=schema_columns)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,entry_df,entry_df,2,"CREATE TABLE entry_df (\n\tusername TEXT, \n\t..."
1,table,task_data,task_data,237,"CREATE TABLE task_data (\n\t""Task"" TEXT, \n\t""..."
2,table,users,users,240,CREATE TABLE users(\n \tid INTEGER PRIMARY KE...
3,table,sqlite_sequence,sqlite_sequence,241,"CREATE TABLE sqlite_sequence(name,seq)"


In [50]:
# Sanity check -> look up a single row of the users table by id
querytes = 'SELECT * FROM users WHERE id = 7'
resulttes = cursor.execute(querytes)
rowstes = resulttes.fetchall()

user_columns = [column[0] for column in resulttes.description]
pd.DataFrame(rowstes, columns=user_columns)

Unnamed: 0,id,name
0,7,117370


In [54]:
# Sanity check -> first rows of entry_df after the schema changes
querytes = 'SELECT * FROM entry_df LIMIT 5'
resulttes = cursor.execute(querytes)
rowstes = resulttes.fetchall()

entry_columns_after = [column[0] for column in resulttes.description]
pd.DataFrame(rowstes, columns=entry_columns_after)

Unnamed: 0,status,task_name,createdAt,votes,comments,url,user_id
0,published,Multiply (Intro),,,,,1
1,published,Multiply (Intro),2022-11-21 14:02,9.0,0.0,https://py.checkio.org/mission/multiply-intro/...,2
2,published,Multiply (Intro),2022-11-21 15:31,0.0,0.0,https://py.checkio.org/mission/multiply-intro/...,3
3,published,Multiply (Intro),,,,,4
4,published,Multiply (Intro),2022-11-21 15:39,0.0,0.0,https://py.checkio.org/mission/multiply-intro/...,5


In [55]:
# Sanity check -> joining entry_df with users through user_id
# The name is written in single quotes: SQLite reads double-quoted text as an
# identifier first and only falls back to a string literal as a legacy quirk.
# Columns are qualified with their table so the join condition is unambiguous.
querytes = """
SELECT * FROM entry_df
JOIN users
ON entry_df.user_id = users.id
WHERE users.name = '117370'
"""
resulttes = cursor.execute(querytes)
rowstes = resulttes.fetchall()

# The first item of every description tuple is the column name
pd.DataFrame(rowstes, columns=[column[0] for column in resulttes.description])

Unnamed: 0,status,task_name,createdAt,votes,comments,url,user_id,id,name
0,published,Multiply (Intro),2022-11-28 14:57,0,1,https://py.checkio.org/mission/multiply-intro/...,7,7,117370
1,published,Acceptable Password I,2022-11-28 15:02,0,0,https://py.checkio.org/mission/acceptable-pass...,7,7,117370
2,published,Is Even,2022-11-28 15:10,0,0,https://py.checkio.org/mission/is-even/publica...,7,7,117370
3,published,First Word (simplified),2023-01-26 15:57,0,0,https://py.checkio.org/mission/first-word-simp...,7,7,117370
4,published,Number Length,2023-01-26 21:43,0,0,https://py.checkio.org/mission/number-length/p...,7,7,117370
...,...,...,...,...,...,...,...,...,...
243,new,Weak Point,,,,,7,7,117370
244,opened,Working Hours Calculator,,,,,7,7,117370
245,opened,Work Schedule Generator,,,,,7,7,117370
246,new,Xs and Os Champion,,,,,7,7,117370
