# Description
This notebook reads and cleans up the SQL dump of "Spaces" provided by Fernando, so that the data is ready to be used in the pipeline. 

# Setup
To run this notebook you need a MariaDB server on which the SQL file has been imported into a new database. Adjust the connection settings in the next section so that they match your host. 

In [3]:
import json
import os
import warnings

import mysql.connector
import pandas as pd

# disable warnings
# NOTE(review): this silences every warning for the rest of the session.
# Presumably it targets the warning pandas emits when read_sql receives a raw
# DBAPI connection instead of an SQLAlchemy one -- confirm, and consider
# narrowing the filter so unrelated warnings stay visible.
warnings.filterwarnings('ignore')

# Connection settings. Each value can be overridden through an environment
# variable so that real credentials never have to be written into (and
# committed with) the notebook; the fallbacks are the local placeholders.
DB_CONFIG = {
    'user': os.environ.get('SPACES_DB_USER', 'user'),
    'password': os.environ.get('SPACES_DB_PASSWORD', '1234'),
    'host': os.environ.get('SPACES_DB_HOST', '127.0.0.1'),
    'database': os.environ.get('SPACES_DB_NAME', 'spaces'),
}

# connect to database
conn = mysql.connector.connect(**DB_CONFIG)

In [4]:

def _read_table(table_name):
    """Return every row of ``table_name`` as a DataFrame, read over ``conn``.

    ``table_name`` is interpolated into the SQL text, so it must only ever be
    one of the fixed table names used below, never user-supplied input.
    """
    return pd.read_sql(f"SELECT * FROM {table_name}", conn)


# The reads are wrapped in try/finally so the connection is closed even when
# one of them fails; otherwise an aborted run leaves a session open.
try:
    # spaces and their posts
    df_spaces = _read_table("spaces")
    df_posts = _read_table("posts")

    # pages, tags and the tag assignments
    df_pages = _read_table("pages")
    df_tags = _read_table("tags")
    df_taggables = _read_table("taggables")

    # users and their space memberships
    df_users = _read_table("users")
    df_space_user = _read_table("space_user")

    # learning materials
    df_books = _read_table("books")
    df_files = _read_table("files")
finally:
    # close connection
    conn.close()

In [5]:
# Summarise the spaces table: column names, dtypes and non-null counts.
df_spaces.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   id               178 non-null    int64         
 1   nature           178 non-null    object        
 2   name             178 non-null    object        
 3   abbreviation     178 non-null    object        
 4   slug             178 non-null    object        
 5   description      155 non-null    object        
 6   color_scheme_id  178 non-null    int64         
 7   created_at       178 non-null    datetime64[ns]
 8   updated_at       178 non-null    datetime64[ns]
 9   deleted_at       3 non-null      datetime64[ns]
 10  settings         128 non-null    object        
dtypes: datetime64[ns](3), int64(2), object(6)
memory usage: 15.4+ KB


In [6]:
# --- page contents -----------------------------------------------------------
# Attach every page to its space (inner join on the space id). Both tables
# have a slug column, so the page slug ends up as slug_y.
space_pages = df_spaces.merge(df_pages, left_on='id', right_on='space_id')

# Reshape to one row per space and one column per page slug, each cell holding
# that page's content, then give the columns their final names.
page_contents = (
    space_pages
    .pivot(index='space_id', columns='slug_y', values='content')
    .rename(columns={
        'portrait': 'portrait_content',
        'sidebar': 'sidebar_content',
        'exercise': 'exercise_content',
    })
)

# Add the page content columns to the spaces table.
df_spaces = df_spaces.merge(page_contents, left_on='id', right_on='space_id')

# --- tags --------------------------------------------------------------------
# Resolve each tag assignment to its tag record.
tag_links = df_tags.merge(df_taggables, left_on='id', right_on='tag_id')

# Collapse to one row per taggable_id, keeping all of its tag names in a list.
# NOTE(review): rows are matched on taggable_id alone; if taggables also links
# tags to other kinds of records, their ids could collide with space ids --
# confirm against the table schema.
tags_per_item = (
    tag_links
    .groupby('taggable_id')
    .agg({'name': lambda names: list(names)})
    .reset_index()
)

# Attach the tag lists to the spaces, drop the bookkeeping columns and restore
# the column names: name_x is the space name, name_y the list of tags.
df_spaces = df_spaces.merge(tags_per_item, left_on='id', right_on='taggable_id')
df_spaces = df_spaces.drop(
    ['taggable_id', 'color_scheme_id', 'created_at', 'updated_at', 'deleted_at'],
    axis=1,
)
df_spaces = df_spaces.rename(columns={'name_x': 'name', 'name_y': 'tags'})

In [7]:
# Work on the textual form of the settings column; missing settings become the
# string 'None', which matches neither pattern below and therefore yields NaN.
# NOTE(review): the values are presumably JSON objects such as
# {"ects": 4, "semester": "HS23"}, possibly delivered as bytes -- confirm.
settings_text = df_spaces['settings'].astype(str)

# Look each setting up by its key instead of by its position in the string.
# Splitting on ', ' and assuming "first = ects, second = semester" put the
# digits of a semester such as "HS23" into ects whenever ects was absent, and
# left the whitespace that follows the colon inside the semester value.

# ects: only the leading run of digits of the value (quoted or unquoted)
df_spaces['ects'] = settings_text.str.extract(r'"ects"\s*:\s*"?(\d+)', expand=False)

# semester: the quoted string value, without the surrounding quotes
df_spaces['semester'] = settings_text.str.extract(r'"semester"\s*:\s*"([^"]*)"', expand=False)

# drop the settings column
df_spaces = df_spaces.drop('settings', axis=1)

In [10]:
# helper that turns the JSON-encoded tags of one space into plain values
def extract_tags(tags):
    """Return the first value of every JSON-encoded tag in ``tags``.

    Each element of ``tags`` is parsed with ``json.loads`` and has to decode
    to a non-empty mapping; the first value of that mapping is collected.
    The order of ``tags`` is preserved in the returned list.
    """
    return [list(json.loads(raw_tag).values())[0] for raw_tag in tags]

# Decode the tags of every space into a new tag_values column, then discard
# the raw tags column it was built from.
df_spaces = (
    df_spaces
    .assign(tag_values=df_spaces['tags'].apply(extract_tags))
    .drop(columns='tags')
)

In [11]:
# Preview the first rows of the cleaned spaces table.
df_spaces.head()

Unnamed: 0,id,nature,name,abbreviation,slug,description,exercise_content,portrait_content,sidebar_content,ects,semester,tag_values
0,1,learning,Digital kommunizieren,dko,digital-kommunizieren,"#dko Informationen, Ankündigungen, Antworten a...","<p>Im Trainingscenter findest Du Aufgaben, die...",<h2>Wegleitung</h2><p>Wie oft hast du in deine...,"<table><tbody><tr><th colspan=""1"" rowspan=""1"">...",2.0,,"[Kommunikation, Portfolio, Basis, Deutsch, Eng..."
1,2,exchanging,DS Spaces 🚀,,ds-spaces,Die einzigartige Lernumgebung des Studiengange...,,<h2>Warum ein neues Spaces und what’s next?</h...,,,,"[Deutsch, Feedback]"
2,3,exchanging,FachexpertInnen Lounge,,fachexpertinnen-lounge,Dies ist eine geschlossene Gruppe für die Fach...,,<h2>Studiengangsdokumente</h2><p><strong>Ausbi...,<p>Nächste Austausch Termine:</p><p><strong>Ha...,,,[Lounge]
3,4,exchanging,Einführungswoche 🗓,,einfuehrungswoche,Vor dem offiziellen Herbstsemesterstart steht ...,,<h2>Was erwartet euch?</h2><p>Die Einführungsw...,<p><strong>Einführungstage Campus:</strong><br...,,,[Anlass]
4,5,practicing,Steinschlagrisiko,cwm1,steinschlagrisiko,1Da - Mit dieser Challenge tragen Sie zur Sich...,,<h2>Aufgabenstellung</h2><p>Die Kantonsstrasse...,"<table><tbody><tr><td colspan=""1"" rowspan=""1"">...",4.0,HS23,"[Basis, Deutsch, Englisch, Challenge]"


In [15]:
# save dataframe to parquet file
# (relative path: the file is written to the current working directory)
df_spaces.to_parquet('spaces.parquet')