In [None]:
# Create mapping for values
# NOTE(review): 'column_name' and the 'old_value*'/'new_value*' entries look like
# template placeholders - confirm and replace them before running this cell.
mapping = {'old_value1': 'new_value1', 'old_value2': 'new_value2'}
# Series.map with a dict turns every value that is not a key of the dict into NaN.
df['column_name'] = df['column_name'].map(mapping)

In [None]:
# Rename simple attributes from the raw snake_case column names to the
# camelCase names used further down in this notebook.
# DataFrame.rename silently ignores keys that are not existing columns, so a
# misspelled source name does not raise - it just leaves the column untouched.
# 'connections' and 'github' keep their names and therefore need no entry.
# NOTE(review): no 'location' column shows up in the check_data output of the
# dataset; confirm that this entry (and mapping raw 'languages' onto an id
# column) is really intended.
df = df.rename(columns={
    'public_identifier': 'linkedInPubId',
    'profile_pic_url': 'profilePictureUrl',
    'background_cover_image_url': 'backgroundPictureUrl',
    'first_name': 'firstName',
    'last_name': 'lastName',
    'headline': 'profileHeadline',
    'summary': 'profileText',
    'location': 'idLocation',
    'languages': 'idLanguages',
})

def map_simple_data(json_object):
    """
    Collect the simple (scalar) person attributes of one profile record.

    Parameters:
        json_object: mapping-like object offering ``.get(key)``.

    Returns:
        dict with one entry per output attribute; an attribute whose source
        key is absent from ``json_object`` is set to None.
    """
    # Output key -> key looked up in json_object.
    # NOTE(review): the first three attributes are read under their raw
    # snake_case names while all others are read under camelCase names -
    # confirm which naming the records passed in actually use.
    source_keys = {
        'linkedInPubId': 'public_identifier',
        'profilePictureUrl': 'profile_pic_url',
        'backgroundPictureUrl': 'background_cover_image_url',
        'firstName': 'firstName',
        'lastName': 'lastName',
        'occupation': 'occupation',
        'profileHeadline': 'profileHeadline',
        'profileText': 'profileText',
        'idLocation': 'idLocation',
        'idLanguages': 'idLanguages',
        'connections': 'connections',
        'github': 'github',
        'facebook': 'facebook',
        'gender': 'gender',
        'birthDate': 'birthDate',
        'industry': 'industry',
    }

    # Return dictionary of mapped values
    return {target: json_object.get(source) for target, source in source_keys.items()}

In [17]:
# Imports
import os
from urllib.parse import quote

import dotenv
import pandas as pd
from sqlalchemy import create_engine


# Load environment variables (from the .env file located by find_dotenv)
dotenv.load_dotenv(dotenv.find_dotenv())
hostname = os.getenv('DATABASE_HOST')
user = os.getenv('DATABASE_USER')
password = os.getenv('DATABASE_PASSWORD')
database = os.getenv('DATABASE_NAME')

# Set up database connection using SSL.
# User name and password are percent-encoded before they are placed in the URL,
# so characters such as '@', ':' or '/' inside a credential cannot be mistaken
# for URL delimiters. str() keeps the previous result for an unset variable.
db_engine = create_engine(
    f"mysql+pymysql://{quote(str(user), safe='')}:{quote(str(password), safe='')}"
    f"@{hostname}/{database}",
    connect_args={
        'ssl_ca': os.getenv('SSL_CA'),
        'ssl_cert': os.getenv('SSL_CLIENT_CERT'),
        'ssl_key': os.getenv('SSL_KEY'),
    },
)

# The alternative below is kept as real comments: a bare string literal at the
# end of a cell would be echoed as the cell's output.
#
# !!!UNENCRYPTED CONNECTION!!!
# For use within local network only!
# db_engine = create_engine(f'mysql+pymysql://{user}:{password}@{hostname}/{database}')
# !!!UNENCRYPTED CONNECTION!!!

ModuleNotFoundError: No module named 'pymysql'

In [27]:
# Function to check data types and max lengths of values
def check_data(dataset):
    """
    Print name, dtype and maximum value length of every column of a DataFrame.

    The length comes from the pandas ``.str`` accessor. For columns that do
    not support ``.str`` the text 'No string data ' is printed instead.

    Parameters:
        dataset: pandas DataFrame to inspect.
    """
    for column in dataset.columns:
        values = dataset[column]
        print(f'Column: {column}')
        print(f'Data type: {values.dtype}')
        try:
            print(f'Max length: {values.str.len().max()}')
        except AttributeError:
            # pandas raises AttributeError when .str is used on non-string
            # data; any other error is a real problem and must surface.
            print('No string data ')
        print('---\n')

In [28]:
import os
import pandas as pd

# Dataset location is taken from the DATASET_PATH environment variable
# (os.getenv returns None when the variable is not set).
path = os.getenv('DATASET_PATH')

# Read JSONs from file (lines=True: the file holds one JSON object per line)
df = pd.read_json(path, lines=True)

# Print dtype and maximum value length for every column of the loaded frame
check_data(df)

Column: public_identifier
Data type: object
Max length: 60
---

Column: profile_pic_url
Data type: object
Max length: 176
---

Column: background_cover_image_url
Data type: object
Max length: 186
---

Column: first_name
Data type: object
Max length: 63
---

Column: last_name
Data type: object
Max length: 32.0
---

Column: full_name
Data type: object
Max length: 71
---

Column: occupation
Data type: object
Max length: 135.0
---

Column: headline
Data type: object
Max length: 220.0
---

Column: summary
Data type: object
Max length: 2600.0
---

Column: country
Data type: object
Max length: 2
---

Column: country_full_name
Data type: object
Max length: 43
---

Column: city
Data type: object
Max length: 42.0
---

Column: state
Data type: object
Max length: 43.0
---

Column: experiences
Data type: object
Max length: 84
---

Column: education
Data type: object
Max length: 30
---

Column: languages
Data type: object
Max length: 9
---

Column: accomplishment_organisations
Data type: object
Max 

In [None]:
# Function to check if language already exists
def check_and_insert_language(languages_data_entry, db_con):
    """
    Return the id of the DIM_Languages row matching the given entry,
    inserting the entry first when no such row exists.

    Parameters:
        languages_data_entry: dict of column values for DIM_Languages; its
            'listOfLanguages' value is the lookup key.
        db_con: database connectable handed to pandas.
            NOTE(review): MySQL's LAST_INSERT_ID() is tracked per connection,
            so this presumably has to be a single connection rather than a
            pooled engine - confirm against the caller.

    Returns:
        The 'id' of the first matching row, or the value of
        LAST_INSERT_ID() after inserting a new row.
    """
    lookup_sql = """
        SELECT id
        FROM DIM_Languages
        WHERE listOfLanguages = %(listOfLanguages)s
    """
    matches = pd.read_sql_query(lookup_sql, db_con, params=languages_data_entry)

    # Existing record -> hand back its id right away
    if not matches.empty:
        return matches.iloc[0]['id']

    # Nothing found -> append the entry as a new row, then ask for its id
    pd.DataFrame([languages_data_entry]).to_sql(
        'DIM_Languages', con=db_con, if_exists='append', index=False
    )
    return pd.read_sql_query("SELECT LAST_INSERT_ID()", db_con).iloc[0, 0]

# Function to check if location already exists
def check_and_insert_location(location_data_entry, db_con):
    """
    Return the id of the DIM_Location row matching the given entry,
    inserting the entry first when no such row exists.

    The lookup compares countryName, stateName and cityName with MySQL's
    NULL-safe operator ``<=>``. With plain ``=`` a missing state or city
    (NULL) never matches, so every such record would be inserted again.

    Parameters:
        location_data_entry: dict of column values for DIM_Location; must
            contain 'countryName', 'stateName' and 'cityName'.
        db_con: database connectable handed to pandas.
            NOTE(review): MySQL's LAST_INSERT_ID() is tracked per connection,
            so this presumably has to be a single connection rather than a
            pooled engine - confirm against the caller.

    Returns:
        The 'id' of the first matching row, or the value of
        LAST_INSERT_ID() after inserting a new row.
    """
    # Define query to find existing record (NULL-safe comparison)
    query = """
        SELECT id
        FROM DIM_Location
        WHERE countryName <=> %(countryName)s
        AND stateName <=> %(stateName)s
        AND cityName <=> %(cityName)s
    """

    # NaN is how pandas marks a missing value; send it to the database as NULL
    # (NaN is the only float that is not equal to itself).
    lookup_params = {
        key: None if isinstance(value, float) and value != value else value
        for key, value in location_data_entry.items()
    }

    # Query database for existing record
    query_result = pd.read_sql_query(query, db_con, params=lookup_params)

    # Check if a record was found or insert a new record
    if not query_result.empty:
        # Record found -> use its id
        entry_id = query_result.iloc[0]['id']
    else:
        # No record found -> insert new record and query its id
        location_df = pd.DataFrame([location_data_entry])
        location_df.to_sql('DIM_Location', con=db_con, if_exists='append', index=False)
        entry_id = pd.read_sql_query("SELECT LAST_INSERT_ID()", db_con).iloc[0, 0]

    # Return ID
    return entry_id

In [None]:
# Function to insert data origin and return its ID
def insert_data_origin(data_origin_name, data_origin_url, data_origin_comment):
    """
    Insert one row into DIM_DataOrigin and return its ID.

    The row gets the current timestamp as 'importDate'. Insert and ID lookup
    run on one connection obtained from the notebook-level ``db_engine``:
    MySQL tracks LAST_INSERT_ID() per connection, and going through the
    engine for each statement may hand out two different pooled connections.

    Parameters:
        data_origin_name: value for the 'name' column.
        data_origin_url: value for the 'url' column.
        data_origin_comment: value for the 'comment' column.

    Returns:
        The value of LAST_INSERT_ID() right after the insert.
    """
    df_origin = pd.DataFrame([{
        'name': data_origin_name,
        'url': data_origin_url,
        'importDate': pd.Timestamp.now(),
        'comment': data_origin_comment}])

    # One connection / one transaction for both statements; the transaction
    # is committed when the block exits without an error.
    with db_engine.begin() as connection:
        # Insert data into database
        df_origin.to_sql('DIM_DataOrigin', con=connection, if_exists='append', index=False)

        # Get ID of inserted data origin
        id_origin = pd.read_sql_query("SELECT LAST_INSERT_ID()", connection).iloc[0, 0]

    # Return ID of inserted data origin
    return id_origin

In [None]:
# Function to process person facts + dimensions and insert them into the database
def process_json(json_object, dimension_key_origin, db_con):
    """
    Extract data from LinkedIn profiles JSON and prepare it for insertion into DWH.

    The languages and location dimensions are resolved through
    check_and_insert_language / check_and_insert_location (defined earlier in
    this notebook), so an already stored dimension row is reused instead of
    being inserted again for every profile. The person row is then appended
    to FACT_Person.

    Missing attributes will be added as comments,
    complex attributes will be receiving their own tables and are not mapped yet.

    Parameters:
        json_object: mapping-like profile record offering ``.get(key)``,
            using the raw snake_case keys of the dataset.
        dimension_key_origin: value stored as 'idOrigin' of the person row.
        db_con: database connectable handed to pandas and to the dimension
            helpers.
            NOTE(review): MySQL's LAST_INSERT_ID() is tracked per connection,
            so this presumably has to be a single connection rather than a
            pooled engine - confirm against the caller.

    Returns:
        The value of LAST_INSERT_ID() after the FACT_Person insert.
    """

    # Extract data for Person table
    person_data = {
        'idOrigin': dimension_key_origin,
        'linkedInPubId': json_object.get('public_identifier'),
        'profilePictureUrl': json_object.get('profile_pic_url'),
        'backgroundPictureUrl': json_object.get('background_cover_image_url'),
        # Dataset keys are all lower case: 'first_name' / 'last_name'
        'firstName': json_object.get('first_name'),
        'lastName': json_object.get('last_name'),
        # Full name (skipped due to name + last name being present)
        'occupation': json_object.get('occupation'),
        'profileHeadline': json_object.get('headline'),
        'profileText': json_object.get('summary'),
        'idLocation': None,
        # Experiences
        # Education
        'idLanguages': None,
        # Accomplishments (organisations, publications, honor awards, patents, courses, projects, test scores)
        # Volunteer work
        # Certifications
        # Connections (included but no need to rename)
        # Activities
        # Similar named profiles
        # Articles
        # Groups
        # Skills
        # Infrared salary
        # Github (included but no need to rename)
        # Facebook (included but no need to rename)
        # Gender (will be converted to enum M/F)
        # NOTE(review): connections, github, facebook and gender are described
        # as included above but are not part of this dict - confirm.
        'birthDate': json_object.get('birth_date'),
        # Industry
        # Interests
    }

    # Extract data for Location table
    location_data = {
        'countryName': json_object.get('country_full_name'),
        'countryLetters': json_object.get('country'),
        'stateName': json_object.get('state'),
        'cityName': json_object.get('city')
    }

    # Extract data for Languages table.
    # The key may be absent, explicitly null (None) or NaN when the record
    # comes out of a DataFrame; all three count as "no languages".
    spoken_languages = json_object.get('languages')
    if spoken_languages is None or isinstance(spoken_languages, float):
        spoken_languages = []
    languages_data = {
        'sumOfSpoken': len(spoken_languages),
        'listOfLanguages': ', '.join(spoken_languages)
    }

    # Resolve dimension keys (existing row is reused, otherwise inserted)
    person_data['idLanguages'] = check_and_insert_language(languages_data, db_con)
    person_data['idLocation'] = check_and_insert_location(location_data, db_con)

    # Insert person data and get ID
    person_df = pd.DataFrame([person_data])
    person_df.to_sql('FACT_Person', con=db_con, if_exists='append', index=False)
    pk_person = pd.read_sql_query("SELECT LAST_INSERT_ID()", db_con).iloc[0, 0]

    # Return primary key of inserted person
    return pk_person

In [None]:
    
    # Extract data for Qualification table
    # NOTE(review): this cell is indented but has no enclosing function
    # header, and json_object is not defined in it - it looks like an
    # unfinished fragment that has to be wrapped in a def before it can run.
    # Builds one record per entry of json_object['qualifications']
    # (an absent key yields no records).
    qualification_data = []
    for qualification in json_object.get('qualifications', []):
        qualification_data.append({
            'idOrigin': json_object.get('idOrigin'),
            'idPerson': None,  # Placeholder for idPerson
            'idDuration': None,  # Placeholder for idDuration
            'type': qualification.get('type'),
            'name': qualification.get('name'),
            'idInstitution': None,  # Placeholder for idInstitution
            'description': qualification.get('description')
        })



    qualification_df = pd.DataFrame(qualification_data)

    # Insert data into the database
    # NOTE(review): 'your_database_url' looks like a placeholder; the notebook
    # already creates db_engine in an earlier cell - confirm which to use.
    engine = create_engine('your_database_url')







    # Insert data into FACT_Qualification table
    qualification_df.to_sql('FACT_Qualification', con=engine, if_exists='append', index=False)