In [1]:
import pandas as pd
import json
import datetime

pd.set_option('display.max_rows', 100)

# Подготовка данных

In [2]:
print(datetime.datetime.now())

2022-03-09 01:33:44.904133


## Загрузка данных

In [3]:
with open('data/json/reformatted_li.json', 'r') as f:
    data_reformatted_li_json = json.load(f)
    
with open("data/json/reformatted_nonli_extra.json", "r") as f:
    data_reformatted_nonli_extra = json.load(f)
    
with open("data/json/reformatted_nonli.json", "r") as f:
    data_reformatted_nonli = json.load(f)

### uuid_x_education

In [4]:
def education_json_to_pandas(data_json):

    education_schema = [
        'org_uuid', 
        'education_school_name', 
        'education_field_of_study',
        'education_description',
        'education_degree',
        'education_start_date_year',
        'education_end_date',
        'education_school_link'
    ]

    df_education = pd.DataFrame(columns = education_schema)

    for i, item in enumerate(data_json):

        print('{} / {}'.format(i+1, len(data_json)), end="\r")

        df_education_batch = pd.DataFrame(columns = education_schema)

        for item_education in item['educations']:
            row = [
                item['org_uuid'], 
                item_education['education_school_name'],
                item_education['education_field_of_study'], 
                item_education['education_description'],
                item_education['education_degree'], 
                item_education['education_start_date_year'],
                item_education['education_end_date'], 
                item_education['education_school_link'],
            ]
            df_education_batch = pd.concat(
                [
                    df_education_batch, 
                    pd.DataFrame([row], columns=education_schema)
                ]
            )

        df_education = pd.concat([df_education, df_education_batch])
    
    return df_education

In [5]:
df_education = education_json_to_pandas(data_reformatted_nonli)
df_education.to_csv('data/uuid_x_education_fields.csv')

2028 / 2028

### uuid_x_experiences

In [6]:
def experience_json_to_pandas(data_json):

    experience_schema = [
        'org_uuid', 
        'experience_title', 
        'experience_start_date',
        'experience_end_date',
        'experience_headcount_range',
        'experience_company_name',
        'experience_company_link',
        'experience_company_id_link',
        'experience_industry',
        'experience_location',
        'experience_description'
    ]

    df_experience = pd.DataFrame(columns = experience_schema)

    for i, item in enumerate(data_json):

        print('{} / {}'.format(i+1, len(data_json)), end="\r")

        df_experience_batch = pd.DataFrame(columns = experience_schema)

        for item_experience in item['experiences']:
            row = [
                item['org_uuid'], 
                item_experience['experience_title'],
                item_experience['experience_start_date'], 
                item_experience['experience_end_date'],
                item_experience['experience_headcount_range'], 
                item_experience['experience_company_name'],
                item_experience['experience_company_link'], 
                item_experience['experience_company_id_link'],
                item_experience['experience_industry'],
                item_experience['experience_location'],
                item_experience['experience_description']
            ]
            df_experience_batch = pd.concat(
                [
                    df_experience_batch, 
                    pd.DataFrame([row], columns=experience_schema)
                ]
            )

        df_experience = pd.concat([df_experience, df_experience_batch])
    
    return df_experience

In [7]:
df_experience = experience_json_to_pandas(data_reformatted_nonli)
df_experience.to_csv('data/uuid_x_experience_fields.csv')

2028 / 2028

### uuid_x_skills

In [8]:
def skills_json_to_pandas(data_json):

    skills_schema = [
        'org_uuid', 
        'skill',
    ]

    df_skills = pd.DataFrame(columns = skills_schema)

    for i, item in enumerate(data_json):

        print('{} / {}'.format(i+1, len(data_json)), end="\r")

        df_skills_batch = pd.DataFrame(columns = skills_schema)

        for item_skill in item['skills']:
            row = [
                item['org_uuid'],
                item_skill['skill']
            ]
            df_skills_batch = pd.concat(
                [
                    df_skills_batch, 
                    pd.DataFrame([row], columns=skills_schema)
                ]
            )

        df_skills = pd.concat([df_skills, df_skills_batch])
    
    return df_skills

In [9]:
df_skills = skills_json_to_pandas(data_reformatted_nonli)
df_skills.to_csv('data/uuid_x_skills_fields.csv')

2028 / 2028

## Таблицы

In [10]:
df_education = pd.read_csv('data/uuid_x_education_fields.csv', index_col=0)
df_experience = pd.read_csv('data/uuid_x_experience_fields.csv', index_col=0)
df_skills = pd.read_csv('data/uuid_x_skills_fields.csv', index_col=0)

In [15]:
df_education.head(5)

Unnamed: 0,org_uuid,education_school_name,education_field_of_study,education_description,education_degree,education_start_date_year,education_end_date,education_school_link
0,438c7e89-ba0c-b8cc-7ea6-1a0ac47ae40d,Davidson College,English,,BA,,,https://www.linkedin.com/school/davidson-college/
0,438c7e89-ba0c-b8cc-7ea6-1a0ac47ae40d,Queen Mary University of London,"English, History",Semester study abroad experience.,,,,https://www.linkedin.com/school/queen-mary-uni...
0,b37187f2-8ad4-8225-cfa2-2757f2ef9bc4,Kyungpook National University,"Electrical, Electronics and Communications Eng...",,Bachelor of Science (BS),1985.0,1989.0,https://www.linkedin.com/school/%EA%B2%BD%EB%B...
0,75e76c56-3026-404d-079d-be335a63d9ff,Highline College,,"Undergraduate Studies, One Year \nTechnology S...",,,,https://www.linkedin.com/school/highline-college/
0,75e76c56-3026-404d-079d-be335a63d9ff,Highline Community Collage,,,,,,


In [12]:
df_experience.head(5)

Unnamed: 0,org_uuid,experience_title,experience_start_date,experience_end_date,experience_headcount_range,experience_company_name,experience_company_link,experience_company_id_link,experience_industry,experience_location,experience_description
0,438c7e89-ba0c-b8cc-7ea6-1a0ac47ae40d,Senior Relationship Manager,Feb 2014,Jan 2016,51-200,edo Interactive,https://www.linkedin.com/company/edo-interactive/,https://www.linkedin.com/company/484307,80.0,"Nashville, TN",
0,438c7e89-ba0c-b8cc-7ea6-1a0ac47ae40d,Relationship Manager,Oct 2012,Feb 2014,51-200,edo Interactive,https://www.linkedin.com/company/edo-interactive/,https://www.linkedin.com/company/484307,80.0,"Nashville, TN","Venture-backed start-up, edo helps advertisers..."
0,438c7e89-ba0c-b8cc-7ea6-1a0ac47ae40d,Account Manager,Mar 2010,Jul 2012,51-200,Wray Ward,https://www.linkedin.com/company/wray-ward/,https://www.linkedin.com/company/50792,80.0,,Developed and implemented strategic marketing ...
0,438c7e89-ba0c-b8cc-7ea6-1a0ac47ae40d,Assistant Account Manager,May 2007,Feb 2010,51-200,Wray Ward,https://www.linkedin.com/company/wray-ward/,https://www.linkedin.com/company/50792,80.0,,
0,438c7e89-ba0c-b8cc-7ea6-1a0ac47ae40d,"Account Coordinator, Broadcast Traffic Manager",Aug 2006,May 2007,51-200,Wray Ward,https://www.linkedin.com/company/wray-ward/,https://www.linkedin.com/company/50792,80.0,,


In [13]:
df_skills.head(5)

Unnamed: 0,org_uuid,skill
0,438c7e89-ba0c-b8cc-7ea6-1a0ac47ae40d,Advertising
0,438c7e89-ba0c-b8cc-7ea6-1a0ac47ae40d,Email Marketing
0,438c7e89-ba0c-b8cc-7ea6-1a0ac47ae40d,Social Media
0,438c7e89-ba0c-b8cc-7ea6-1a0ac47ae40d,Event Planning
0,438c7e89-ba0c-b8cc-7ea6-1a0ac47ae40d,Marketing
