In [42]:
import pandas as pd

# Load the datasets
companies = pd.read_csv('companies.csv')
company_industries = pd.read_csv('company_industries.csv')
company_specialities = pd.read_csv('company_specialities.csv')
employee_counts = pd.read_csv('employee_counts.csv')
benefits = pd.read_csv('benefits.csv')
job_industries = pd.read_csv('job_industries.csv')
job_skills = pd.read_csv('job_skills.csv')
salaries = pd.read_csv('salaries.csv')
industries = pd.read_csv('industries.csv')
skills = pd.read_csv('skills.csv')
postings = pd.read_csv('postings.csv')


In [43]:
# Display the first few rows and summary of each dataframe
dataframes = [companies, company_industries, company_specialities, employee_counts, benefits, job_industries, job_skills, salaries, industries, skills, postings]
names = ['companies', 'company_industries', 'company_specialities', 'employee_counts', 'benefits', 'job_industries', 'job_skills', 'salaries', 'industries', 'skills', 'postings']

for name, df in zip(names, dataframes):
    print(f"Dataset: {name}")
    print(df.head(), '\n')
    print(df.info(), '\n')
    print(df.describe(include='all'), '\n')
    print('-' * 80)


Dataset: companies
   company_id                        name  \
0        1009                         IBM   
1        1016               GE HealthCare   
2        1025  Hewlett Packard Enterprise   
3        1028                      Oracle   
4        1033                   Accenture   

                                         description  company_size  state  \
0  At IBM, we do more than work. We create. We cr...           7.0     NY   
1  Every day millions of people feel the impact o...           7.0      0   
2  Official LinkedIn of Hewlett Packard Enterpris...           7.0  Texas   
3  We’re a cloud technology company that provides...           7.0  Texas   
4  Accenture is a leading global professional ser...           7.0      0   

  country              city zip_code                                address  \
0      US  Armonk, New York    10504  International Business Machines Corp.   
1      US           Chicago        0                                      -   
2      US 

In [44]:
def preprocess_companies(df):
    # Drop rows with missing company_id as it's a primary key
    df.dropna(subset=['company_id'], inplace=True)
    
    # Fill missing values
    df['name'].fillna('Unknown', inplace=True)
    df['description'].fillna('No description available', inplace=True)
    df['company_size'].fillna(df['company_size'].median(), inplace=True)
    df['state'].fillna('Unknown', inplace=True)
    df['zip_code'].fillna('Unknown', inplace=True)
    df['address'].fillna('No address available', inplace=True)
    
    return df

companies = preprocess_companies(companies)

print("Companies:")
print(companies.head())

Companies:
   company_id                        name  \
0        1009                         IBM   
1        1016               GE HealthCare   
2        1025  Hewlett Packard Enterprise   
3        1028                      Oracle   
4        1033                   Accenture   

                                         description  company_size  state  \
0  At IBM, we do more than work. We create. We cr...           7.0     NY   
1  Every day millions of people feel the impact o...           7.0      0   
2  Official LinkedIn of Hewlett Packard Enterpris...           7.0  Texas   
3  We’re a cloud technology company that provides...           7.0  Texas   
4  Accenture is a leading global professional ser...           7.0      0   

  country              city zip_code                                address  \
0      US  Armonk, New York    10504  International Business Machines Corp.   
1      US           Chicago        0                                      -   
2      US         

In [45]:
def preprocess_company_industries(df):
    # Drop duplicates
    df.drop_duplicates(inplace=True)
    
    return df

company_industries = preprocess_company_industries(company_industries)
print(df)

print("\nCompany Industries:")
print(company_industries.head())

            job_id                     company_name  \
0           921716            Corcoran Sawyer Smith   
1          1829192                              NaN   
2         10998357           The National Exemplar    
3         23221523           Abrams Fensterman, LLP   
4         35982263                              NaN   
...            ...                              ...   
123844  3906267117                     Lozano Smith   
123845  3906267126                        Pinterest   
123846  3906267131                     EPS Learning   
123847  3906267195  Trelleborg Applied Technologies   
123848  3906267224                        Solugenix   

                                                    title  \
0                                   Marketing Coordinator   
1                       Mental Health Therapist/Counselor   
2                             Assitant Restaurant Manager   
3       Senior Elder Law / Trusts and Estates Associat...   
4                                 

In [46]:
def preprocess_company_specialities(df):
    # Drop duplicates
    df.drop_duplicates(inplace=True)
    
    return df

company_specialities = preprocess_company_specialities(company_specialities)

print("\nCompany Specialities:")
print(company_specialities.head())


Company Specialities:
   company_id              speciality
0    22292832      window replacement
1    22292832  patio door replacement
2       20300      Commercial Banking
3       20300          Retail Banking
4       20300                Mortgage


In [47]:
def preprocess_employee_counts(df):
    # Convert time_recorded to datetime
    df['time_recorded'] = pd.to_datetime(df['time_recorded'], unit='s')
    
    return df

employee_counts = preprocess_employee_counts(employee_counts)

print("\nEmployee Counts:")
print(employee_counts.head())


Employee Counts:
   company_id  employee_count  follower_count       time_recorded
0      391906             186           32508 2024-04-05 19:42:53
1    22292832             311            4471 2024-04-05 19:42:53
2       20300            1053            6554 2024-04-05 19:42:53
3     3570660             383           35241 2024-04-05 19:42:53
4      878353              52           26397 2024-04-05 19:42:53


In [48]:
def preprocess_benefits(df):
    # Drop duplicates
    df.drop_duplicates(inplace=True)
    
    return df

benefits = preprocess_benefits(benefits)

print("\nBenefits:")
print(benefits.head())


Benefits:
       job_id  inferred                     type
0  3887473071         0        Medical insurance
1  3887473071         0         Vision insurance
2  3887473071         0         Dental insurance
3  3887473071         0                   401(k)
4  3887473071         0  Student loan assistance


In [49]:
def preprocess_job_industries(df):
    # Drop duplicates
    df.drop_duplicates(inplace=True)
    
    return df

job_industries = preprocess_job_industries(job_industries)


print("\nJob Industries:")
print(job_industries.head())


Job Industries:
       job_id  industry_id
0  3884428798           82
1  3887473071           48
2  3887465684           41
3  3887467939           82
4  3887467939           80


In [50]:
def preprocess_job_skills(df):
    # Drop duplicates
    df.drop_duplicates(inplace=True)
    
    return df

job_skills = preprocess_job_skills(job_skills)

print("\nJob Skills:")
print(job_skills.head())



Job Skills:
       job_id skill_abr
0  3884428798      MRKT
1  3884428798        PR
2  3884428798       WRT
3  3887473071      SALE
4  3887465684       FIN


In [51]:
def preprocess_salaries(df):
    # Handle missing values
    df['max_salary'].fillna(df['max_salary'].median(), inplace=True)
    df['med_salary'].fillna(df['med_salary'].median(), inplace=True)
    df['min_salary'].fillna(df['min_salary'].median(), inplace=True)
    
    return df

salaries = preprocess_salaries(salaries)

print("\nSalaries:")
print(salaries.head())



Salaries:
   salary_id      job_id  max_salary  med_salary  min_salary pay_period  \
0          1  3884428798     85000.0        20.0     62300.0     HOURLY   
1          2  3887470552        25.0        25.0        23.0     HOURLY   
2          3  3884431523    120000.0        25.0    100000.0     YEARLY   
3          4  3884911725    200000.0        25.0     10000.0     YEARLY   
4          5  3887473220        35.0        25.0        33.0     HOURLY   

  currency compensation_type  
0      USD       BASE_SALARY  
1      USD       BASE_SALARY  
2      USD       BASE_SALARY  
3      USD       BASE_SALARY  
4      USD       BASE_SALARY  


In [52]:
def preprocess_industries(df):
    # Drop rows with missing industry_name
    df.dropna(subset=['industry_name'], inplace=True)
    
    # Fill missing values
    df['industry_name'].fillna('Unknown', inplace=True)
    
    return df

industries = preprocess_industries(industries)

print("\nIndustries:")
print(industries.head())

print("\nSkills:")
print(skills.head())


Industries:
   industry_id                         industry_name
0            1       Defense and Space Manufacturing
1            3       Computer Hardware Manufacturing
2            4                  Software Development
3            5          Computer Networking Products
4            6  Technology, Information and Internet

Skills:
  skill_abr          skill_name
0       ART        Art/Creative
1      DSGN              Design
2      ADVR         Advertising
3      PRDM  Product Management
4      DIST        Distribution


In [53]:
def preprocess_skills(df):
    # Drop duplicates
    df.drop_duplicates(inplace=True)
    
    return df

skills = preprocess_skills(skills)

print("\nPostings:")
print(postings.head())



Postings:
     job_id            company_name  \
0    921716   Corcoran Sawyer Smith   
1   1829192                     NaN   
2  10998357  The National Exemplar    
3  23221523  Abrams Fensterman, LLP   
4  35982263                     NaN   

                                               title  \
0                              Marketing Coordinator   
1                  Mental Health Therapist/Counselor   
2                        Assitant Restaurant Manager   
3  Senior Elder Law / Trusts and Estates Associat...   
4                                 Service Technician   

                                         description  max_salary pay_period  \
0  Job descriptionA leading real estate firm in N...        20.0     HOURLY   
1  At Aspen Therapy and Wellness , we are committ...        50.0     HOURLY   
2  The National Exemplar is accepting application...     65000.0     YEARLY   
3  Senior Associate Attorney - Elder Law / Trusts...    175000.0     YEARLY   
4  Looking for HVAC se

In [55]:
# Merge companies with company_industries
merged_df = companies.merge(company_industries, on='company_id', how='left')

# Merge with company_specialities
merged_df = merged_df.merge(company_specialities, on='company_id', how='left')

print (merged_df)


        company_id                               name  \
0             1009                                IBM   
1             1009                                IBM   
2             1009                                IBM   
3             1009                                IBM   
4             1009                                IBM   
...            ...                                ...   
176205   103463217                       JRC Services   
176206   103466352             Centent Consulting LLC   
176207   103467540  Kings and Queens Productions, LLC   
176208   103468936                           WebUnite   
176209   103472979                            BlackVe   

                                              description  company_size  \
0       At IBM, we do more than work. We create. We cr...           7.0   
1       At IBM, we do more than work. We create. We cr...           7.0   
2       At IBM, we do more than work. We create. We cr...           7.0   
3       At IBM,