In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [2]:
# Download required NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
# Load the datasets
companies = pd.read_csv('companies.csv')
company_industries = pd.read_csv('company_industries.csv')
company_specialities = pd.read_csv('company_specialities.csv')
employee_counts = pd.read_csv('employee_counts.csv')
benefits = pd.read_csv('benefits.csv')
job_industries = pd.read_csv('job_industries.csv')
job_skills = pd.read_csv('job_skills.csv')
salaries = pd.read_csv('salaries.csv')
industries = pd.read_csv('industries.csv')
skills = pd.read_csv('skills.csv')
postings = pd.read_csv('postings.csv')

In [6]:

# Display the first few rows and summary information of each dataset to understand their structure and content

print("Companies:\n", companies.head(), "\n")
print(companies.info(), "\n")

print("Company Industries:\n", company_industries.head(), "\n")
print(company_industries.info(), "\n")

print("Company Specialities:\n", company_specialities.head(), "\n")
print(company_specialities.info(), "\n")

print("Employee Counts:\n", employee_counts.head(), "\n")
print(employee_counts.info(), "\n")

print("Benefits:\n", benefits.head(), "\n")
print(benefits.info(), "\n")

print("Job Industries:\n", job_industries.head(), "\n")
print(job_industries.info(), "\n")

print("Job Skills:\n", job_skills.head(), "\n")
print(job_skills.info(), "\n")

print("Salaries:\n", salaries.head(), "\n")
print(salaries.info(), "\n")

print("Industries:\n", industries.head(), "\n")
print(industries.info(), "\n")

print("Skills:\n", skills.head(), "\n")
print(skills.info(), "\n")

print("Postings:\n", postings.head(), "\n")
print(postings.info(), "\n")

Companies:
    company_id                        name  \
0        1009                         IBM   
1        1016               GE HealthCare   
2        1025  Hewlett Packard Enterprise   
3        1028                      Oracle   
4        1033                   Accenture   

                                         description  company_size  state  \
0  At IBM, we do more than work. We create. We cr...           7.0     NY   
1  Every day millions of people feel the impact o...           7.0      0   
2  Official LinkedIn of Hewlett Packard Enterpris...           7.0  Texas   
3  We’re a cloud technology company that provides...           7.0  Texas   
4  Accenture is a leading global professional ser...           7.0      0   

  country              city zip_code                                address  \
0      US  Armonk, New York    10504  International Business Machines Corp.   
1      US           Chicago        0                                      -   
2      US        

In [7]:
# Define a function to handle missing values and duplicates
def clean_dataframe(df, df_name):
    print(f"Cleaning {df_name} DataFrame")
    initial_shape = df.shape
    print(f"Initial shape: {initial_shape}")

    # Handle missing values by dropping rows with any missing values
    df = df.dropna()

    # Handle duplicates
    df = df.drop_duplicates()

    final_shape = df.shape
    print(f"Final shape: {final_shape}\n")
    return df

In [8]:
# Clean all DataFrames
postings = clean_dataframe(postings, 'postings')
companies = clean_dataframe(companies, 'companies')
company_industries = clean_dataframe(company_industries, 'company_industries')
company_specialities = clean_dataframe(company_specialities, 'company_specialities')
employee_counts = clean_dataframe(employee_counts, 'employee_counts')
benefits = clean_dataframe(benefits, 'benefits')
job_industries = clean_dataframe(job_industries, 'job_industries')
job_skills = clean_dataframe(job_skills, 'job_skills')
salaries = clean_dataframe(salaries, 'salaries')
industries = clean_dataframe(industries, 'industries')
skills = clean_dataframe(skills, 'skills')

Cleaning postings DataFrame
Initial shape: (123849, 28)
Final shape: (0, 28)

Cleaning companies DataFrame
Initial shape: (24473, 10)
Final shape: (21522, 10)

Cleaning company_industries DataFrame
Initial shape: (24375, 2)
Final shape: (24375, 2)

Cleaning company_specialities DataFrame
Initial shape: (169387, 2)
Final shape: (169387, 2)

Cleaning employee_counts DataFrame
Initial shape: (35787, 4)
Final shape: (35787, 4)

Cleaning benefits DataFrame
Initial shape: (67943, 3)
Final shape: (67943, 3)

Cleaning job_industries DataFrame
Initial shape: (164808, 2)
Final shape: (164808, 2)

Cleaning job_skills DataFrame
Initial shape: (213768, 2)
Final shape: (213768, 2)

Cleaning salaries DataFrame
Initial shape: (40785, 8)
Final shape: (0, 8)

Cleaning industries DataFrame
Initial shape: (422, 2)
Final shape: (388, 2)

Cleaning skills DataFrame
Initial shape: (35, 2)
Final shape: (35, 2)



In [9]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [18]:
import contractions
import re

def preprocess_text_advanced(text):
    # Expand contractions
    text = contractions.fix(text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Convert to lower case
    tokens = [word.lower() for word in tokens]
    # Remove special characters and punctuation
    tokens = [re.sub(r'\W+', '', word) for word in tokens if re.sub(r'\W+', '', word).isalpha()]
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize and stem
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)


In [17]:
pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.7/110.7 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.1.0 textsearch-0.0.24


In [34]:
# Example: Merging companies with company_industries on 'company_id'
merged_df = pd.merge(companies, company_industries, on='company_id', how='left')

# Further merge with other DataFrames as needed
merged_df = pd.merge(merged_df, company_specialities, on='company_id', how='left')
merged_df = pd.merge(merged_df, employee_counts, on='company_id', how='left')


# Check the merged DataFrame
print("Merged DataFrame:\n", merged_df.head())

Merged DataFrame:
    company_id name                                        description  \
0        1009  IBM  At IBM, we do more than work. We create. We cr...   
1        1009  IBM  At IBM, we do more than work. We create. We cr...   
2        1009  IBM  At IBM, we do more than work. We create. We cr...   
3        1009  IBM  At IBM, we do more than work. We create. We cr...   
4        1009  IBM  At IBM, we do more than work. We create. We cr...   

   company_size state country              city zip_code  \
0           7.0    NY      US  Armonk, New York    10504   
1           7.0    NY      US  Armonk, New York    10504   
2           7.0    NY      US  Armonk, New York    10504   
3           7.0    NY      US  Armonk, New York    10504   
4           7.0    NY      US  Armonk, New York    10504   

                                 address  \
0  International Business Machines Corp.   
1  International Business Machines Corp.   
2  International Business Machines Corp.   
3  In