In [9]:
# Data Science Lifecycle Pipeline

# Import Necessary Libraries
import warnings
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from imblearn.combine import SMOTETomek
from tqdm import tqdm
from langchain_groq import ChatGroq
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import re
from imblearn.under_sampling import RandomUnderSampler
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import json


nltk.download('wordnet')
nltk.download('stopwords')

warnings.filterwarnings("ignore")

# FUTURISTIC PLOT STYLES - LOADED AT THE BEGINNING
plt.style.use('dark_background')

# Set up LLM with ChatGroq
llm = ChatGroq(
    model="llama3-8b-8192",
    temperature=0, 
    max_tokens=None,
    timeout=None,
    max_retries=2
)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gerri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gerri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Step 1.2: Load Data Files
# Loading Data Paths
it_job_desc_path = '../data/json/IT Job Desc Annotated Detailed.json'
salaries_path = '../data/json/salaries.json'

# Step 1.3: Load Data
with open(it_job_desc_path, 'r') as file:
    it_job_desc_data = json.load(file)

# Extract relevant information from JSON with progress tracking
records = []
for item in tqdm(it_job_desc_data['annotations'], desc="Processing Job Descriptions"):
    text = item[0].strip()
    entities = item[1]['entities']
    if text and text != '\r':  # Filter out empty or meaningless text
        records.append({'text': text, 'entities': entities})

# Convert JSON data to DataFrame
it_job_desc = pd.DataFrame(records)

# Remove job descriptions with no entities
it_job_desc = it_job_desc[it_job_desc['entities'].apply(lambda x: len(x) > 0)]

# Load salaries data
salaries_data = pd.read_json(salaries_path)

Processing Job Descriptions: 100%|██████████| 334/334 [00:00<?, ?it/s]


In [12]:
# PART 2: Data Cleaning and Exploratory Data Analysis (EDA)
# Step 2.1: Data Inspection
# Checking for Missing Values and Basic Data Info
print("IT Job Description Data Info:")
it_job_desc.info()
print("\nSalaries Data Info:")
salaries_data.info()

IT Job Description Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, 5 to 217
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      120 non-null    object
 1   entities  120 non-null    object
dtypes: object(2)
memory usage: 2.8+ KB

Salaries Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8805 entries, 0 to 8804
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           8805 non-null   int64 
 1   experience_level    8805 non-null   object
 2   employment_type     8805 non-null   object
 3   job_title           8805 non-null   object
 4   salary              8805 non-null   int64 
 5   salary_currency     8805 non-null   object
 6   salary_in_usd       8805 non-null   int64 
 7   employee_residence  8805 non-null   object
 8   remote_ratio        8805 non-null   int64 
 9   company_location  

In [13]:
salaries_data["job_title"].nunique()

124

In [14]:
# Step 2.2: Displaying Sample Rows
print("\nSample Rows from IT Job Descriptions:")
print(it_job_desc.head())
print("\nSample Rows from Salaries Data:")
print(salaries_data.head())


Sample Rows from IT Job Descriptions:
                                                 text  \
5   Uplimit is seeking a talented Analytics Engine...   
7   Collaborate with cross-functional teams to def...   
8   Design, develop, and maintain data pipelines, ...   
10  Implement data visualization tools to communic...   
12  Stay current on industry trends and best pract...   

                                             entities  
5   [[30, 48, JOB POSITION], [81, 99, JOB POSITION...  
7         [[43, 72, IT SKILLS], [77, 106, IT SKILLS]]  
8   [[30, 44, IT SKILLS], [46, 59, IT SKILLS], [65...  
10                              [[10, 28, IT SKILLS]]  
12                              [[54, 68, IT SKILLS]]  

Sample Rows from Salaries Data:
   work_year experience_level employment_type                       job_title  \
0       2023               EX              FT           Data Science Director   
1       2023               EX              FT           Data Science Director   
2     

In [15]:
# Step 2.3: Handling Missing Values
# Dropping rows with missing values for simplicity (can be handled differently based on requirements)
it_job_desc_cleaned = it_job_desc.dropna()
salaries_data_cleaned = salaries_data.dropna()

# Explanation: We drop missing values for simplicity; further handling can include imputing or flagging.

In [16]:
# Step 2.4: Summary Statistics
print("\nSummary Statistics for IT Job Descriptions:")
print(it_job_desc_cleaned.describe(include='all'))
print("\nSummary Statistics for Salaries Data:")
print(salaries_data_cleaned.describe(include='all'))


Summary Statistics for IT Job Descriptions:
                                                     text  \
count                                                 120   
unique                                                119   
top     Bachelor's degree in Computer Science, Enginee...   
freq                                                    2   

                    entities  
count                    120  
unique                   116  
top     [[0, 71, EDUCATION]]  
freq                       2  

Summary Statistics for Salaries Data:
          work_year experience_level employment_type      job_title  \
count   8805.000000             8805            8805           8805   
unique          NaN                4               4            124   
top             NaN               SE              FT  Data Engineer   
freq            NaN             6336            8762           2062   
mean    2022.737422              NaN             NaN            NaN   
std        0.542484          

The text column contains some repeated content like "Responsibilities:" that appears multiple times.
The entities column often contains empty lists ([]), indicating that not all job descriptions have annotated entities. This could affect downstream analysis, particularly the extraction of skills and qualifications.

There are 8805 entries in the salaries dataset, which is quite large. Some summary statistics like mean and median are already provided.
The column salary_in_usd has already been scaled using StandardScaler, resulting in values centered around zero.
The majority of jobs are in the United States (company_location shows a top value of 'US'), and the most frequent job title is "Data Engineer."

In [17]:
# PART 3: Enhance Data Processing - Extracting Skills and Qualifications from Entities
# Step 3.1: Analyze Entities to Understand Their Structure
print("\nSample Entities from Job Descriptions:")
print(it_job_desc_cleaned['entities'].head(10))

# Explanation: This helps us understand the content and structure of the entities to adjust the extraction logic accordingly.


Sample Entities from Job Descriptions:
5     [[30, 48, JOB POSITION], [81, 99, JOB POSITION...
7           [[43, 72, IT SKILLS], [77, 106, IT SKILLS]]
8     [[30, 44, IT SKILLS], [46, 59, IT SKILLS], [65...
10                                [[10, 28, IT SKILLS]]
12                                [[54, 68, IT SKILLS]]
14                                 [[0, 71, EDUCATION]]
15           [[19, 30, IT SKILLS], [32, 53, IT SKILLS]]
16    [[15, 18, IT LANGUAGES], [20, 23, IT LANGUAGES...
17                                [[51, 66, IT SKILLS]]
18    [[13, 26, IT SKILLS], [28, 41, IT SKILLS], [71...
Name: entities, dtype: object


In [18]:
# Step 3.2: Extracting Skills and Qualifications from Entity Labels
# Extracting entities into a separate DataFrame for better analysis
skills = []
qualifications = []
other_entities = []

for idx, row in tqdm(it_job_desc_cleaned.iterrows(), total=it_job_desc_cleaned.shape[0], desc="Extracting Skills and Qualifications"):
    text = row['text']
    entities = row['entities']
    if entities:  # Only process if entities list is not empty
        for entity in entities:
            if isinstance(entity, list) and len(entity) == 3:
                # Extract content if entity contains three elements: start, end, and label
                start_idx, end_idx, label = entity
                extracted_text = text[start_idx:end_idx].strip()
                label = label.strip().upper()
                if label in ['IT SKILLS', 'TECHNOLOGY', 'SOFT SKILLS', 'IT LANGUAGES']:
                    skills.append(extracted_text)
                elif label in ['QUALIFICATION', 'EDUCATION', 'CERTIFICATION', 'DEGREE']:
                    qualifications.append(extracted_text)
                else:
                    other_entities.append(f"{label}: {extracted_text}")

# Explanation: Extract actual content using start and end indices to get precise sections of the text, based on labeled entity types.
# Additionally, added a condition to ensure only non-empty entities are processed.

Extracting Skills and Qualifications: 100%|██████████| 120/120 [00:00<00:00, 13317.36it/s]


In [19]:
# Step 3.3: Creating DataFrames for Skills, Qualifications, and Other Entities
skills_df = pd.DataFrame(Counter(skills).most_common(), columns=['Skill', 'Frequency'])
qualifications_df = pd.DataFrame(Counter(qualifications).most_common(), columns=['Qualification', 'Frequency'])
other_entities_df = pd.DataFrame(Counter(other_entities).most_common(), columns=['Entity', 'Frequency'])

NameError: name 'Counter' is not defined

In [None]:
# Step 3.4: Displaying Most Common Skills, Qualifications, and Other Entities
print("\nMost Common Skills:")
print(skills_df.head())
print("\nMost Common Qualifications:")
print(qualifications_df.head())
print("\nOther Entities Found:")
print(other_entities_df.head())

In [None]:
# Step 3.5: Visualizing Most Common Skills
plt.figure(figsize=(10, 6))
sns.barplot(x='Frequency', y='Skill', data=skills_df.head(10), palette='viridis')
plt.title('Top 10 Most Common Skills')
plt.xlabel('Frequency')
plt.ylabel('Skill')
plt.show()

In [None]:
# Step 3.6: Visualizing Most Common Qualifications
plt.figure(figsize=(10, 6))
sns.barplot(x='Frequency', y='Qualification', data=qualifications_df.head(10), palette='viridis')
plt.title('Top 10 Most Common Qualifications')
plt.xlabel('Frequency')
plt.ylabel('Qualification')
plt.show()

In [None]:
# PART 4: Preprocessing Data for Analysis
# Step 4.1: Preprocessing Data for Analysis using TF-IDF
print("\nApplying TF-IDF Vectorization...")
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
x_tfidf = vectorizer.fit_transform(tqdm(it_job_desc_cleaned['text'], desc="TF-IDF Vectorization"))

# Align salaries data and job descriptions based on available data
min_samples = min(len(it_job_desc_cleaned), len(salaries_data_cleaned))
it_job_desc_filtered = it_job_desc_cleaned.iloc[:min_samples]
salaries_data_filtered = salaries_data_cleaned.iloc[:min_samples]


In [None]:
# PART 5: Balancing Data Using SMOTE
print("\nApplying SMOTE for Class Imbalance...")
filtered_jobs = salaries_data_filtered['job_title'].value_counts()
valid_jobs = filtered_jobs[filtered_jobs > 1].index
salaries_data_filtered = salaries_data_filtered[salaries_data_filtered['job_title'].isin(valid_jobs)]
x_tfidf_filtered = x_tfidf[:len(salaries_data_filtered)]

smote = SMOTE(random_state=42, k_neighbors=1)  # Adjusted k_neighbors to avoid error with small sample sizes
x_resampled, y_resampled = smote.fit_resample(x_tfidf_filtered, salaries_data_filtered['job_title'])


In [None]:
# PART 6: Model Training and Evaluation
# Step 6.1: Splitting Data into Training and Test Sets
print("\nSplitting Data into Training and Test Sets...")
x_train, x_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, random_state=42)


In [None]:
# Step 6.2: Training Logistic Regression Model
print("\nTraining Logistic Regression Model...")
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(x_train, y_train)

In [None]:
# Step 6.3: Evaluating Model Performance
print("\nEvaluating Model Performance...")
y_pred = logreg.predict(x_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))