In [None]:
# Import Necessary Libraries
import warnings
import os
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from imblearn.combine import SMOTETomek
from tqdm import tqdm
from langchain_groq import ChatGroq
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import re
from imblearn.under_sampling import RandomUnderSampler
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

nltk.download('wordnet')
nltk.download('stopwords')

warnings.filterwarnings("ignore")

# FUTURISTIC PLOT STYLES - LOADED AT THE BEGINNING
plt.style.use('dark_background')

# Set up LLM with ChatGroq
llm = ChatGroq(
    model="llama3-8b-8192",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

# PART 1: Load the Data
# Loading datasets
salaries_path = '../data/json/salaries.json'
salaries_data = pd.read_json(salaries_path)

In [None]:
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
api_key = os.getenv('groq_api_key')
import os
from langchain_groq import ChatGroq

import warnings
warnings.filterwarnings("ignore")
from langchain_groq import ChatGroq

llm = ChatGroq(
    model="llama3-8b-8192",
    api_key=api_key,
    temperature=0, 
    max_tokens=None,
    timeout=None,
    max_retries=2
)

In [None]:
from langchain.prompts.prompt import PromptTemplate

In [6]:
test_prompt = "What is the capital of France?"
try:
    response = llm.generate(test_prompt)
    print("API Test Successful, Response:", response)
except Exception as e:
    print("API Test Failed. Please check your API key and network connection.", e)


API Test Failed. Please check your API key and network connection. Got unknown type W


In [None]:
# PART 2: Data Cleaning
print("\nCleaning data...")
# Drop duplicates, handle missing values, and perform any necessary preprocessing
salaries_data_cleaned = salaries_data.dropna()

# Removing outliers based on salary (e.g., removing top/bottom 1% if applicable)
q_low = salaries_data_cleaned['salary'].quantile(0.01)
q_high = salaries_data_cleaned['salary'].quantile(0.99)
salaries_data_cleaned = salaries_data_cleaned[(salaries_data_cleaned['salary'] > q_low) & (salaries_data_cleaned['salary'] < q_high)]

# Converting work_year to datetime format
salaries_data_cleaned['work_year'] = pd.to_datetime(salaries_data_cleaned['work_year'], format='%Y', errors='coerce')

# Cleaning Job Titles
print("\nCleaning Job Titles...")
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

salaries_data_cleaned['job_title_cleaned'] = salaries_data_cleaned['job_title'].str.lower()
salaries_data_cleaned['job_title_cleaned'] = salaries_data_cleaned['job_title_cleaned'].apply(lambda x: re.sub(r'[^\w\s]', '', x))  # Remove punctuations
salaries_data_cleaned['job_title_cleaned'] = salaries_data_cleaned['job_title_cleaned'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split() if word not in stop_words]))  # Lemmatization and stop word removal

# Group Rare Job Titles
print("\nGrouping Rare Job Titles...")
job_title_counts = salaries_data_cleaned['job_title_cleaned'].value_counts()
threshold = 50  # Threshold to group rare job titles
salaries_data_cleaned['job_title_grouped'] = salaries_data_cleaned['job_title_cleaned'].apply(
    lambda x: x if job_title_counts[x] >= threshold else 'Other'
)

# Using LLM to Group Similar Job Titles
print("\nUsing LLM to Group Similar Job Titles...")
job_titles_to_group = salaries_data_cleaned['job_title_grouped'].unique().tolist()
job_title_prompt = f"Group the following job titles based on similarity: {job_titles_to_group}."

try:
    # The LLM expects a list of dictionaries in a specific format, we'll use a simpler text-based prompt
    response = llm.generate([{"role": "user", "content": job_title_prompt}])
    job_title_groups = response[0]['choices'][0]['message']['content']
    print("LLM Suggested Job Title Groups:", job_title_groups)
except Exception as e:
    print("[ERROR] Could not generate job title groups using LLM.", e)
    job_title_groups = "No grouping available due to an error."


# Encoding Categorical Columns for Model Compatibility
print("\nEncoding Categorical Features...")
categorical_columns = ['experience_level', 'employment_type', 'job_title_grouped', 'employee_residence', 'company_location', 'company_size']
label_encoders = {}
for col in tqdm(categorical_columns, desc="Encoding categorical features"):
    le = LabelEncoder()
    salaries_data_cleaned[col] = le.fit_transform(salaries_data_cleaned[col])
    label_encoders[col] = le


In [None]:
# PART 3: Exploratory Data Analysis (EDA)
print("\nPerforming Exploratory Data Analysis...")
# Visualize salary distribution
plt.figure(figsize=(10, 6))
salaries_data_cleaned['salary'].plot(kind='hist', title='Salary Distribution', color='green')
plt.xlabel('Salary')
plt.show()

# Plot job title counts to understand class imbalance (Filtered for Readability)
top_n = 20
top_job_titles = salaries_data_cleaned['job_title_grouped'].value_counts().nlargest(top_n)
plt.figure(figsize=(12, 6))
top_job_titles.plot(kind='bar', title='Top Job Titles Counts', color='cyan')
plt.xlabel('Job Title')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Correlation Heatmap (Removing non-numeric columns to avoid errors)
plt.figure(figsize=(12, 8))
numeric_cols = salaries_data_cleaned.select_dtypes(include=[np.number])
sns.heatmap(numeric_cols.corr(), annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

# Box Plot for Salary by Experience Level
plt.figure(figsize=(12, 6))
sns.boxplot(data=salaries_data_cleaned, x='experience_level', y='salary', palette='viridis')
plt.title('Salary by Experience Level')
plt.xlabel('Experience Level')
plt.ylabel('Salary')
plt.show()

# Count Plot for Employment Type
plt.figure(figsize=(10, 6))
sns.countplot(data=salaries_data_cleaned, x='employment_type', palette='magma')
plt.title('Count of Employment Types')
plt.xlabel('Employment Type')
plt.ylabel('Count')
plt.show()

# Pair Plot
sns.pairplot(salaries_data_cleaned[['salary', 'remote_ratio', 'company_size']].dropna(), diag_kind='kde')
plt.show()

# Bar Plot of Top 10 Locations
top_locations = salaries_data_cleaned['employee_residence'].value_counts().nlargest(10)
plt.figure(figsize=(12, 6))
top_locations.plot(kind='bar', title='Top 10 Job Locations', color='teal')
plt.xlabel('Location')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Violin Plot for Salary vs. Job Title Grouped
plt.figure(figsize=(14, 8))
sns.violinplot(data=salaries_data_cleaned[salaries_data_cleaned['job_title_grouped'].isin(top_job_titles.index)], x='job_title_grouped', y='salary', palette='cubehelix')
plt.title('Salary Distribution by Job Title Grouped')
plt.xlabel('Job Title Grouped')
plt.ylabel('Salary')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Scatter Plot for Salary vs. Work Year
plt.figure(figsize=(10, 6))
plt.scatter(salaries_data_cleaned['work_year'].dt.year, salaries_data_cleaned['salary'], color='orange')
plt.title('Salary vs. Work Year')
plt.xlabel('Work Year')
plt.ylabel('Salary')
plt.tight_layout()
plt.show()

# Pie Chart for Experience Level Distribution
experience_level_counts = salaries_data_cleaned['experience_level'].value_counts()
plt.figure(figsize=(8, 8))
experience_level_counts.plot(kind='pie', autopct='%1.1f%%', startangle=140, colors=['skyblue', 'lightgreen', 'coral', 'plum'])
plt.title('Experience Level Distribution')
plt.ylabel('')
plt.show()

In [None]:
# PART 4: Train/Test Split (Before Data Balancing to Avoid Data Leakage)
print("\nSplitting Data into Training and Test Sets...")
x_train_full, x_test, y_train_full, y_test = train_test_split(
    salaries_data_cleaned[['job_title_grouped', 'experience_level', 'employment_type', 'remote_ratio', 'company_size']], salaries_data_cleaned['salary'], test_size=0.2, random_state=42
)

In [None]:
# PART 5: Feature Engineering
print("\nFeature Engineering...")
# Creating Word2Vec embeddings for job titles
print("\nTraining Word2Vec model...")
job_titles = [title.split() for title in x_train_full['job_title_grouped'].astype(str)]
word2vec_model = Word2Vec(sentences=job_titles, vector_size=100, window=5, min_count=1, workers=4)

def get_word2vec_vector(text, model):
    words = text.split()
    vector = np.mean([model.wv[word] for word in words if word in model.wv] or [np.zeros(model.vector_size)], axis=0)
    return vector

x_train_full['job_title_embedding'] = x_train_full['job_title_grouped'].astype(str).apply(lambda x: get_word2vec_vector(x, word2vec_model))
x_test['job_title_embedding'] = x_test['job_title_grouped'].astype(str).apply(lambda x: get_word2vec_vector(x, word2vec_model))

# Combine all features into a single array
x_train = np.hstack([np.vstack(x_train_full['job_title_embedding'].values), x_train_full.drop(columns=['job_title_grouped', 'job_title_embedding']).values])
x_test = np.hstack([np.vstack(x_test['job_title_embedding'].values), x_test.drop(columns=['job_title_grouped', 'job_title_embedding']).values])


In [None]:
# PART 6: Data Balancing (Oversampling/Undersampling)
print("\nApplying Combined Oversampling and Undersampling for Class Imbalance...")
# Balancing only the training data to avoid data leakage
try:
    rus = SMOTETomek(random_state=42, sampling_strategy='auto')
    x_train_balanced, y_train_balanced = rus.fit_resample(x_train, y_train_full)
except ValueError as e:
    print(f"\n[ERROR] {e}")
    print("\nConsider reducing 'n_neighbors' in SMOTE or use a simpler resampling strategy.")
    # Alternative: Using RandomUnderSampler if SMOTETomek fails
    rus_simple = RandomUnderSampler(random_state=42)
    x_train_balanced, y_train_balanced = rus_simple.fit_resample(x_train, y_train_full)


In [None]:
# PART 7: Model Training
print("\nTraining RandomForest Model...")
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train_balanced, y_train_balanced)


In [None]:
# PART 8: Model Evaluation
print("\nEvaluating Model Performance...")
y_pred = rf.predict(x_test)

# Using F1 Score, Precision, Recall, and Confusion Matrix instead of Accuracy
f1 = f1_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
# PART 9: Optional - Re-label or Combine Classes for Class Imbalance
print("\nConsidering Re-labeling or Combining Classes for Improved Balance...")
# Optionally re-label classes or merge similar categories to improve class balance
class_counts = pd.Series(y_train_balanced).value_counts()
rare_classes = class_counts[class_counts < 10].index
y_train_balanced = ['Other' if label in rare_classes else label for label in y_train_balanced]
y_test = ['Other' if label in rare_classes else label for label in y_test]


In [None]:
# PART 11: Optional - Using LLM for Insights and Improvements
print("\nUsing LLM for Model and Feature Insights...")
# Requesting LLM analysis on model performance
performance_prompt = f"The F1 Score is {f1}, Precision is {precision}, Recall is {recall}. What hyperparameters should be tuned to improve this RandomForest model?"
try:
    llm_performance_feedback = llm.generate([performance_prompt])[0]['choices'][0]['message']['content']
    print("LLM Analysis Feedback:", llm_performance_feedback)
except TypeError as e:
    print("[ERROR] Could not generate insights using LLM.", e)
    llm_performance_feedback = "No feedback available due to an error."
