In [8]:
import ast
import datetime
import os
import re
from getpass import getpass
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
import sqlalchemy as alch
from dotenv import load_dotenv
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Read key=value pairs from a .env file (python-dotenv) into the process
# environment so the database password does not have to live in the notebook.
load_dotenv()

True

In [10]:
# Database password taken from the `Password` environment variable.
# NOTE(review): get_engine() reads the same variable itself, so this
# module-level copy appears unused in the cells shown here — confirm.
sql_password = os.getenv("Password")

In [82]:
# Load the US job postings export.
df = pd.read_csv('../data/us_jobposts.csv', encoding='utf-8')

# Make the array column an actual array: the CSV stores each skills list as
# its Python literal text, which literal_eval turns back into a list.
df['extracted_skills'] = df['extracted_skills'].apply(ast.literal_eval)

In [83]:
def get_engine(db_name="us_jobposts"):
    """Create a SQLAlchemy engine for the local MySQL server.

    The password for the ``root`` user is read from the ``Password``
    environment variable.

    Args:
        db_name: Name of the database to connect to. Defaults to
            ``"us_jobposts"``.

    Returns:
        The engine created for ``root@localhost:3306/<db_name>`` using the
        PyMySQL driver with the utf8mb4 charset.

    Raises:
        RuntimeError: If the ``Password`` environment variable is not set.
    """
    sql_password = os.getenv("Password")
    if sql_password is None:
        # Without this check the f-string would embed the text "None" as the
        # password and the failure would only surface at connection time.
        raise RuntimeError(
            "Environment variable 'Password' is not set; cannot build the MySQL connection URL."
        )

    # Percent-encode the password so characters such as '@', ':' or '/' are
    # not mistaken for URL delimiters.
    connection_url = (
        f"mysql+pymysql://root:{quote_plus(sql_password)}@localhost:3306/{db_name}"
        "?charset=utf8mb4&use_unicode=1"
    )
    return alch.create_engine(connection_url)

In [84]:
# Inserting skills

def get_skills_with_ids(jobs=None):
    """Collect the distinct skills and number them starting at 1.

    Args:
        jobs: DataFrame whose ``extracted_skills`` column holds an iterable of
            skills per row. When omitted, the notebook-level ``df`` is used.

    Returns:
        A list of ``(id, skill)`` tuples. Falsy skills (for example empty
        strings) are left out. Skills are sorted before numbering so the same
        input yields the same ids on every run; plain set iteration order is
        not stable between kernel sessions.
    """
    if jobs is None:
        jobs = df

    unique_skills = {
        skill
        for skills_lst in jobs['extracted_skills']
        for skill in skills_lst
        if skill
    }

    # key=str keeps the sort from failing should a non-string value slip in.
    ordered_skills = sorted(unique_skills, key=str)
    return [(skill_id, skill) for skill_id, skill in enumerate(ordered_skills, start=1)]


# Create the engine before the first insert; the insert cells further down
# use this same notebook-level `engine` name.
engine = get_engine()

skills_with_ids = get_skills_with_ids()

# Create dataframe to make it easier to insert
df_skills_with_ids = pd.DataFrame(skills_with_ids, columns=['id', 'skill'])

# Load skills into its own table.
# if_exists="append" adds to an existing table, so running this cell again
# inserts the same rows a second time.
df_skills_with_ids.to_sql("skills", if_exists="append", con=engine, index=False)

131

In [86]:
# Inserting jobs
# The list-valued 'extracted_skills' column is left out of the jobs table;
# the skills are written through the skills / jobs_x_skills inserts instead.
# drop() returns a new frame, so `df` itself keeps the column for later cells.
df_for_insert = df.drop(columns=['extracted_skills'])
df_for_insert.to_sql("jobs", if_exists="append", con=engine, index=False)

19915

In [87]:
# Insert jobs x skills

# skill name -> id, built from the same (id, skill) pairs that were inserted
# into the skills table.
skills_map = {skill: skill_id for skill_id, skill in skills_with_ids}

job_id_to_skills_lst = []

for job_id, skills in zip(df['job_id'], df['extracted_skills']):
    for skill in skills:
        # get_skills_with_ids() drops falsy skills, so they have no id to link.
        if not skill:
            continue
        skill_id = skills_map.get(skill)
        if skill_id is None:
            # Report the unmapped skill but keep the posting's remaining skills.
            print(skill)
            continue
        job_id_to_skills_lst.append((job_id, skill_id))

df_job_id_to_skills = pd.DataFrame(job_id_to_skills_lst, columns=['job_id', 'skill_id'])
df_job_id_to_skills.to_sql("jobs_x_skills", if_exists="append", con=engine, index=False)

62391

## Skill demand analysis

In [None]:
# 1. Count the most in-demand skills
# NOTE(review): `skills_df` is not defined in any visible cell, so a fresh
# kernel raises NameError here. It is bound to `df` because that is the only
# frame in scope with a list-valued 'extracted_skills' column — confirm this
# is the frame the analysis was meant to use.
skills_df = df

# explode() yields one row per listed skill; value_counts() then tallies them.
skills_demand = skills_df['extracted_skills'].explode().value_counts()
skills_demand

In [None]:
# 3. Frequency of skill sets for Junior level
# Keep the skills of postings whose role is 'Business Analyst Junior',
# flatten the lists to one skill per row, and count each skill.
ba_junior_skills = (
    skills_df
    .loc[skills_df['role'] == 'Business Analyst Junior', 'extracted_skills']
    .explode()
    .value_counts()
)
ba_junior_skills.head()

In [None]:
# NOTE(review): this cell repeats the junior-level computation from the cell
# directly above (same filter, same variable name); it looks like a copy-paste
# leftover — confirm and delete.
ba_junior_skills = skills_df[skills_df['role'] == 'Business Analyst Junior']['extracted_skills'].explode().value_counts()
ba_junior_skills.head()

In [None]:
# Skill frequency for postings whose experience level is 'Mid-Senior'.
mid_senior_skills = (
    skills_df
    .loc[skills_df['experience_level'] == 'Mid-Senior', 'extracted_skills']
    .explode()
    .value_counts()
)
mid_senior_skills.head(5)

In [None]:
# Skill frequency for postings whose experience level is 'Senior'.
senior_skills = (
    skills_df
    .loc[skills_df['experience_level'] == 'Senior', 'extracted_skills']
    .explode()
    .value_counts()
)
senior_skills.head(5)

## Change dataset: Spain job postings

In [None]:
# Rebinds `df` to the Spain postings; from this cell on `df` no longer refers
# to the US data loaded earlier. 'extracted_skills' is still raw text here.
df = pd.read_csv('../data/spain_jobposts.csv')

#### Keyword Clustering and Skill Grouping:

Create clusters or categories of skills that often appear together in job postings.

In [None]:
# Keyword clustering and skill grouping: cluster the postings with K-means on
# TF-IDF features built from each posting's 'extracted_skills'.

# Process the 'extracted_skills' column
# Convert the string representation of lists into actual lists. Values that
# are no longer strings are left as they are, so re-running this cell on an
# already-parsed `df` does not fail inside literal_eval.
df['extracted_skills'] = df['extracted_skills'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# Join the lists of skills into a single string per row
# NOTE(review): TfidfVectorizer then re-tokenizes this text with its default
# settings, which presumably splits multi-word skills into separate words and
# ignores one-character tokens (a skill such as "R") — confirm that is intended.
df['skills_str'] = df['extracted_skills'].apply(' '.join)

# Apply TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['skills_str'])

# Apply K-means clustering
num_clusters = 5  # Adjust the number of clusters as needed
# n_init is spelled out because its default is not the same in every
# scikit-learn release; random_state keeps the run repeatable.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(X)

# Assign the cluster labels to the DataFrame
df['cluster'] = kmeans.labels_


In [None]:
# Preview the first rows, including the new 'skills_str' and 'cluster'
# columns, instead of rendering the entire frame.
df.head()

In [None]:
# Apply PCA to reduce dimensions to 2D for visualization.
# X is the sparse TF-IDF matrix from the clustering cell; it is densified
# before being handed to PCA. random_state matches the K-means seed so the
# projection is repeatable if a randomized solver is selected.
pca = PCA(n_components=2, random_state=42)
reduced_X = pca.fit_transform(X.toarray())

# Plotting: one point per posting, coloured by its K-means cluster label.
plt.figure(figsize=(10, 8))
plt.scatter(reduced_X[:, 0], reduced_X[:, 1], c=df['cluster'], cmap='viridis', marker='o')
plt.title('Skill Clusters')
plt.xlabel('PCA Feature 1')
plt.ylabel('PCA Feature 2')
plt.colorbar(label='Cluster')
plt.show()

In [None]:
# 4. Geographical distribution of job postings
# Number of postings per distinct 'location' value, most frequent first.
location_distribution = df['location'].value_counts()
location_distribution

In [None]:
# 5. Ranking companies by number of job postings
# Number of postings per 'company_name', most frequent first.
company_ranking = df['company_name'].value_counts()
company_ranking

In [None]:
# 6. Average number of skills required per company
# Count the skills listed on each posting (stored on `df` as 'num_skills'),
# then average that count within each company.
df['num_skills'] = df['extracted_skills'].map(len)
avg_skills_company = df.groupby('company_name')['num_skills'].mean()



In [None]:
# 7. Distribution of job roles
# Number of postings per 'role'. The result is only assigned, so this cell
# renders no output.
role_distribution = df['role'].value_counts()

In [None]:
# 8. Experience levels required for each role
# Count of each 'experience_level' value within every 'role' group. The
# result is only assigned, so this cell renders no output.
experience_role = df.groupby('role')['experience_level'].value_counts()