# Upwork Job Postings Dataset Analysis

## Dataset Overview

In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
df = pd.read_csv('/kaggle/input/upwork-job-postings-dataset-2024-50k-records/upwork-jobs.csv')
print(df.head())

                                               title  \
0  Experienced Media Buyer For Solar Pannel and R...   
1                               Full Stack Developer   
2                                    SMMA Bubble App   
3             Talent Hunter Specialized in Marketing   
4                                      Data Engineer   

                                                link  \
0  https://www.upwork.com/jobs/Experienced-Media-...   
1  https://www.upwork.com/jobs/Full-Stack-Develop...   
2  https://www.upwork.com/jobs/SMMA-Bubble-App_%7...   
3  https://www.upwork.com/jobs/Talent-Hunter-Spec...   
4  https://www.upwork.com/jobs/Data-Engineer_%7E0...   

                                         description  \
0  We’re looking for a talented and hardworking a...   
1  Job Title: Full Stack DeveloperWe are seeking ...   
2  I need someone to redesign my bubble.io site t...   
3  Join Our Growing Team!We are an innovative com...   
4  We are looking for a resource who can work 

## In-Demand Skills Analysis - Initial Approach

In [2]:

from collections import Counter
import re

# Function to clean and extract keywords from job titles
def extract_keywords(title):
    # Convert to lowercase
    title = title.lower()
    # Remove special characters and numbers
    title = re.sub('[^a-z\s]', '', title)
    # Split into words
    words = title.split()
    return words

# Apply the function to each title and flatten the list of lists
keywords = [keyword for title in df['title'] for keyword in extract_keywords(title)]

# Count the frequency of each keyword
keyword_counts = Counter(keywords)

# Get the most common 20 keywords
most_common_keywords = keyword_counts.most_common(20)
most_common_keywords


[('for', 15773),
 ('and', 7724),
 ('to', 5922),
 ('a', 5761),
 ('needed', 3799),
 ('website', 3579),
 ('developer', 3387),
 ('expert', 3086),
 ('with', 2906),
 ('in', 2902),
 ('video', 2829),
 ('design', 2749),
 ('designer', 2667),
 ('need', 2255),
 ('specialist', 1970),
 ('amp', 1918),
 ('of', 1834),
 ('data', 1782),
 ('looking', 1782),
 ('app', 1728)]

## In-Demand Skills Analysis - Refined Approach

In [3]:

# Define a list of common non-informative words to filter out
non_informative_words = set([
    'for', 'and', 'to', 'a', 'with', 'in', 'of', 'amp', 'the', 'is', 'on', 'at', 'as', 'by', 'we', 'are', 'our', 'be',
    'this', 'who', 'from', 'will', 'an', 'have', 'that', 'not', 'or', 'but', 'your', 'all', 'can', 'has', 'more', 'any',
    'need', 'needed', 'looking', 'expert', 'specialist', 'job', 'work', 'project', 'team'
])

# Redefine the keyword extraction function to filter out non-informative words
def extract_refined_keywords(title):
    title = title.lower()  # Convert to lowercase
    title = re.sub('[^a-z\s]', '', title)  # Remove special characters and numbers
    words = title.split()  # Split into words
    # Filter out non-informative words
    filtered_words = [word for word in words if word not in non_informative_words]
    return filtered_words

# Apply the refined extraction function to each title and flatten the list of lists
refined_keywords = [keyword for title in df['title'] for keyword in extract_refined_keywords(title)]

# Count the frequency of each refined keyword
refined_keyword_counts = Counter(refined_keywords)

# Get the most common 20 refined keywords
most_common_refined_keywords = refined_keyword_counts.most_common(20)
most_common_refined_keywords


[('website', 3579),
 ('developer', 3387),
 ('video', 2829),
 ('design', 2749),
 ('designer', 2667),
 ('data', 1782),
 ('app', 1728),
 ('media', 1700),
 ('marketing', 1684),
 ('editor', 1611),
 ('social', 1580),
 ('youtube', 1507),
 ('content', 1504),
 ('assistant', 1471),
 ('web', 1417),
 ('google', 1414),
 ('create', 1398),
 ('wordpress', 1390),
 ('manager', 1362),
 ('business', 1307)]

In [4]:
# Assuming the dataset is loaded into a DataFrame named `df`
df_fixed_price = df[df['is_hourly'] == False].dropna(subset=['budget'])

budget_quartiles = df_fixed_price['budget'].quantile([0.25, 0.5, 0.75])
def categorize_budget(budget):
    if budget <= budget_quartiles[0.25]:
        return 'low'
    elif budget <= budget_quartiles[0.5]:
        return 'medium'
    elif budget <= budget_quartiles[0.75]:
        return 'high'
    else:
        return 'very_high'
df_fixed_price['budget_category'] = df_fixed_price['budget'].apply(categorize_budget)

In [5]:
budget_quartiles = df_fixed_price['budget'].quantile([0.25, 0.5, 0.75])
df_fixed_price['budget_category'] = df_fixed_price['budget'].apply(categorize_budget)


In [6]:
# Make sure you have the 're' module imported for this step
df_fixed_price['title_cleaned'] = df_fixed_price['title'].apply(lambda x: re.sub('[^a-z\s]', '', x.lower()))
tfidf = TfidfVectorizer(stop_words='english', max_features=500)
X = tfidf.fit_transform(df_fixed_price['title_cleaned']).toarray()


In [7]:
le = LabelEncoder()
y = le.fit_transform(df_fixed_price['budget_category'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
clf = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10)
clf.fit(X_train, y_train)


In [9]:
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=le.classes_))


Accuracy: 0.3609142857142857
              precision    recall  f1-score   support

        high       0.82      0.01      0.02       818
         low       0.45      0.36      0.40      1302
      medium       0.31      0.69      0.43      1270
   very_high       0.47      0.23      0.31       985

    accuracy                           0.36      4375
   macro avg       0.51      0.32      0.29      4375
weighted avg       0.48      0.36      0.32      4375

