In [1]:
import nltk
import string
import re
import inflect
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import pandas as pd
from collections import Counter
import spacy
import requests
import io
import csv
import matplotlib.pyplot as plt

# Fetch the NLTK data used by the text-cleaning helpers: tokenizer models,
# the stopword lists and WordNet. 'punkt_tab' is requested alongside 'punkt'
# because recent NLTK releases load the tokenizer from that resource instead;
# asking for both keeps a fresh kernel working on either version.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource, quiet=True)

# Shared, module-level NLP objects reused by the functions defined below.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# NOTE(review): `nlp` is not referenced by any cell visible here -- confirm it
# is needed before paying the model-load cost on every run.
nlp = spacy.load("en_core_web_sm")

In [2]:
def importdata(url, timeout=30):
    """Download a CSV document over HTTP and load it into a DataFrame.

    The text is parsed with the ``csv`` module rather than ``pd.read_csv`` so
    that ragged rows are tolerated: every row (the header included) is
    right-padded with ``None`` up to the width of the widest row. The first
    row supplies the column names and all cell values stay as strings.

    Parameters
    ----------
    url : str
        Address of the CSV file.
    timeout : float or tuple or None, default 30
        Passed to ``requests.get`` so a stalled server cannot hang the
        notebook indefinitely. Use ``None`` to wait without limit.

    Returns
    -------
    pandas.DataFrame
        One row per CSV record after the header. An empty response body
        yields an empty DataFrame instead of raising.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Parse CSV
    reader = csv.reader(io.StringIO(response.text), delimiter=',', quotechar='"', skipinitialspace=True)
    rows = list(reader)

    if not rows:
        # Nothing to parse: max() over an empty sequence would raise ValueError.
        df = pd.DataFrame()
    else:
        # Pad short rows so every record has the same number of fields.
        max_cols = max(len(r) for r in rows)
        padded = [r + [None] * (max_cols - len(r)) for r in rows]
        df = pd.DataFrame(padded[1:], columns=padded[0])

    print(f"Loaded DataFrame: {len(df)} rows × {len(df.columns)} columns")
    return df

# Lazily-built cache of the English stopword list (filled on first use).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text):
    """Normalise free text into a space-joined string of processed tokens.

    Steps, in order: lowercase, drop digit runs, collapse whitespace, strip
    ASCII punctuation, tokenize, remove English stopwords, Porter-stem each
    token, then pass each stem through the WordNet lemmatizer.

    Parameters
    ----------
    text : object
        Value to clean. Anything that is not a ``str`` is treated as missing.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``""`` for non-string
        input.
    """
    if not isinstance(text, str):
        return ""

    # Handle text formatting and non-text values
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text)
    # Punctuation is deleted, not replaced by a space, so hyphenated words
    # are fused ("data-driven" -> "datadriven").
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize, remove stopwords, stem + lemmatize.
    # The stopword set is fetched once; the previous version rebuilt and
    # linearly scanned the list for every single token.
    stop_set = _english_stopwords()
    tokens = [t for t in word_tokenize(text) if t not in stop_set]
    tokens = [stemmer.stem(t) for t in tokens]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]

    return " ".join(tokens)

# Categorize jobs into umbrella titles for easier processing & reading.
# Rules are tried top to bottom and the first pattern that matches wins, so
# the order below is significant.
# NOTE(review): because 'engineer' and 'analyst' are tested before the
# machine-learning and business-intelligence rules, a title such as
# "Machine Learning Engineer" is labelled 'Data Engineer'. That precedence is
# kept as-is here; confirm whether it is intended.
_TITLE_RULES = (
    (re.compile(r'\bscientist(s)?\b'), 'Data Scientist'),
    (re.compile(r'\banalyst(s)?\b'), 'Data Analyst'),
    (re.compile(r'\bengineer(s)?\b'), 'Data Engineer'),
    (re.compile(r'\bcloud(s)?\b'), 'Cloud Computing'),
    (re.compile(r'\bintern(ship|ships)?\b'), 'Data Science Intern'),
    (re.compile(r'\bspecialist(s)?\b'), 'Data Specialist'),
    (re.compile(r'\bmachine learning\b|\bml\b'), 'Machine Learning Engineer'),
    (re.compile(r'\bbusiness intelligence\b|\bbi\b'), 'Business Intelligence Analyst'),
    (re.compile(r'\bconsultant\b'), 'Data Consultant'),
    (re.compile(r'\bmanager\b'), 'Data Manager'),
)


def categorize_title(title):
    """Map a raw job title onto one of a fixed set of umbrella categories.

    The title is converted with ``str()`` and lowercased, then matched
    against the whole-word patterns in ``_TITLE_RULES``.

    Parameters
    ----------
    title : object
        Raw job title. Non-string values are stringified first, so a
        missing value simply falls through to the default category.

    Returns
    -------
    str
        The label of the first matching rule, or
        ``'Other Data Science Role'`` when no rule matches.
    """
    title = str(title).lower()

    for pattern, label in _TITLE_RULES:
        if pattern.search(title):
            return label
    return 'Other Data Science Role'



In [3]:
# Location of the raw job-postings CSV.
url = "https://raw.githubusercontent.com/MaharLeika18/Data-Mining---Python/refs/heads/Loue/Midterms_Act1/MidtermsExam/Jobs.csv"

# Download the postings, then in one step add the umbrella category derived
# from each title and replace each description with its cleaned form.
df = importdata(url)
df = df.assign(
    categorized_title=df["title"].apply(categorize_title),
    description=df["description"].apply(clean_text),
)

Loaded DataFrame: 790 rows × 5 columns


In [None]:
# Show the column index, then the first 20 raw titles next to the category
# each one was assigned.
print(df.columns)
title_preview = df[["title", "categorized_title"]].head(20)
print(title_preview)

Index(['', 'title', 'company', 'announcement', 'description',
       'categorized_title'],
      dtype='object')
                                                title        categorized_title
0          Senior Analyst, Data Science and Analytics             Data Analyst
1                               Senior Data Scientist           Data Scientist
2                           Lead Data Science Analyst             Data Analyst
3                                 Data Science Intern      Data Science Intern
4                                      Data Scientist           Data Scientist
5           Senior Solutions Architect (Data Science)  Other Data Science Role
6                               Senior Data Scientist           Data Scientist
7   Data Scientist - Research, Development & Const...           Data Scientist
8                 Data Science Manager - S&A Strategy             Data Manager
9                     Senior/Principal Data Scientist           Data Scientist
10                

In [9]:
# Export dataframe as csv.
# index=False stops pandas from writing its own row index as one more
# unnamed leading column; the frame already carries the unnamed '' column
# that came from the source file, and a second one would pile up on each
# load/export round trip.
df.to_csv('ProcessedJobs.csv', index=False)