In [4]:
import nltk
import string
import re
import inflect
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import pandas as pd
from collections import Counter
import spacy
import requests
import io
import csv

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/louezetheianilicirsaldua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/louezetheianilicirsaldua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/louezetheianilicirsaldua/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/louezetheianilicirsaldua/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Functions for text cleaning and preprocessing
def importdata(url):
    response = requests.get(url)
    response.raise_for_status()
    raw_text = response.text

    # Requote rows
    rows = []
    reader = csv.reader(io.StringIO(raw_text), delimiter=',', quotechar='"', skipinitialspace=True)
    max_cols = 0
    for row in reader:
        if len(row) > max_cols:
            max_cols = len(row)
        rows.append(row)

    # Pad rows so all have equal length
    for row in rows:
        if len(row) < max_cols:
            row += [None] * (max_cols - len(row))

    df = pd.DataFrame(rows[1:], columns=rows[0])
    print(f"Loaded DataFrame with {len(df)} rows and {len(df.columns)} columns.")
    return df

    return raw_text

def text_cleaning(text):
    text = text.lower()                 # Force lowercase
    text = re.sub(r'\d+', '', text)     # Remove numbers
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)   # Remove punctuation
    text = " ".join(text.split())       # Rem whitespace

    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    text = [word for word in word_tokens if word.lower()
                     not in stop_words] # Rem stop words

    word_tokens = word_tokenize(text)   # Tokenize
    text = [stemmer.stem(word) for word in word_tokens]

    word_tokens = word_tokenize(text)   # Lemmatization
    text = [lemmatizer.lemmatize(word) for word in word_tokens]

    # Create new column for categorized job positions
    text["categorized_title"] = text["title"]
    
def populate_categ_title(title):
    doc = nlp(title)
    nouns = [token.text.lower() for token in doc if token.pos_ in ['NOUN', 'PROPN']]
    if not nouns:
        return 'other'
    # Return most frequent noun
    return Counter(nouns).most_common(1)[0][0]


In [6]:
url = "https://raw.githubusercontent.com/MaharLeika18/Data-Mining---Python/refs/heads/Loue/Midterms_Act1/MidtermsExam/Jobs.csv"
df = importdata(url)
print(df.columns)
print(df.head())

Loaded DataFrame with 790 rows and 5 columns.
Index(['',
       'title                                                                   ',
       'company                                                   ',
       'announcement                                               ',
       'description'],
      dtype='object')
     title                                                                     \
0  0  Senior Analyst, Data Science and Analytics    ...                         
1  1  Senior Data Scientist                         ...                         
2  2  Lead Data Science Analyst                     ...                         
3  3  Data Science Intern                           ...                         
4  4  Data Scientist                                ...                         

  company                                                     \
0  TransUnion                                    ...           
1  Grubhub Holdings, Inc.                        ...         

In [None]:
df = text_cleaning()

In [None]:
df['categorized_title'] = df['title'].apply(populate_categ_title)