In [1]:
import os
import pandas as pd
import re
from collections import Counter, defaultdict

In [2]:
# Examine the data folder: list the categories, count their .txt files and
# preview the beginning of one file per category to learn the file format.
data_folder = "data"

# List the categories in the data folder (only sub-folders count as categories)
categories = [folder for folder in os.listdir(data_folder) if os.path.isdir(os.path.join(data_folder, folder))]

# List the .txt files of each category once; the counts and the preview below
# both reuse this listing instead of scanning every directory twice.
category_files = {
    category: [file for file in os.listdir(os.path.join(data_folder, category)) if file.endswith(".txt")]
    for category in categories
}

# Count the number of .txt files in each category
category_counts = {category: len(txt_files) for category, txt_files in category_files.items()}

print("Categories:", categories)
print("Number of .txt files in each category:", category_counts)

# Check the format of txt files
file_content = {}

for category, txt_files in category_files.items():
    # A category without any .txt file has nothing to preview; skipping it avoids
    # the StopIteration that next() would raise on an empty listing.
    if not txt_files:
        continue

    # Get the first .txt file in the category (in os.listdir order, which is arbitrary)
    first_file = txt_files[0]

    with open(os.path.join(data_folder, category, first_file), 'r', encoding='utf-8', errors='ignore') as f:
        # Read the first 500 characters of the file for inspection
        file_content[category] = f.read(500)

print("\nFirst .txt file from each category:")
for category, content in file_content.items():
    print(f"\nCategory: {category}\n{'-' * 40}\n{content}\n{'-' * 40}")
    


Categories: ['Sales', 'Accounting_Finance', 'Healthcare_Nursing', 'Engineering']
Number of .txt files in each category: {'Sales': 156, 'Accounting_Finance': 191, 'Healthcare_Nursing': 198, 'Engineering': 231}

First .txt file from each category:

Category: Sales
----------------------------------------
Title: Estate Agency Senior Sales Negotiator
Webindex: 72444142
Company: ESTATE AGENCY PROFESSIONALS
Description: Senior Sales Negotiator Location : Shoreditch Salary : **** depending on experience OTE ********k  car allowance Company : Bridge Shoreditch Job Type : Permanent Brilliant proven opportunity for a confident and experienced Sales Negotiator / Senior Sales Negotiator to join a well established independent estate agent with an excellent reputation and great office atmosphere. As a senio
----------------------------------------

Category: Accounting_Finance
----------------------------------------
Title: Commercial Insurance Underwriter
Webindex: 69092773
Company: Bond Search Sel

In [None]:
# Directory containing one subdirectory per job category
base_directory = 'data'

# Get subdirectories and their count. Only real directories are kept: a stray
# file in the base directory would otherwise make the nested os.listdir() fail.
subdirectories = [name for name in os.listdir(base_directory)
                  if os.path.isdir(os.path.join(base_directory, name))]
print(f'Total sub categories: {len(subdirectories)}')

# Get the path of every .txt job ad in every subdirectory
file_paths = [os.path.join(base_directory, subdirectory, filename)
              for subdirectory in subdirectories
              for filename in os.listdir(os.path.join(base_directory, subdirectory))
              if filename.endswith('.txt')]
print(f'Total job ads: {len(file_paths)}')


def parse_job_ad(text, source='<unknown>'):
    """Parse the "Key: value" lines of one job ad into a dictionary.

    Parameters
    ----------
    text : str
        Content of a job ad file, one "Key: value" pair per line.
    source : str, optional
        Name of the file the text came from; only used in error messages.

    Returns
    -------
    dict
        Maps each key to its value. Only the first ": " of a line separates
        key from value, so values may themselves contain ": ".

    Raises
    ------
    ValueError
        If a non-blank line contains no ": " separator.
    """
    record = {}
    for line in text.split("\n"):
        if not line.strip():
            continue  # blank lines carry no field
        key, separator, value = line.partition(": ")
        if not separator:
            raise ValueError(f"Malformed line in {source}: {line!r}")
        record[key] = value
    return record


# Read in the data files and store them as a list of dictionaries
data_records = []
for file_path in file_paths:
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read().strip()
    record = parse_job_ad(content, source=file_path)
    # The category is the name of the folder holding the file; taking it from
    # the parent directory stays correct whatever base_directory looks like.
    record['Category'] = os.path.basename(os.path.dirname(file_path))
    data_records.append(record)

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(data_records).astype({
    'Title': 'str', 
    'Webindex': 'int', 
    'Company': 'str', 
    'Description': 'str'
})

# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that would print a stray "None").
df.info()
df.head()

Total sub categories: 4
Total job ads: 776
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 776 entries, 0 to 775
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        776 non-null    object
 1   Webindex     776 non-null    int64 
 2   Company      776 non-null    object
 3   Description  776 non-null    object
 4   Category     776 non-null    object
dtypes: int64(1), object(4)
memory usage: 30.4+ KB
None


Unnamed: 0,Title,Webindex,Company,Description,Category
0,Estate Agency Senior Sales Negotiator,72444142,ESTATE AGENCY PROFESSIONALS,Senior Sales Negotiator Location : Shoreditch ...,Sales
1,Export Sales Executive (French & German Fluency),68687567,Next Recruitment Ltd,Previous experience in a similar role is essen...,Sales
2,GRADUATE SALES ENGINEER,68257980,BMS Sales Specialists LLP,GRADUATE SALES ENGINEER Sector: Engineering Co...,Sales
3,Sales Representative / Lead Generator,71168766,BlueTown Online,Job Title: Sales Representative / Lead Generat...,Sales
4,Search Recruitment Consultant Media and Techn...,72441930,Fresh Partnership,Search Recruitment Consultant Media and Techn...,Sales


In [4]:
# Tokenization and processing: builds df['Tokenized Description'] from
# df['Description'] through the numbered cleaning steps below, saves the
# result and derives the vocabulary.
token_pattern = r"[a-zA-Z]+(?:[-'][a-zA-Z]+)?"  # Regular expression for word tokenization

# 2. Tokenize each job advertisement description using the specified regular expression
# 3. Convert all words to lower case
df['Tokenized Description'] = df['Description'].apply(lambda x: [word.lower() for word in re.findall(token_pattern, x)])

# 4. Remove words with length less than 2
df['Tokenized Description'] = df['Tokenized Description'].apply(lambda x: [word for word in x if len(word) > 1])

# 5. Remove stopwords using the provided stopwords list
# The encoding is explicit so the result does not depend on the platform default,
# and entries are stripped so trailing whitespace cannot prevent a match.
with open("stopwords_en.txt", "r", encoding="utf-8") as f:
    stopwords = {line.strip() for line in f.read().splitlines()}
df['Tokenized Description'] = df['Tokenized Description'].apply(lambda x: [word for word in x if word not in stopwords])

# 6. Remove words that appear only once based on term frequency
term_frequencies = Counter(word for words in df['Tokenized Description'] for word in words)
df['Tokenized Description'] = df['Tokenized Description'].apply(lambda x: [word for word in x if term_frequencies[word] > 1])

# 7. Remove the top 50 most frequent words based on document frequency
# (document frequency = number of descriptions containing the word at least once)
document_frequencies = Counter(word for words in df['Tokenized Description'] for word in set(words))
# Rank by descending document frequency and break ties alphabetically.
# Counter.most_common() breaks ties by insertion order, which here would come
# from set iteration order and can differ between kernel sessions, making the
# removed words (and therefore the vocabulary) non-reproducible.
ranked_words = sorted(document_frequencies.items(), key=lambda item: (-item[1], item[0]))
top_50_words = {word for word, freq in ranked_words[:50]}
df['Tokenized Description'] = df['Tokenized Description'].apply(lambda x: [word for word in x if word not in top_50_words])

# 8. Save all job advertisement text and information
df.to_csv("processed_job_ads.csv", index=False)

# 9. Build a vocabulary of the cleaned job advertisement descriptions
# Words are sorted alphabetically and numbered from 0 in that order.
vocabulary = sorted(set(word for words in df['Tokenized Description'] for word in words))
vocab_dict = {word: idx for idx, word in enumerate(vocabulary)}

In [None]:
# 1. Write the unigram vocabulary to vocab.txt, one "word:index" entry per line
vocab_file_path = "vocab.txt"
with open(vocab_file_path, 'w') as vocab_file:
    vocab_file.writelines(f"{term}:{position}\n" for term, position in vocab_dict.items())