In [6]:
# Data Structures
import numpy  as np
import pandas as pd
#import geopandas as gpd
import json

# Corpus Processing
import re
# import nltk.corpus
from unidecode                        import unidecode
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize                    import word_tokenize
from nltk import pos_tag
from nltk                             import SnowballStemmer

from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing            import normalize

# K-Means
from sklearn import cluster

# Visualization and Analysis
import matplotlib.pyplot  as plt
import matplotlib.cm      as cm
import seaborn            as sns
from sklearn.metrics                  import silhouette_samples, silhouette_score
from wordcloud                        import WordCloud

In [1]:
import os

# Project root against which every relative data path in this notebook is
# resolved. Set the JOB_DISCRIMINATION_ROOT environment variable to run the
# notebook from another checkout; the fallback is the original local path, so
# existing runs behave exactly as before.
PROJECT_ROOT = os.environ.get(
    "JOB_DISCRIMINATION_ROOT",
    r"c:\Users\britt\Desktop\YH\Applicerad AI\job_discrimination",
)
os.chdir(PROJECT_ROOT)

In [2]:
def read_file(file_name):
    """
    Read the UTF-8 text file at `file_name` and return its whole content
    as a single string (not a list of lines).
    """
    with open(file_name, mode="r", encoding="utf-8") as text_file:
        return text_file.read()

In [3]:
# Directory holding the labeled job bulletins (relative to the working directory).
LABELED_BULLETINS_DIR = "data/cleaned_data/job_bulletins/labeled"

# Load every bulletin as one single-line string.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# keep the document index (row number in the later TF-IDF table) reproducible
# across machines and runs.
corpus = []
for bulletin in sorted(os.listdir(LABELED_BULLETINS_DIR)):
    document = read_file(f"{LABELED_BULLETINS_DIR}/{bulletin}").replace("\n", " ")
    corpus.append(document)

In [5]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """
    Map a Penn Treebank POS tag to the matching WordNet POS constant.

    The decision is made on the first letter of the tag: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag (including an
    empty one) yields the empty string.
    """
    initial = treebank_tag[:1]
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'V':
        return wordnet.VERB
    if initial == 'N':
        return wordnet.NOUN
    if initial == 'R':
        return wordnet.ADV
    # No WordNet equivalent for this tag.
    return ''

def penn_to_wn(tag):
    """Convert a Penn Treebank tag to a WordNet POS by delegating to get_wordnet_pos."""
    wordnet_pos = get_wordnet_pos(tag)
    return wordnet_pos

In [7]:
# NOTE: despite its name, `stemmer` is a WordNet lemmatizer, not a stemmer.
# The name is kept because other cells may refer to it.
stemmer = WordNetLemmatizer()

# Built once: calling stopwords.words("english") for every token is very slow,
# and a set gives constant-time membership checks.
ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))

# Compiled once instead of on every loop iteration.
WHITESPACE_PATTERN = re.compile(r'\s+')


def preprocess_document(document):
    """
    Clean one raw document and return its lemmas joined by single spaces.

    Steps: transliterate to ASCII, drop URLs and "<word>.com" mentions,
    replace every run of non-letters with a space, collapse whitespace,
    lowercase, tokenize, POS-tag, remove English stopwords, and lemmatize
    each remaining token (using its WordNet POS when one is available).

    Parameters:
        document: raw text of one document (str).

    Returns:
        A str containing the space-separated lemmas.
    """
    # Transliterate BEFORE stripping non-letters. Previously unidecode ran
    # after the [^a-zA-Z]+ substitution, where it had nothing left to do and
    # accented letters had already been replaced by spaces (splitting words).
    text = unidecode(document)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\ [A-Za-z]*\.com", " ", text)
    text = re.sub(r"[^a-zA-Z]+", " ", text)
    text = WHITESPACE_PATTERN.sub(' ', text)
    text = text.lower()

    lemmas = []
    for token, treebank_tag in pos_tag(word_tokenize(text)):
        if token in ENGLISH_STOPWORDS:
            continue
        wn_pos = penn_to_wn(treebank_tag)
        if wn_pos != "":
            lemmas.append(stemmer.lemmatize(token, wn_pos))
        else:
            # No WordNet POS for this tag: use the lemmatizer's default POS.
            lemmas.append(stemmer.lemmatize(token))
    return " ".join(lemmas)


# One cleaned, lemmatized string per document, in the same order as `corpus`.
preprocessed_corpus = [preprocess_document(document) for document in corpus]

In [8]:
# Fit a TF-IDF model on the preprocessed documents.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(preprocessed_corpus)

# Dense table of the TF-IDF matrix with one column per vocabulary term.
tf_idf = pd.DataFrame(
    data=X.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# `final_df` is another name for the same DataFrame object (no copy is made).
final_df = tf_idf

print(f"{final_df.shape[0]} rows")
# Transposed view: the five terms with the largest weights in column 0 (the first document).
final_df.T.nlargest(5, 0)

177 rows


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,167,168,169,170,171,172,173,174,175,176
call,0.525281,0.0,0.0,0.0,0.022623,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
center,0.404087,0.0,0.0,0.026467,0.0,0.018103,0.0,0.0,0.0,0.203857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
director,0.182525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.197318,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
city,0.171737,0.140959,0.118225,0.118109,0.073225,0.051931,0.106779,0.168511,0.131189,0.129959,...,0.13927,0.143372,0.135582,0.150529,0.123711,0.097963,0.149205,0.177291,0.124283,0.082726
least,0.157869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.024994,0.0,0.0,0.0,0.0,0.0


In [9]:
# Preview the first five rows of the TF-IDF table.
final_df.head(n=5)

Unnamed: 0,ab,abatement,abbreviation,abilities,ability,able,absence,absorption,abuse,ac,...,writing,xv,xvi,yard,year,yearly,yield,youth,zero,zone
0,0.0,0.0,0.0,0.0,0.019661,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.068695,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.020851,0.0,0.074444,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.030206,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.015923,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.034772,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.017471,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.050618,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.016849,0.037258,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.032544,0.0,0.0,0.0,0.0,0.0
