# Load Data

In [25]:
import pandas as pd
import pickle

def get_upwork_job_descriptions():
    """Return the 'description' column of 'upwork-jobs.csv' as a Python list.

    The CSV is read from the current working directory on every call.
    """
    return pd.read_csv('upwork-jobs.csv')['description'].tolist()

def get_fiverr_job_descriptions():
    """Return the 'name' column of 'fiverr.csv' as a Python list.

    The CSV is read from the current working directory on every call.
    """
    return pd.read_csv('fiverr.csv')['name'].tolist()

def get_fiverr_dictionary():
    """Load and return the object pickled in 'fiverr-dictionary.pkl'.

    The file is read from the current working directory and is closed as
    soon as unpickling finishes, including when unpickling raises.

    Returns:
        The unpickled object (presumably a gensim Dictionary; confirm
        against whatever wrote the file).
    """
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by a trusted run of this project.
    with open('fiverr-dictionary.pkl', 'rb') as pickle_file:
        return pickle.load(pickle_file)

def get_fiverr_corpus():
    """Load and return the object pickled in 'fiverr-corpus.pkl'.

    The file is read from the current working directory and is closed as
    soon as unpickling finishes, including when unpickling raises.

    Returns:
        The unpickled object (presumably a bag-of-words corpus; confirm
        against whatever wrote the file).
    """
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by a trusted run of this project.
    with open('fiverr-corpus.pkl', 'rb') as pickle_file:
        return pickle.load(pickle_file)

def get_upwork_dictionary():
    """Load and return the object pickled in 'upwork-dictionary.pkl'.

    The file is read from the current working directory and is closed as
    soon as unpickling finishes, including when unpickling raises.

    Returns:
        The unpickled object (the preprocessing cell writes a gensim
        Dictionary to this path).
    """
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by a trusted run of this project.
    with open('upwork-dictionary.pkl', 'rb') as pickle_file:
        return pickle.load(pickle_file)

def get_upwork_corpus():
    """Load and return the object pickled in 'upwork-corpus.pkl'.

    The file is read from the current working directory and is closed as
    soon as unpickling finishes, including when unpickling raises.

    Returns:
        The unpickled object (the preprocessing cell writes a list of
        doc2bow results to this path).
    """
    # NOTE: unpickling can execute arbitrary code; only load files produced
    # by a trusted run of this project.
    with open('upwork-corpus.pkl', 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Preprocessing

In [24]:
import re
import pandas as pd
import spacy
from gensim.models.phrases import Phrases, Phraser
from collections import Counter
from tqdm import tqdm
from gensim.corpora import Dictionary
import pickle


# Load spaCy model
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Define preprocessing function
def preprocess(text):
    """
    Turn one raw document into a list of lemmas.

    Steps, in order:
    - Lowercasing
    - Deleting punctuation (any character that is neither a word character
      nor whitespace) and runs of digits
    - Tokenizing with the module-level spaCy pipeline `nlp`
    - Dropping stopwords and tokens of three characters or fewer
    - Lemmatizing the tokens that remain

    Args:
        text: The document text. Non-string values (for example None, or
            the float NaN pandas yields for an empty CSV cell) are treated
            as an empty document instead of raising AttributeError.

    Returns:
        A list of lemma strings in document order; [] for non-string input.
    """
    # Missing descriptions arrive as NaN/None; keep the document slot but
    # give it no tokens so one bad row cannot abort a long run.
    if not isinstance(text, str):
        return []

    # Lowercase text
    text = text.lower()

    # Remove punctuation and numbers.
    # NOTE: characters are deleted, not replaced by a space, so words joined
    # only by punctuation are fused (e.g. "front-end" -> "frontend").
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Use spaCy for tokenization and lemmatization
    doc = nlp(text)
    # The length test is applied to the token itself, not to its lemma.
    words = [token.lemma_ for token in doc if len(token) > 3 and not token.is_stop]

    return words

# Define function to create a final bag of words including unigrams and bigrams
def create_bag_of_words(words, bigram_phraser):
    """
    Count the tokens of one document together with its phrased form.

    `bigram_phraser[words]` is appended to the original `words` list and the
    combined sequence is tallied, so a token that appears in both lists is
    counted once for each appearance.

    Returns a collections.Counter mapping token -> count.
    """
    phrased_tokens = bigram_phraser[words]
    combined_tokens = words + phrased_tokens
    return Counter(combined_tokens)

# Set the sample texts. get_upwork_job_descriptions comes from the data-loading
# cell, which has to be executed before this one.
sample_texts = get_upwork_job_descriptions()

# Preprocess all sample texts with a progress bar
preprocessed_corpus = [preprocess(text) for text in tqdm(sample_texts)]

print("Preprocessing done")

# Train bigram model on entire corpus
phrases = Phrases(preprocessed_corpus, min_count=30, threshold=30)  # Adjusted for larger corpus size
bigram_phraser = Phraser(phrases)

print("Bigram model trained")

# Process each preprocessed text and create a list of bags of words
lda_ready_corpus = [create_bag_of_words(cleaned_text, bigram_phraser) for cleaned_text in tqdm(preprocessed_corpus)]

print("Corpus created")

# Create a dictionary representation of the documents
dictionary = Dictionary(lda_ready_corpus)

print("Dictionary created")

# Filter out words that occur in less than 5% of the documents or more than 50% of the documents.
# filter_extremes takes no_below as an ABSOLUTE number of documents and no_above
# as a FRACTION of the corpus, so the 5% lower bound has to be converted to a
# document count first (passing 0.05 directly disables the lower bound).
MIN_DOC_FRACTION = 0.05
MAX_DOC_FRACTION = 0.5
min_doc_count = max(1, int(MIN_DOC_FRACTION * dictionary.num_docs))
dictionary.filter_extremes(no_below=min_doc_count, no_above=MAX_DOC_FRACTION)

# NOTE(review): each doc here is a Counter, and doc2bow tallies tokens by
# iterating its argument, so it presumably sees every token once per document
# (presence rather than frequency). Confirm that this is intended.
corpus = [dictionary.doc2bow(doc) for doc in lda_ready_corpus]

print("Corpus filtered")

# Context managers guarantee both pickles are flushed and closed before any
# later cell reads them back.
with open('upwork-corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
with open('upwork-dictionary.pkl', 'wb') as dictionary_file:
    pickle.dump(dictionary, dictionary_file)



100%|██████████| 53058/53058 [08:22<00:00, 105.62it/s]


Preprocessing done
Bigram model trained


100%|██████████| 53058/53058 [00:02<00:00, 21096.92it/s]


Corpus created
Dictionary created
Corpus filtered


# LDA Modeling

In [26]:
from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Reload the dictionary and bag-of-words corpus from their pickle files
dictionary = get_upwork_dictionary()
corpus = get_upwork_corpus()

# Training settings
n_topics = 8        # how many topics to extract
random_state = 42   # fixed seed so repeated runs give the same model
passes = 50         # number of full sweeps over the corpus

# Fit the LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics,
                     random_state=random_state, passes=passes)

# Build the pyLDAvis data structure for the fitted model
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)

# Render the interactive visualization as this cell's output
pyLDAvis.display(lda_vis)



  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
