# Clean Offers

## Load Data

In [1]:
import pandas as pd

fileName = 'Test_Results'
excel_file = '../' + fileName + '.xlsx'

df = pd.read_excel(excel_file)

## Translate

In [2]:
import langid
from deep_translator import GoogleTranslator

In [None]:
def split_text(text, max_length=5000):
    chunks = []
    start = 0
    
    while start < len(text):
        end = min(start + max_length - 1, len(text))
        
        if end < len(text):
            split_point = end
            while split_point > start and text[split_point] not in {' ', '.'}:
                split_point -= 1
            if split_point == start:
                split_point = end
            chunks.append(text[start:split_point].strip())
            start = split_point
        else:
            chunks.append(text[start:].strip())
            start = len(text)
    
    return chunks

In [None]:
def detect_language(text):
    lang, _ = langid.classify(text)
    return lang

In [3]:
def detect_and_translate(text):
    if pd.isna(text) or text == '-':
        return text
    try:
        detected_language = detect_language(text)
        if detected_language != 'en':
            translator = GoogleTranslator(source='auto', target='en')
            translatedText = ''
            for chunk in split_text(text):
                translatedText += translator.translate(chunk)
            return translatedText
        else:
            return text
    except Exception as e:
        print(e)
        return text

In [4]:
columns_to_translate = ['Descriptions', 'Requirements']
df[columns_to_translate] = df[columns_to_translate].applymap(detect_and_translate)

  df[columns_to_translate] = df[columns_to_translate].applymap(detect_and_translate)


## Fix Line Break Issue

In [5]:
import re

def split_words(text, excluded_words=None):
    if not isinstance(text, str):
        return text
    
    if excluded_words is None:
        excluded_words = []

    placeholders = {}
    for i, word in enumerate(excluded_words):
        placeholder = f"__PLACEHOLDER_{i}__"
        placeholders[placeholder] = word
        text = text.replace(word, placeholder)

    def split_non_excluded(match):
        word = match.group(0)
        return re.sub(r'([a-z])([A-Z])|([a-zA-Z])(\d)|(\d)([a-zA-Z])', r'\1\3\5 \2\4\6', word)
    
    splitted_text = re.sub(r'\S+', split_non_excluded, text)
    
    for placeholder, original_word in placeholders.items():
        splitted_text = splitted_text.replace(placeholder, ' ' + original_word)
    
    splitted_text = re.sub(r'(\W)', r' \1 ', splitted_text)
    splitted_text = re.sub(r'\s+', ' ', splitted_text).strip()
    
    return splitted_text

In [6]:
excluded_words = ['PhD', 'phd', 'Phd', 'phD', 'PHd', 'P.h.d', 'P.h.D', 'P.H.D']
columns_to_fix = ['Descriptions', 'Requirements']

df[columns_to_fix] = df[columns_to_fix].applymap(lambda x: split_words(x, excluded_words=excluded_words))

  df[columns_to_fix] = df[columns_to_fix].applymap(lambda x: split_words(x, excluded_words=excluded_words))


## Filter Offers

In [7]:
import joblib

model = joblib.load('random_forest_model.pkl')

In [8]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_text = df['Descriptions'].apply(lambda text: ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(text)]))

In [9]:
import numpy as np
from gensim import downloader as api

# Word2Vec embeddings
word2vec_model = api.load("word2vec-google-news-300")

def sentence_to_vec(sentence):
    words = word_tokenize(sentence)
    vectors = [word2vec_model[word] for word in words if word in word2vec_model.key_to_index]
    return np.mean(vectors, axis=0) if vectors else np.zeros(300)

In [10]:
word2vec = lemmatized_text.apply(sentence_to_vec)

In [11]:
df['Predictions'] = model.predict(word2vec.tolist())

In [12]:
df = df[df['Predictions'] == 1]
df = df.drop(columns=['Predictions'])

## Save Data

In [13]:
df.to_excel('../' + fileName + '_Translated.xlsx', index=False, engine='openpyxl')