In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import numpy as np
from gensim import downloader as api
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier

# Read the dataset from the Excel workbook.
df = pd.read_excel('TrainingEminDataSet.xlsx')

# Candidate text columns; only the ones actually present in the sheet are used.
columns = ['Title', 'OfferDescription', 'Requirements', 'Responsibilities', 'AdditionalInformation', 'Descriptions']
existing_columns = [name for name in columns if name in df.columns]


def _join_present_fields(row):
    """Join the non-missing values of one row into a space-separated string."""
    return ' '.join(row.dropna().astype(str))


def _lemmatize_text(text):
    """Tokenize ``text`` and rebuild it from the lemma of every token."""
    lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
    return ' '.join(lemmas)


# Merge the available text columns row by row, skipping missing cells.
df['combined_text'] = df[existing_columns].apply(_join_present_fields, axis=1)

# Rows whose merged text came out empty get the placeholder 'NA'.
df['combined_text'] = df['combined_text'].apply(lambda text: text or 'NA')

# Lemmatize every token of the merged text.
lemmatizer = WordNetLemmatizer()
df['lemmatized_text'] = df['combined_text'].apply(_lemmatize_text)

# Load the 'word2vec-google-news-300' vectors through gensim's downloader API.
word2vec_model = api.load("word2vec-google-news-300")

def sentence_to_vec(sentence):
    """Return the mean Word2Vec embedding of the tokens in ``sentence``.

    The sentence is tokenized with ``word_tokenize``; tokens that are not in
    the vocabulary of the module-level ``word2vec_model`` are skipped.

    Args:
        sentence: Text to embed.

    Returns:
        A 1-D NumPy array holding the element-wise mean of the vectors of
        all in-vocabulary tokens, or an all-zero vector with the model's
        dimensionality when no token is in the vocabulary.
    """
    vocabulary = word2vec_model.key_to_index
    vectors = [
        word2vec_model[token]
        for token in word_tokenize(sentence)
        if token in vocabulary
    ]
    if not vectors:
        # Size the fallback from the loaded model instead of a literal 300 so
        # that swapping the embedding model still yields same-length rows.
        return np.zeros(word2vec_model.vector_size)
    return np.mean(vectors, axis=0)

# Embed every lemmatized document with sentence_to_vec.
df['word2vec'] = df['lemmatized_text'].apply(sentence_to_vec)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['word2vec'].tolist(),
    df['Label'],
    random_state=42,
    test_size=0.2,
)
# Turn the lists of per-row vectors into arrays for the classifier.
X_train, X_test = np.array(X_train), np.array(X_test)

# Fit a 100-tree random forest on the training vectors.
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train, y_train)

# Predictions and evaluation
y_pred = clf.predict(X_test)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report each metric on its own line.
for caption, score in zip(
    ("Accuracy:", "Precision:", "Recall:", "F1-score:"),
    (accuracy, precision, recall, f1),
):
    print(caption, score)

Accuracy: 0.9666666666666667
Precision: 1.0
Recall: 0.9375
F1-score: 0.967741935483871
