In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset
data = pd.read_csv('data.csv')

# Preprocess the data.
# Rows missing either column are dropped first: astype(str) would otherwise
# turn NaN into the literal string 'nan', which would then show up as a bogus
# skill token and as a bogus 'nan' job-title class.
data = data.dropna(subset=['Skills', 'Job Title'])
X = data['Skills'].astype(str).values
y = data['Job Title'].astype(str).values

# Every padded sequence has exactly this many token ids
max_length = 100

# Learn the vocabulary from the skill strings, then map each string to a list
# of integer word indices.
# NOTE(review): Tokenizer() runs with its default `filters`, which Keras
# documents as stripping punctuation such as '+', '#' and '.'; skills like
# 'C++' and 'C#' may therefore collapse into one token - confirm against data.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)

# Shorter sequences get zeros appended at the end ('post' padding)
X_pad = pad_sequences(
    X_seq,
    maxlen=max_length,
    padding='post',
)

# One-hot encode the job titles.
# MultiLabelBinarizer expects one iterable of labels per sample, so each title
# is placed in its own single-element row; a bare string would be iterated
# character by character instead.
mlb = MultiLabelBinarizer()
y_onehot = mlb.fit_transform(y[:, np.newaxis])

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_pad,
    y_onehot,
    test_size=0.2,
    random_state=42,
)

# Define the GRU model: embedding -> two stacked GRU layers -> softmax classifier
model = Sequential([
    Embedding(
        input_dim=len(tokenizer.word_index) + 1,  # +1 because index 0 is the padding id
        output_dim=300,
        input_length=max_length,
        # The inputs are zero-padded at the end up to max_length. Masking index 0
        # makes the GRU layers skip those padded timesteps, so the final state
        # reflects the last real token instead of a long run of padding.
        mask_zero=True,
    ),
    GRU(128, return_sequences=True),  # emits every timestep so the next GRU gets a sequence
    GRU(64),  # returns only its final output, which feeds the classifier
    Dense(y_onehot.shape[1], activation='softmax'),  # one unit per job-title class
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model for 20 epochs.
# Validation uses a slice of the training data (the last 10% of X_train, which
# train_test_split has already shuffled) rather than the test set. Monitoring
# on X_test would make the final "Test Accuracy" a number that was already
# watched during training instead of a true held-out estimate.
model.fit(X_train, y_train, epochs=20, batch_size=64, validation_split=0.1)

# Score the trained model on the test split; evaluate() returns the loss
# followed by the metric configured at compile time (accuracy).
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print('Test Accuracy:', accuracy)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Accuracy: 0.7688398361206055
