In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import tensorflow as tf
print(tf.__version__)


2.15.0


In [None]:
pip install tensorflow




**training word2vec model on our dataset**

In [3]:
from gensim.models import Word2Vec
import pandas as pd

df =  pd.read_csv("/content/drive/MyDrive/CareerCoach_Dataset.csv")
df['Skills'] = df['Skills'].str.replace('|', ',').str.lower()
tokenized_skills = [skill_set.split() for skill_set in df['Skills']]

# Train the Word2Vec model on the tokenized data
word2vec_model = Word2Vec(sentences=tokenized_skills, vector_size=100, window=5, min_count=1, workers=4)

word2vec_model.save("word2vec_skills.model")


  df['Skills'] = df['Skills'].str.replace('|', ',').str.lower()


Skill extraction

In [4]:
import spacy
import pandas as pd
from spacy.matcher import PhraseMatcher

#spaCy English model
nlp = spacy.load("en_core_web_sm")


skill_df = pd.read_csv("/content/drive/MyDrive/dataset_grad/first_trans_try.csv")
skill_keywords = skill_df["Skills"].astype(str).str.lower().tolist()
ignore_words = set(["me", "and", "or", "i", "myself", "experience", "excellent", "skill", "strong", "good", "be", "using","use", "skills"])
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(text) for text in skill_keywords if text not in ignore_words]
matcher.add("SKILL_PATTERNS", patterns)


def extract_entities_skills_and_bigrams(text):
    doc = nlp(text)
    probable_skills = set()

   #phrasematcher 3shan l multi word skill
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        span_text = span.text.lower().replace(" ", "_").replace(".", "_")
        probable_skills.add(span_text)

    # NER and ngrams
    for ent in doc.ents:
      ent_text = ent.text.lower().replace(" ", "_").replace(".", "_")
      if ent_text not in ignore_words and ent_text in skill_keywords:
        probable_skills.add(ent_text)
    for token1, token2 in zip(doc[:-1], doc[1:]):
        bigram_text = f"{token1.text.lower()} {token2.text.lower()}"
        bigram_key = bigram_text.replace(" ", "_").replace(".", "_")
        if bigram_text in skill_keywords and bigram_key not in ignore_words:
            probable_skills.add(bigram_key)

    # Use PoS tagging to identify nouns and verbs as probable skills
    for token in doc:
        if token.pos_ in ['NOUN', 'VERB']:
            token_text = token.lemma_.lower().replace(" ", "_").replace(".", "_")
            if token_text not in ignore_words and token_text in skill_keywords and token_text not in probable_skills:
                probable_skills.append(token_text)

    #n filter l hagat ele tl3t ba
    final_skills = set()
    for skill in probable_skills:
      if any(skill in multi_word_skill for multi_word_skill in probable_skills if multi_word_skill != skill):
        continue #y3ny lw l'a l skill de mwgoda f mukti word yskipha w myhothash tany
      final_skills.add(skill)
    return list(final_skills)




user_paragraph = "My skills are java, python, C++, machine learning and node js"
probable_skills = extract_entities_skills_and_bigrams(user_paragraph)

print ("Probable Skills:", probable_skills)





Probable Skills: ['c++', 'java', 'node_js', 'machine_learning', 'python']


**use the trained W2V model to transform user skills into vectors**

In [5]:
from gensim.models import Word2Vec
import tensorflow
from tensorflow import keras
from keras.utils import pad_sequences
import numpy as np


word2vec_model = Word2Vec.load("word2vec_skills.model")


def skill_to_vector(skill, model):
    skill = skill.replace('_', ' ')  # Convert back to space-separated
    if skill in model.wv:
        return model.wv[skill]
    else:
        return None

def skills_to_vector_sequence(skill_list, model):
    vectors = [skill_to_vector(skill, model) for skill in skill_list]
    vectors = [vec for vec in vectors if vec is not None]  # Remove skills not in the Word2Vec vocabulary
    return np.array(vectors)

# Example usage with probable_skills extracted from user input
probable_skill_vectors = [skills_to_vector_sequence([skill], word2vec_model) for skill in probable_skills]

# Determine the maximum sequence length
max_seq_length = max(len(skill_seq) for skill_seq in probable_skill_vectors)

# Pad sequences
padded_skill_sequences = pad_sequences(probable_skill_vectors, maxlen=max_seq_length, padding='post', dtype='float32')

print (padded_skill_sequences)
print ("skills",probable_skill_vectors)


[[[ 1.28334478e-01 -3.19948971e-01 -3.87016714e-01 -1.31472483e-01
    2.41616485e-03 -1.40886828e-01 -3.41250658e-01  1.33707196e-01
    6.74414992e-01  2.31944229e-02 -4.42652911e-01  1.04114890e+00
   -1.33193269e-01  5.80707490e-02  4.38426696e-02  4.84390736e-01
   -2.92658389e-01 -1.45168513e-01 -7.19160616e-01 -6.88091159e-01
   -3.21424782e-01 -5.02774179e-01  2.05604807e-01  5.28691232e-01
    9.73938227e-01  1.86664119e-01 -5.38564175e-02  4.57600772e-01
   -2.98110787e-02 -1.74651951e-01 -8.71968865e-01 -1.27052054e-01
    4.35195088e-01 -6.21793196e-02  7.98062146e-01  3.75206321e-01
   -3.47611815e-01 -1.14078082e-01  1.90170407e-02 -1.41919658e-01
    2.64568999e-02 -1.45481462e-02  5.55108011e-01  6.93215728e-02
   -4.56465304e-01 -1.71041191e-01  3.77587080e-01  1.95442997e-02
    1.59004748e-01  1.18379569e+00 -1.93044707e-01 -2.60847509e-01
   -9.11563858e-02 -6.99881494e-01  6.27568901e-01  2.61450976e-01
    1.28587902e-01  3.74561667e-01 -1.25633955e-01  5.20230114

In [6]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming df['Skills'] contains the skill sets for each entry in the dataset
# Transform each skill set into a sequence of vectors
dataset_skill_sequences = []
for skills_str in df['Skills']:
    # Split the skills string into individual skills
    skills_list = skills_str.split('|')  # Adjust split method based on your data formatting
    # Convert skills to vectors
    skill_vectors = skills_to_vector_sequence(skills_list, word2vec_model)
    dataset_skill_sequences.append(skill_vectors)

# Determine the maximum sequence length across all entries in the dataset
max_length_dataset = max(len(skill_seq) for skill_seq in dataset_skill_sequences)

# Pad the sequences for the entire dataset
padded_dataset_sequences = pad_sequences(dataset_skill_sequences, maxlen=max_length_dataset, padding='post', dtype='float32')



**Encoding Job titles**

In [7]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
job_titles_encoded = encoder.fit_transform(df[['Job Title']])
job_titles_df = pd.DataFrame(job_titles_encoded, columns=encoder.get_feature_names_out(['Job Title']))



In [8]:
print("Number of padded skill sequences:", len(padded_dataset_sequences))
print("Number of job titles encoded:", len(job_titles_encoded))


Number of padded skill sequences: 374134
Number of job titles encoded: 374134


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import tensorflow
from tensorflow import keras
from keras.utils import pad_sequences
import numpy as np

assert len(padded_dataset_sequences) == len(job_titles_encoded), "The number of skill sequences and job titles must match." #error msg

X_train, X_test, y_train, y_test = train_test_split(padded_dataset_sequences, job_titles_encoded, test_size=0.2, random_state=1)



**Model training LSTM**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Define the LSTM model
model = Sequential()
model.add(LSTM(units=50, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(units=y_train.shape[1], activation='softmax'))  # Output layer

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))
