In [1]:
import unicodedata

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the resume corpus into a DataFrame. The rest of the script relies on
# the 'Resume_str' column (raw resume text) being present.
df = pd.read_csv(filepath_or_buffer='Resume/Resume.csv')

# Preprocess the data
def preprocess_text(text):
    """Normalise raw text before it is handed to the TF-IDF vectorizer.

    Steps, in order:
      1. lowercase the text;
      2. strip accents by decomposing characters (NFKD) so that the
         combining marks can be discarded ("résumé" -> "resume");
      3. drop every character that is neither alphanumeric nor whitespace.

    Punctuation is deleted, not replaced by a space, so "co-working"
    becomes "coworking".

    Args:
        text: Raw text. ``None`` and NaN (what pandas yields for an empty
            CSV cell) are treated as empty text; any other non-string value
            is converted with ``str``.

    Returns:
        The cleaned string (possibly empty).
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()

    # NFKD splits an accented letter into its base letter plus combining
    # marks. The marks are not alphanumeric, so the filter below removes
    # them and only the base letter survives.
    text = unicodedata.normalize('NFKD', text)

    return ''.join(char for char in text if char.isalnum() or char.isspace())

# Clean every resume in place so the vectorizer only sees normalised text
df['Resume_str'] = df['Resume_str'].apply(preprocess_text)

# Check the column names and data types after preprocessing.
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

# Feature extraction using TF-IDF (Term Frequency-Inverse Document Frequency).
# English stop words are dropped, unigrams and bigrams are both considered,
# and the vocabulary is capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2))
# Fit the vocabulary on the resumes; X holds one TF-IDF row per resume
X = tfidf_vectorizer.fit_transform(df['Resume_str'])

# Job description to match the resumes against. The raw posting is kept as a
# literal here, cleaned with the same preprocess_text() used on the resumes,
# and then vectorized with the TF-IDF vectorizer that was fitted on the resumes.
new_job_description = """
Our Client runs a media platform that provides marketing and advertising services through magazines, web, social media and mobile app. Our client is in search of a candidate with outstanding managerial abilities and expertise in handling both accounts an events.

Report To: Operations Manager

The Junior Accounts & Events Executive will assist with and support the delivery and operations of all events and promotions, ensuring all events run smoothly.
This role will include early mornings and late nights, as well as weekend work on a rota basis. The Events Team are a very busy, creative, tight knit group of vibrant professionals who strive to challenge and innovate wherever we can. We are a team of "yes" people who work tirelessly to deliver our events to the highest standard and pride ourselves on going the extra mile. We are a supportive team and have a flexible and collaborative approach to our work.
Daily and Monthly Responsibilities

    Day-to-day administration of events and programs including monitoring vendors, logistics planning, monitoring attendance, resolving issues.
    Overseeing the on the day set up and break down of events as required by the Events Team
    Early mornings will involve managing and coordinating suppliers to ensure that they adhere to event objectives and set up in a safe, secure and timely manner.
    Late nights will involve managing guests to ensure that they leave the venue safely, whilst also coordinating and overseeing suppliers who are breaking down equipment.
    Task administration: Planning, reporting and adherence to internal processes and procedures.
    Office errands support.

Required Skills and Qualifications

    Some event coordination experience.
    Strong interpersonal, co-working and collaborative skills.
    Excellent organizational, communication, negotiation, and multitasking skills
    Strong customer-service oriented attitude
    Aptitude for tech and collaborative / co-working digital tools such as Google Drive. I.e High computer literacy.
    Ability to work early morning and late nights as well as weekends and out of Kampala.
    Confidence to work independently.
    Hands-on and outgoing attitude.

Preferred

    Experience with Event Management, Activations.
    Experience in Hospitality Industry
    Active Interest in the Hospitality Industry: Restaurants, Hotels, Events, Lifestyle.
"""
# Overwrite the raw posting with its cleaned form (same cleaning as the resumes)
new_job_description = preprocess_text(new_job_description)
# transform(), not fit_transform(): reuse the vectorizer already fitted on the
# resumes. The single-element list makes the posting a one-document corpus.
new_job_description_tfidf = tfidf_vectorizer.transform([new_job_description])

# Score each resume against the job description. cosine_similarity gives a
# 2-D result for (job description x resumes); collapse it to a 1-D array
# holding one score per resume, in the same row order as df.
cosine_similarities = cosine_similarity(new_job_description_tfidf, X).ravel()

# Attach the scores to the dataset as a new column
df['Similarity_Score'] = cosine_similarities

# Best-matching resumes first
ranked_df = df.sort_values('Similarity_Score', ascending=False)

# Show the resume text next to its score, in ranked order
print(ranked_df.loc[:, ['Resume_str', 'Similarity_Score']])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           2484 non-null   int64 
 1   Resume_str   2484 non-null   object
 2   Resume_html  2484 non-null   object
 3   Category     2484 non-null   object
dtypes: int64(1), object(3)
memory usage: 77.8+ KB
None
                                             Resume_str  Similarity_Score
2066           events  public relations leader      ...          0.296624
2128           marketing and special events coordina...          0.264656
2138           senior account manager         profes...          0.208640
2073           public relations and event planning a...          0.200167
2096           human resources administrative assist...          0.198667
...                                                 ...               ...
1727           engineering intern         summary   ...          0.006829
