In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# Load your dataset (assumed to have columns 'Resume_str' and 'Category')
df = pd.read_csv('Resume/Resume.csv')

# Check the column names and data types
print(df.info())

# Define a function for text preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters, numbers, and extra whitespaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join(word for word in text.split() if word not in stop_words)
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split())
    
    return text

# Apply preprocessing to 'Resume_str' column
df['Resume_str'] = df['Resume_str'].apply(preprocess_text)

# Split the data into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Feature extraction using TF-IDF (Term Frequency-Inverse Document Frequency)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = tfidf_vectorizer.fit_transform(train_data['Resume_str'])
X_test = tfidf_vectorizer.transform(test_data['Resume_str'])

# Create labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['Category'])
y_test = label_encoder.transform(test_data['Category'])

# Train a RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
predictions = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

# Additional evaluation metrics
print("Classification Report:")
print(classification_report(y_test, predictions))

# Now, you can use the trained model to rank CVs for a given job description

# Preprocess the job description using the same TF-IDF vectorizer
new_job_description = """
Our Client runs a media platform that provides marketing and advertising services through magazines, web, social media and mobile app. Our client is in search of a candidate with outstanding managerial abilities and expertise in handling both accounts an events.

Report To: Operations Manager

The Junior Accounts & Events Executive will assist with and support the delivery and operations of all events and promotions, ensuring all events run smoothly.
This role will include early mornings and late nights, as well as weekend work on a rota basis. The Events Team are a very busy, creative, tight knit group of vibrant professionals who strive to challenge and innovate wherever we can. We are a team of "yes" people who work tirelessly to deliver our events to the highest standard and pride ourselves on going the extra mile. We are a supportive team and have a flexible and collaborative approach to our work.
Daily and Monthly Responsibilities

    Day-to-day administration of events and programs including monitoring vendors, logistics planning, monitoring attendance, resolving issues.
    Overseeing the on the day set up and break down of events as required by the Events Team
    Early mornings will involve managing and coordinating suppliers to ensure that they adhere to event objectives and set up in a safe, secure and timely manner.
    Late nights will involve managing guests to ensure that they leave the venue safely, whilst also coordinating and overseeing suppliers who are breaking down equipment.
    Task administration: Planning, reporting and adherence to internal processes and procedures.
    Office errands support.

Required Skills and Qualifications

    Some event coordination experience.
    Strong interpersonal, co-working and collaborative skills.
    Excellent organizational, communication, negotiation, and multitasking skills
    Strong customer-service oriented attitude
    Aptitude for tech and collaborative / co-working digital tools such as Google Drive. I.e High computer literacy.
    Ability to work early morning and late nights as well as weekends and out of Kampala.
    Confidence to work independently.
    Hands-on and outgoing attitude.

Preferred

    Experience with Event Management, Activations.
    Experience in Hospitality Industry
    Active Interest in the Hospitality Industry: Restaurants, Hotels, Events, Lifestyle.
"""

new_job_description = preprocess_text(new_job_description)
new_job_description_tfidf = tfidf_vectorizer.transform([new_job_description])

# Make predictions for ranking
ranking_scores = rf_classifier.predict_proba(new_job_description_tfidf)[:, 1]

# Add ranking scores to the original dataset
df['Ranking_Score'] = rf_classifier.predict_proba(tfidf_vectorizer.transform(df['Resume_str']))[:, 1]

# Sort the dataset based on the ranking scores
ranked_df = df.sort_values(by='Ranking_Score', ascending=False)

# Display the ranked dataset
print(ranked_df[['Resume_str', 'Ranking_Score']])


[nltk_data] Downloading package stopwords to /home/nalugya-
[nltk_data]     vanessa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nalugya-
[nltk_data]     vanessa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           2484 non-null   int64 
 1   Resume_str   2484 non-null   object
 2   Resume_html  2484 non-null   object
 3   Category     2484 non-null   object
dtypes: int64(1), object(3)
memory usage: 77.8+ KB
None
Accuracy: 0.6599597585513078
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.93      0.84        29
           1       0.79      0.73      0.76        30
           2       0.25      0.12      0.17         8
           3       0.64      0.35      0.45        20
           4       0.29      0.11      0.16        18
           5       0.00      0.00      0.00         6
           6       0.78      0.86      0.82        21
           7       0.70      0.70      0.70        23
           8       0.00      0.00      0.00         2

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                             Resume_str  Ranking_Score
475   bilingual client advocate professional summary...           0.88
501   customer service advocate summary talented cus...           0.86
481   customer service advocate professional summary...           0.85
500   bilingual domestic violence advocate skill wor...           0.83
526   nurse clinician ii medicalcardiac intensive ca...           0.82
...                                                 ...            ...
1190  consultant professional summary selfmotivated ...           0.00
1189  consultant executive profile year experience m...           0.00
1188  consultant executive profile innovative profes...           0.00
1943  construction manager professional summary moti...           0.00
1447  summary kitchen manager six year boh operation...           0.00

[2484 rows x 2 columns]
