# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK corpora used by preprocess_text (stop-word list and WordNet)
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# Load your dataset (assumed to have columns 'Resume_str' and 'Category')
df = pd.read_csv('Resume/Resume.csv')

# Check the column names and data types.
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

# Text preprocessing helpers

# Lazily-filled cache so the stop-word set and the lemmatizer are created once
# instead of once per call (preprocess_text runs for every resume row).
_NLTK_RESOURCES = {}


def _get_nltk_resources():
    """Return the ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _NLTK_RESOURCES:
        # Build both objects before caching either, so a failed corpus lookup
        # leaves the cache empty and the next call retries cleanly.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _NLTK_RESOURCES['stop_words'] = stop_words
        _NLTK_RESOURCES['lemmatizer'] = lemmatizer
    return _NLTK_RESOURCES['stop_words'], _NLTK_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize free text for vectorization.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, drop NLTK English stop words, and lemmatize the remaining
    words with ``WordNetLemmatizer`` (default settings).

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or the NaN that
            pandas uses for an empty CSV cell) are treated as empty text.

    Returns:
        The cleaned words joined by single spaces; ``''`` when nothing is left
        or when ``text`` is not a string.
    """
    # Missing cells arrive as float NaN / None; calling .lower() on them
    # would raise AttributeError and abort the whole DataFrame.apply.
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _get_nltk_resources()

    # Lowercase, then remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Single pass: filter stop words and lemmatize what remains;
    # split()/join also collapses runs of whitespace.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Clean every resume in place so the model only sees normalized text
df['Resume_str'] = df['Resume_str'].apply(preprocess_text)

# Hold out 20% of the rows for the final evaluation (fixed seed for repeatability)
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
X_train, y_train = train_data['Resume_str'], train_data['Category']
X_test, y_test = test_data['Resume_str'], test_data['Category']

# Model: TF-IDF features feeding a random forest classifier
tfidf_step = TfidfVectorizer(stop_words='english', max_features=5000)
forest_step = RandomForestClassifier(random_state=42)
pipeline = Pipeline(steps=[('tfidf', tfidf_step), ('classifier', forest_step)])

# Candidate hyperparameters; each key is '<pipeline step name>__<parameter>'
param_grid = dict(
    tfidf__max_features=[5000, 10000],
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[None, 10],
    classifier__min_samples_split=[2, 5],
    classifier__min_samples_leaf=[1, 2],
)

# Exhaustive 5-fold cross-validated search over the grid, scored by accuracy
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination
print("Best Hyperparameters:", grid_search.best_params_)

# Score the held-out rows with the search object
predictions = grid_search.predict(X_test)

# Overall accuracy on the test split
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

# Per-category precision / recall / F1
print("Classification Report:")
print(classification_report(y_test, predictions))

# Now, you can use the trained model to rank CVs for a given job description

# Raw text of the job description that the resumes will be ranked against
new_job_description = """
Our Client runs a media platform that provides marketing and advertising services through magazines, web, social media and mobile app. Our client is in search of a candidate with outstanding managerial abilities and expertise in handling both accounts an events.

Report To: Operations Manager

The Junior Accounts & Events Executive will assist with and support the delivery and operations of all events and promotions, ensuring all events run smoothly.
This role will include early mornings and late nights, as well as weekend work on a rota basis. The Events Team are a very busy, creative, tight knit group of vibrant professionals who strive to challenge and innovate wherever we can. We are a team of "yes" people who work tirelessly to deliver our events to the highest standard and pride ourselves on going the extra mile. We are a supportive team and have a flexible and collaborative approach to our work.
Daily and Monthly Responsibilities

    Day-to-day administration of events and programs including monitoring vendors, logistics planning, monitoring attendance, resolving issues.
    Overseeing the on the day set up and break down of events as required by the Events Team
    Early mornings will involve managing and coordinating suppliers to ensure that they adhere to event objectives and set up in a safe, secure and timely manner.
    Late nights will involve managing guests to ensure that they leave the venue safely, whilst also coordinating and overseeing suppliers who are breaking down equipment.
    Task administration: Planning, reporting and adherence to internal processes and procedures.
    Office errands support.

Required Skills and Qualifications

    Some event coordination experience.
    Strong interpersonal, co-working and collaborative skills.
    Excellent organizational, communication, negotiation, and multitasking skills
    Strong customer-service oriented attitude
    Aptitude for tech and collaborative / co-working digital tools such as Google Drive. I.e High computer literacy.
    Ability to work early morning and late nights as well as weekends and out of Kampala.
    Confidence to work independently.
    Hands-on and outgoing attitude.

Preferred

    Experience with Event Management, Activations.
    Experience in Hospitality Industry
    Active Interest in the Hospitality Industry: Restaurants, Hotels, Events, Lifestyle.
"""

# Clean the job description with the same preprocess_text routine that was
# applied to the resumes; the TF-IDF step itself runs inside the fitted
# pipeline when the text is passed to grid_search.
new_job_description = preprocess_text(new_job_description)

# Rank resumes by how strongly the model associates them with the job's category.
#
# predict_proba returns one column per class, in the order of
# grid_search.classes_. Hard-coding column 1 would score every resume by the
# probability of whichever category happens to be second in that order,
# regardless of the job description. Instead, find the category the model
# predicts for the job description and rank by that column.
job_probabilities = grid_search.predict_proba([new_job_description])
target_index = int(job_probabilities[0].argmax())
target_category = grid_search.classes_[target_index]
print(f"Predicted job category: {target_category}")

# The job description's own probability for its predicted category
# (1-element array, same shape as before)
ranking_scores = job_probabilities[:, target_index]

# Add ranking scores to the original dataset: each resume's probability of
# belonging to the job's predicted category.
# NOTE: df also contains the rows the model was trained on, so their scores
# are likely optimistic compared with unseen resumes.
df['Ranking_Score'] = grid_search.predict_proba(df['Resume_str'])[:, target_index]

# Sort the dataset based on the ranking scores (best match first)
ranked_df = df.sort_values(by='Ranking_Score', ascending=False)

# Display the ranked dataset
print(ranked_df[['Resume_str', 'Ranking_Score']])
