In [1]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Navigate to your dataset path
import os
os.listdir('/content/drive/My Drive')


Mounted at /content/drive


['WhatsApp Chat with Akila (1).txt',
 'WhatsApp Chat with Akila.txt',
 'VID-20201013-WA0000.mp4',
 'Notes_211015_195138.pdf',
 'WhatsApp Chat with Peach.txt',
 'econote.db',
 'my-cv.pdf',
 'jobs_dataset_with_features.csv',
 'Colab Notebooks',
 'data']

In [2]:
import pandas as pd

# Adjust the file path to match your dataset location
file_path = '/content/drive/My Drive/jobs_dataset_with_features.csv'

# Load the CSV into a DataFrame
df = pd.read_csv(file_path)

# Preview the first few rows
print(df.head())


                        Role  \
0       Social Media Manager   
1     Frontend Web Developer   
2    Quality Control Manager   
3  Wireless Network Engineer   
4         Conference Manager   

                                            Features  
0  5 to 15 Years Digital Marketing Specialist M.T...  
1  2 to 12 Years Web Developer BCA HTML, CSS, Jav...  
2  0 to 12 Years Operations Manager PhD Quality c...  
3  4 to 11 Years Network Engineer PhD Wireless ne...  
4  1 to 12 Years Event Manager MBA Event planning...  


In [3]:
# Check for null values and duplicates
print(df.info())
print(df.isnull().sum())

# Drop nulls and duplicates if needed
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# Ensure columns are named properly
df.columns = ['Role', 'Features']

print("Cleaned dataset:")
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1615940 entries, 0 to 1615939
Data columns (total 2 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   Role      1615940 non-null  object
 1   Features  1615940 non-null  object
dtypes: object(2)
memory usage: 24.7+ MB
None
Role        0
Features    0
dtype: int64
Cleaned dataset:
                        Role  \
0       Social Media Manager   
1     Frontend Web Developer   
2    Quality Control Manager   
3  Wireless Network Engineer   
4         Conference Manager   

                                            Features  
0  5 to 15 Years Digital Marketing Specialist M.T...  
1  2 to 12 Years Web Developer BCA HTML, CSS, Jav...  
2  0 to 12 Years Operations Manager PhD Quality c...  
3  4 to 11 Years Network Engineer PhD Wireless ne...  
4  1 to 12 Years Event Manager MBA Event planning...  


In [4]:
import re
import string

def clean_text(text):
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = text.strip()
    return text

# Apply the cleaning function
df['Cleaned_Features'] = df['Features'].apply(clean_text)

print(df[['Role', 'Cleaned_Features']].head())


                        Role  \
0       Social Media Manager   
1     Frontend Web Developer   
2    Quality Control Manager   
3  Wireless Network Engineer   
4         Conference Manager   

                                    Cleaned_Features  
0  to  years digital marketing specialist mtech s...  
1  to  years web developer bca html css javascrip...  
2  to  years operations manager phd quality contr...  
3  to  years network engineer phd wireless networ...  
4  to  years event manager mba event planning con...  


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit and transform the cleaned features
X = vectorizer.fit_transform(df['Cleaned_Features'])

# Target labels (Job Roles)
y = df['Role']

# Check feature matrix shape
print(f"Shape of feature matrix: {X.shape}")
print(f"Sample features: {vectorizer.get_feature_names_out()[:10]}")


Shape of feature matrix: (1615875, 1977)
Sample features: ['ab' 'abilities' 'ability' 'abnormalities' 'abuse' 'academic' 'accepted'
 'access' 'accessibility' 'accessible']


In [6]:
from sklearn.model_selection import train_test_split

# 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


Training set size: (1292700, 1977)
Testing set size: (323175, 1977)


In [7]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)


In [8]:
import joblib

In [9]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Accuracy
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
joblib.dump(rf_model, 'rf_model.pkl')


Accuracy: 1.00
Classification Report:
                                          precision    recall  f1-score   support

                           API Developer       1.00      1.00      1.00       695
                 Accessibility Developer       1.00      1.00      1.00       689
                       Account Executive       1.00      1.00      1.00      1369
                         Account Manager       1.00      1.00      1.00       706
                      Account Strategist       1.00      1.00      1.00       665
                   Accounting Controller       1.00      1.00      1.00       672
                      Accounting Manager       1.00      1.00      1.00       673
           Acute Care Nurse Practitioner       1.00      1.00      1.00       716
                     Addiction Counselor       1.00      1.00      1.00       686
                Administrative Assistant       1.00      1.00      1.00       750
              Administrative Coordinator       1.00      1.

['rf_model.pkl']

In [10]:
import joblib
joblib.dump(vectorizer, 'job_role_tfidf_vectorizer1.pkl')

['job_role_tfidf_vectorizer1.pkl']

In [11]:
# Save the trained model
joblib.dump(rf_model, 'job_role_classifier1JR.pkl')

['job_role_classifier1JR.pkl']

In [12]:
!pip install pymupdf


Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.3


In [13]:
import fitz  # PyMuPDF
import re
import joblib
import numpy as np

# Load the trained model and vectorizer
rf_model = joblib.load('job_role_classifier1JR.pkl')
vectorizer = joblib.load('job_role_tfidf_vectorizer1.pkl')

# Get all job roles
job_roles = rf_model.classes_

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

# Clean text (basic preprocessing)
def clean_text(text):
    text = re.sub(r'\W+', ' ', text)  # Remove non-word characters
    text = re.sub(r'\d+', '', text)    # Remove numbers
    return text.lower().strip()

# Predict top 5 job roles
def predict_top_5_job_roles(cv_pdf):
    cv_text = extract_text_from_pdf(cv_pdf)
    cleaned_text = clean_text(cv_text)

    # Transform the text using the TF-IDF vectorizer
    cv_tfidf = vectorizer.transform([cleaned_text])

    # Get prediction probabilities
    probs = rf_model.predict_proba(cv_tfidf)[0]

    # Get top 5 job roles based on probabilities
    top_5_indices = np.argsort(probs)[-5:][::-1]
    top_5_roles = [(job_roles[i], probs[i]) for i in top_5_indices]

    return top_5_roles

# Run prediction
cv_pdf_path = ''
suggested_roles = predict_top_5_job_roles(cv_pdf_path)

# Display suggestions
print("Top 5 Suggested Job Roles:")
for role, prob in suggested_roles:
    print(f"{role} (Confidence: {prob:.2%})")


Top 5 Suggested Job Roles:
Clinical Nurse Manager (Confidence: 8.00%)
International Tax Consultant (Confidence: 7.00%)
Lighting Designer (Confidence: 6.00%)
Inventory Control Specialist (Confidence: 5.00%)
Environmental Consultant (Confidence: 5.00%)


In [14]:
import joblib

# Load the trained model and vectorizer
rf_model = joblib.load('job_role_classifier1JR.pkl')
vectorizer = joblib.load('job_role_tfidf_vectorizer1.pkl')

# Combine both into a single dictionary
combined_job_role_model = {
    'model': rf_model,
    'vectorizer': vectorizer
}

# Save the combined model
joblib.dump(combined_job_role_model, 'job_role_classifier_combined.pkl')

print("Combined job role classifier and vectorizer saved as 'job_role_classifier_combined.pkl'.")


Combined job role classifier and vectorizer saved as 'job_role_classifier_combined.pkl'.
