In [None]:
!pip install --upgrade PyPDF2 # Upgrade PyPDF2 to the latest version

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack
import PyPDF2  # Import PyPDF2 for reading PDF files

# Load the training data
train_data = pd.read_csv('train.csv')

# Handle missing values (replace NaN with empty strings)
train_data['Resumes'] = train_data['Resumes'].fillna('')
train_data['JD'] = train_data['JD'].fillna('')

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer to the resumes and job descriptions
# Concatenate resumes and JDs after handling missing values
vectorizer.fit(train_data['Resumes'].tolist() + train_data['JD'].tolist())

# Transform the resumes and job descriptions into TF-IDF vectors
resume_vectors = vectorizer.transform(train_data['Resumes'])
jd_vectors = vectorizer.transform(train_data['JD'])

# Create a feature matrix by concatenating the resume and job description vectors
# Use hstack to concatenate sparse matrices
X = hstack([resume_vectors, jd_vectors])

# Create a target vector
y = train_data['Result']

# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Evaluate the model
y_pred = rf.predict(X)
print("Evaluation of the model:")
print("Accuracy:", accuracy_score(y, y_pred))
print("Classification Report:")
print(classification_report(y, y_pred))

# Function to read PDF file and extract text
def read_pdf(file_path):
    pdf_file = open(file_path, 'rb')
    # Use PdfReader instead of PdfFileReader
    read_pdf = PyPDF2.PdfReader(pdf_file)
    # Use len(read_pdf.pages) to get the number of pages
    number_of_pages = len(read_pdf.pages)
    text = ''
    for page_number in range(number_of_pages):
        page = read_pdf.pages[page_number]
        page_content = page.extract_text()
        text += page_content
    return text

# Function to predict the match result
def predict_match(resume_file, jd_file):
    if resume_file.endswith('.pdf'):
        resume_text = read_pdf(resume_file)
    else:
        # Assuming CSV files have a 'Resume' column
        resume_text = pd.read_csv(resume_file)['Resume'].iloc[0]

    if jd_file.endswith('.pdf'):
        jd_text = read_pdf(jd_file)
    else:
        # Assuming CSV files have a 'JD' column
        jd_text = pd.read_csv(jd_file)['JD'].iloc[0]

    resume_vector = vectorizer.transform([resume_text])
    jd_vector = vectorizer.transform([jd_text])
    feature_vector = hstack([resume_vector, jd_vector])
    return rf.predict(feature_vector)[0]

# Test the model (replace with actual file paths)
resume_file = 'resume.pdf'
jd_file = 'jd.pdf'
print("Test data result :")
print(predict_match(resume_file, jd_file))

Evaluation of the model:
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        42
           1       1.00      1.00      1.00        42

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84

Test data result :
1


In [16]:
!pip install --upgrade PyPDF2 # Upgrade PyPDF2 to the latest version

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack
import PyPDF2  # Import PyPDF2 for reading PDF files
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

# Load the training data
train_data = pd.read_csv('train.csv')

# Handle missing values (replace NaN with empty strings)
train_data['Resumes'] = train_data['Resumes'].fillna('')
train_data['JD'] = train_data['JD'].fillna('')

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer to the resumes and job descriptions
# Concatenate resumes and JDs after handling missing values
vectorizer.fit(train_data['Resumes'].tolist() + train_data['JD'].tolist())

# Transform the resumes and job descriptions into TF-IDF vectors
resume_vectors = vectorizer.transform(train_data['Resumes'])
jd_vectors = vectorizer.transform(train_data['JD'])

# Create a feature matrix by concatenating the resume and job description vectors
# Use hstack to concatenate sparse matrices
X = hstack([resume_vectors, jd_vectors])

# Calculate the percentage match for each resume and JD
def calculate_percentage_match(resume_text, jd_text):
    resume_words = set(word_tokenize(resume_text.lower()))
    jd_words = set(word_tokenize(jd_text.lower()))
    common_words = resume_words & jd_words
    percentage_match = len(common_words) / len(jd_words) * 100
    return percentage_match

percentage_matches = [calculate_percentage_match(resume, jd) for resume, jd in zip(train_data['Resumes'], train_data['JD'])]

# Add the percentage match feature to the feature matrix
X = hstack([X, np.array(percentage_matches).reshape(-1, 1)])

# Create a target vector
y = train_data['Result']

# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Evaluate the model
y_pred = rf.predict(X)
print("Evaluation of the model:")
print("Accuracy:", accuracy_score(y, y_pred))
print("Classification Report:")
print(classification_report(y, y_pred))

# Function to read PDF file and extract text
def read_pdf(file_path):
    pdf_file = open(file_path, 'rb')
    # Use PdfReader instead of PdfFileReader
    read_pdf = PyPDF2.PdfReader(pdf_file)
    # Use len(read_pdf.pages) to get the number of pages
    number_of_pages = len(read_pdf.pages)
    text = ''
    for page_number in range(number_of_pages):
        page = read_pdf.pages[page_number]
        page_content = page.extract_text()
        text += page_content
    return text

# Function to predict the match result
def predict_match(resume_file, jd_file):
    if resume_file.endswith('.pdf'):
        resume_text = read_pdf(resume_file)
    else:
        # Assuming CSV files have a 'Resume' column
        resume_text = pd.read_csv(resume_file)['Resume'].iloc[0]

    if jd_file.endswith('.pdf'):
        jd_text = read_pdf(jd_file)
    else:
        # Assuming CSV files have a 'JD' column
        jd_text = pd.read_csv(jd_file)['JD'].iloc[0]

    resume_vector = vectorizer.transform([resume_text])
    jd_vector = vectorizer.transform([jd_text])
    percentage_match = calculate_percentage_match(resume_text, jd_text)
    feature_vector = hstack([resume_vector, jd_vector, np.array([percentage_match]).reshape(1, -1)])
    return rf.predict(feature_vector)[0]

# Test the model (replace with actual file paths)
resume_file = 'resume.pdf'
jd_file = 'jd.pdf'
print("Test data result :")
print(predict_match(resume_file, jd_file))



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


ZeroDivisionError: division by zero

In [2]:
!pip install --upgrade PyPDF2 # Upgrade PyPDF2 to the latest version

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
import PyPDF2  # Import PyPDF2 for reading PDF files

# Load the training data
train_data = pd.read_csv('train.csv')

# Handle missing values (replace NaN with empty strings)
train_data['Resumes'] = train_data['Resumes'].fillna('')
train_data['JD'] = train_data['JD'].fillna('')

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer to the resumes and job descriptions
# Concatenate resumes and JDs after handling missing values
vectorizer.fit(train_data['Resumes'].tolist() + train_data['JD'].tolist())

# Transform the resumes and job descriptions into TF-IDF vectors
resume_vectors = vectorizer.transform(train_data['Resumes'])
jd_vectors = vectorizer.transform(train_data['JD'])

# Calculate the similarity score between resume and JD vectors
similarity_scores = cosine_similarity(resume_vectors, jd_vectors)

# Calculate the percentage match
percentage_match = similarity_scores * 100

# Create a feature matrix by concatenating the resume and job description vectors
# Use hstack to concatenate sparse matrices
X = hstack([resume_vectors, jd_vectors, percentage_match])

# Create a target vector
y = train_data['Result']

# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Evaluate the model
y_pred = rf.predict(X)
print("Evaluation of the model:")
print("Accuracy:", accuracy_score(y, y_pred))
print("Classification Report:")
print(classification_report(y, y_pred))

# Function to read PDF file and extract text
def read_pdf(file_path):
    pdf_file = open(file_path, 'rb')
    # Use PdfReader instead of PdfFileReader
    read_pdf = PyPDF2.PdfReader(pdf_file)
    # Use len(read_pdf.pages) to get the number of pages
    number_of_pages = len(read_pdf.pages)
    text = ''
    for page_number in range(number_of_pages):
        page = read_pdf.pages[page_number]
        page_content = page.extract_text()
        text += page_content
    return text

# Function to predict the match result
def predict_match(resume_file, jd_file):
    if resume_file.endswith('.pdf'):
        resume_text = read_pdf(resume_file)
    else:
        # Assuming CSV files have a 'Resume' column
        resume_text = pd.read_csv(resume_file)['Resume'].iloc[0]

    if jd_file.endswith('.pdf'):
        jd_text = read_pdf(jd_file)
    else:
        # Assuming CSV files have a 'JD' column
        jd_text = pd.read_csv(jd_file)['JD'].iloc[0]

    resume_vector = vectorizer.transform([resume_text])
    jd_vector = vectorizer.transform([jd_text])

    # Calculate the similarity score
    similarity_score = cosine_similarity(resume_vector, jd_vector)[0][0]

    # Calculate the percentage match
    percentage_match = similarity_score * 100

    feature_vector = hstack([resume_vector, jd_vector, percentage_match])
    return rf.predict(feature_vector)[0]

# Test the model (replace with actual file paths)
resume_file = 'resume.pdf'
jd_file = 'jd.pdf'
print("Test data result :")
print(predict_match(resume_file, jd_file))



FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [3]:
!pip install --upgrade PyPDF2 # Upgrade PyPDF2 to the latest version

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
import PyPDF2  # Import PyPDF2 for reading PDF files

# Load the training data
# Replace '/path/to/your/train.csv' with the actual path to your file
train_data = pd.read_csv('/path/to/your/train.csv')

# Handle missing values (replace NaN with empty strings)
train_data['Resumes'] = train_data['Resumes'].fillna('')
train_data['JD'] = train_data['JD'].fillna('')

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer to the resumes and job descriptions
# Concatenate resumes and JDs after handling missing values
vectorizer.fit(train_data['Resumes'].tolist() + train_data['JD'].tolist())

# Transform the resumes and job descriptions into TF-IDF vectors
resume_vectors = vectorizer.transform(train_data['Resumes'])
jd_vectors = vectorizer.transform(train_data['JD'])

# Calculate the similarity score between resume and JD vectors
similarity_scores = cosine_similarity(resume_vectors, jd_vectors)

# Calculate the percentage match
percentage_match = similarity_scores * 100

# Create a feature matrix by concatenating the resume and job description vectors
# Use hstack to concatenate sparse matrices
X = hstack([resume_vectors, jd_vectors, percentage_match])

# Create a target vector
y = train_data['Result']

# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Evaluate the model
y_pred = rf.predict(X)
print("Evaluation of the model:")
print("Accuracy:", accuracy_score(y, y_pred))
print("Classification Report:")
print(classification_report(y, y_pred))

# Function to read PDF file and extract text
def read_pdf(file_path):
    pdf_file = open(file_path, 'rb')
    # Use PdfReader instead of PdfFileReader
    read_pdf = PyPDF2.PdfReader(pdf_file)
    # Use len(read_pdf.pages) to get the number of pages
    number_of_pages = len(read_pdf.pages)
    text = ''
    for page_number in range(number_of_pages):
        page = read_pdf.pages[page_number]
        page_content = page.extract_text()
        text += page_content
    return text

# Function to predict the match result
def predict_match(resume_file, jd_file):
    if resume_file.endswith('.pdf'):
        resume_text = read_pdf(resume_file)
    else:
        # Assuming CSV files have a 'Resume' column
        resume_text = pd.read_csv(resume_file)['Resume'].iloc[0]

    if jd_file.endswith('.pdf'):
        jd_text = read_pdf(jd_file)
    else:
        # Assuming CSV files have a 'JD' column
        jd_text = pd.read_csv(jd_file)['JD'].iloc[0]

    resume_vector = vectorizer.transform([resume_text])
    jd_vector = vectorizer.transform([jd_text])

    # Calculate the similarity score
    similarity_score = cosine_similarity(resume_vector, jd_vector)[0][0]

    # Calculate the percentage match
    percentage_match = similarity_score * 100

    feature_vector = hstack([resume_vector, jd_vector, percentage_match])
    return rf.predict(feature_vector)[0]

# Test the model (replace with actual file paths)
resume_file = 'resume.pdf'
jd_file = 'jd.pdf'
print("Test data result :")
print(predict_match(resume_file, jd_file))



FileNotFoundError: [Errno 2] No such file or directory: '/path/to/your/train.csv'

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack
import PyPDF2  # Import PyPDF2 for reading PDF files

# Load the training data
train_data = pd.read_csv('train.csv')  # Replace with the actual path to your file

# Handle missing values (replace NaN with empty strings)
train_data['Resumes'] = train_data['Resumes'].fillna('')
train_data['JD'] = train_data['JD'].fillna('')

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer to the resumes and job descriptions
# Concatenate resumes and JDs after handling missing values
vectorizer.fit(train_data['Resumes'].tolist() + train_data['JD'].tolist())

# Transform the resumes and job descriptions into TF-IDF vectors
resume_vectors = vectorizer.transform(train_data['Resumes'])
jd_vectors = vectorizer.transform(train_data['JD'])

# Calculate the similarity score between resume and JD vectors
similarity_scores = cosine_similarity(resume_vectors, jd_vectors)

# Calculate the percentage match
percentage_match = similarity_scores * 100

# Create a feature matrix by concatenating the resume and job description vectors
# Use hstack to concatenate sparse matrices
X = hstack([resume_vectors, jd_vectors, percentage_match])

# Create a target vector
y = train_data['Result']

# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Evaluate the model
y_pred = rf.predict(X)
print("Evaluation of the model:")
print("Accuracy:", accuracy_score(y, y_pred))
print("Classification Report:")
print(classification_report(y, y_pred))

# Function to read PDF file and extract text
def read_pdf(file_path):
    pdf_file = open(file_path, 'rb')
    # Use PdfReader instead of PdfFileReader
    read_pdf = PyPDF2.PdfReader(pdf_file)
    # Use len(read_pdf.pages) to get the number of pages
    number_of_pages = len(read_pdf.pages)
    text = ''
    for page_number in range(number_of_pages):
        page = read_pdf.pages[page_number]
        page_content = page.extract_text()
        text += page_content
    return text

# Function to predict the match result
def predict_match(resume_file, jd_file):
    if resume_file.endswith('.pdf'):
        resume_text = read_pdf(resume_file)
    else:
        # Assuming CSV files have a 'Resume' column
        resume_text = pd.read_csv(resume_file)['Resume'].iloc[0]

    if jd_file.endswith('.pdf'):
        jd_text = read_pdf(jd_file)
    else:
        # Assuming CSV files have a 'JD' column
        jd_text = pd.read_csv(jd_file)['JD'].iloc[0]

    # Transform the resume and JD texts into TF-IDF vectors
    resume_vector = vectorizer.transform([resume_text])
    jd_vector = vectorizer.transform([jd_text])

    # Calculate the similarity score
    similarity_score = cosine_similarity(resume_vector, jd_vector)[0][0]

    # Calculate the percentage match
    percentage_match = similarity_score * 100

    # Create a feature matrix by concatenating the resume and job description vectors
    feature_vector = hstack([resume_vector, jd_vector, [[percentage_match]]])

    # Reshape the feature vector to match the expected input shape
    feature_vector = feature_vector.reshape(1, -1)
    feature_vector = hstack([resume_vector, jd_vector, [[percentage_match]]])
    print(feature_vector.shape)  # Check the shape of the feature vector
    return rf.predict(feature_vector)[0]
resume_file = 'resume.pdf'
jd_file = 'jd.pdf'
print("Test data result :")
print(predict_match(resume_file, jd_file))

Evaluation of the model:
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        42
           1       1.00      1.00      1.00        42

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84

Test data result :
(1, 10001)


ValueError: X has 10001 features, but RandomForestClassifier is expecting 10084 features as input.

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack
import PyPDF2

# Load the training data
train_data = pd.read_csv('train.csv')

# Handle missing values (replace NaN with empty strings)
train_data['Resumes'] = train_data['Resumes'].fillna('')
train_data['JD'] = train_data['JD'].fillna('')

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer to the resumes and job descriptions
vectorizer.fit(train_data['Resumes'].tolist() + train_data['JD'].tolist())

# Transform the resumes and job descriptions into TF-IDF vectors
resume_vectors = vectorizer.transform(train_data['Resumes'])
jd_vectors = vectorizer.transform(train_data['JD'])

# Create a feature matrix by concatenating the resume and job description vectors
X = hstack([resume_vectors, jd_vectors])

# Create a target vector
y = train_data['Result']

# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Evaluate the model
y_pred = rf.predict(X)
print("Evaluation of the model:")
print("Accuracy:", accuracy_score(y, y_pred))
print("Classification Report:")
print(classification_report(y, y_pred))

# Function to read PDF file and extract text
def read_pdf(file_path):
    pdf_file = open(file_path, 'rb')
    read_pdf = PyPDF2.PdfReader(pdf_file)
    number_of_pages = len(read_pdf.pages)
    text = ''
    for page_number in range(number_of_pages):
        page = read_pdf.pages[page_number]
        page_content = page.extract_text()
        text += page_content
    return text

# Function to calculate the percentage match between resume and JD
def calculate_percentage_match(resume_text, jd_text):
    resume_words = set(resume_text.split())
    jd_words = set(jd_text.split())
    common_words = resume_words & jd_words
    percentage_match = len(common_words) / len(jd_words) * 100
    return percentage_match

# Function to predict the match result
def predict_match(resume_file, jd_file):
    if resume_file.endswith('.pdf'):
        resume_text = read_pdf(resume_file)
    else:
        resume_text = pd.read_csv(resume_file)['Resume'].iloc[0]

    if jd_file.endswith('.pdf'):
        jd_text = read_pdf(jd_file)
    else:
        jd_text = pd.read_csv(jd_file)['JD'].iloc[0]

    resume_vector = vectorizer.transform([resume_text])
    jd_vector = vectorizer.transform([jd_text])
    percentage_match = calculate_percentage_match(resume_text, jd_text)
    feature_vector = hstack([resume_vector, jd_vector, [[percentage_match]]])
    return rf.predict(feature_vector)[0]

# Test the model (replace with actual file paths)
resume_file = 'resume.pdf'
jd_file = 'jd.pdf'
print("Test data result :")
print(predict_match(resume_file, jd_file))

Evaluation of the model:
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        42
           1       1.00      1.00      1.00        42

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84

Test data result :


ZeroDivisionError: division by zero

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack
import PyPDF2
import numpy as np

# Load the training data
train_data = pd.read_csv('train.csv')

# Handle missing values (replace NaN with empty strings)
train_data['Resumes'] = train_data['Resumes'].fillna('')
train_data['JD'] = train_data['JD'].fillna('')

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer to the resumes and job descriptions
vectorizer.fit(train_data['Resumes'].tolist() + train_data['JD'].tolist())

# Transform the resumes and job descriptions into TF-IDF vectors
resume_vectors = vectorizer.transform(train_data['Resumes'])
jd_vectors = vectorizer.transform(train_data['JD'])

# Create a feature matrix by concatenating the resume and job description vectors
X = hstack([resume_vectors, jd_vectors])

# Create a target vector
y = train_data['Result']

# Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Evaluate the model
y_pred = rf.predict(X)
print("Evaluation of the model:")
print("Accuracy:", accuracy_score(y, y_pred))
print("Classification Report:")
print(classification_report(y, y_pred))

# Function to read PDF file and extract text
def read_pdf(file_path):
    pdf_file = open(file_path, 'rb')
    read_pdf = PyPDF2.PdfReader(pdf_file)
    number_of_pages = len(read_pdf.pages)
    text = ''
    for page_number in range(number_of_pages):
        page = read_pdf.pages[page_number]
        page_content = page.extract_text()
        text += page_content
    return text

# Function to calculate the percentage match between resume and JD
def calculate_percentage_match(resume_text, jd_text):
    resume_words = set(resume_text.split())
    jd_words = set(jd_text.split())
    common_words = resume_words & jd_words
    if len(jd_words) == 0:
        percentage_match = 0
    else:
        percentage_match = len(common_words) / len(jd_words) * 100
    return percentage_match

# Function to predict the match result
def predict_match(resume_file, jd_file):
    if resume_file.endswith('.pdf'):
        resume_text = read_pdf(resume_file)
    else:
        resume_text = pd.read_csv(resume_file)['Resume'].iloc[0]

    if jd_file.endswith('.pdf'):
        jd_text = read_pdf(jd_file)
    else:
        jd_text = pd.read_csv(jd_file)['JD'].iloc[0]
        resume_vector = vectorizer.transform([resume_text])
    jd_vector = vectorizer.transform([jd_text])
    percentage_match = calculate_percentage_match(resume_text, jd_text)

    # Create the feature vector with the correct number of features (10000)
    feature_vector = hstack([resume_vector, jd_vector, [[percentage_match]]])

    return rf.predict(feature_vector)[0]

# Test the model (replace with actual file paths)
resume_file = 'resume.pdf'
jd_file = 'jd.pdf'
print("Test data result :")
print(predict_match(resume_file, jd_file))

Evaluation of the model:
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        42
           1       1.00      1.00      1.00        42

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84

Test data result :


UnboundLocalError: local variable 'resume_vector' referenced before assignment