In [None]:
!pip install python-pdfbox
!pip install pyLDAvis
!pip install skillNer

from google.colab import drive
drive.mount('/content/drive')


import csv
import os
import glob
import re
import nltk
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy.tokens import Doc
import gensim
from gensim import corpora
import plotly.express as px
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud

from pdfbox import PDFBox
import hashlib # file ids
import spacy.cli
# Download the large English spaCy model so spacy.load() below can find it.
spacy.cli.download("en_core_web_lg")
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/regex deprecation warnings raised by
# the code further down — consider narrowing the filter.
import warnings 
warnings.filterwarnings('ignore')

# NLTK corpora used by the preprocessing step (stopwords.words and
# WordNetLemmatizer).
nltk.download(['stopwords','wordnet'])

# NOTE(review): omw-1.4 is presumably required by the WordNet lemmatizer on
# recent NLTK versions — confirm it is still needed.
nltk.download('omw-1.4')
# Shared spaCy pipeline; passed to SkillExtractor further down.
nlp=spacy.load('en_core_web_lg')


def extract_pdf_text(folder_path):
    """Run PDFBox text extraction on every ``*.pdf`` directly inside a folder.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.pdf`` files.

    Returns:
        A list of ``{"file_name": ..., "text": ...}`` dicts, one for each PDF
        whose ``PDFBox.extract_text`` call returned a non-empty value. The
        list is empty when the folder contains no PDFs.

    Side effects:
        For each PDF whose extraction call returned nothing, an empty
        ``<pdf file name>.txt`` placeholder is created in the current working
        directory (not in ``folder_path``).
    """
    text_list = []

    # glob.escape keeps characters such as "[" in the folder name from being
    # interpreted as wildcards; sorting makes the processing order stable.
    pattern = os.path.join(glob.escape(folder_path), "*.pdf")
    for pdf_path in sorted(glob.glob(pattern)):
        file_name = os.path.basename(pdf_path)
        pdf = PDFBox()
        # NOTE(review): python-pdfbox appears to write the extracted text to a
        # .txt file next to the PDF instead of returning it — confirm what
        # extract_text returns for the installed version.
        text = pdf.extract_text(pdf_path)
        if text:
            text_list.append({"file_name": file_name, "text": text})
        else:
            with open(f"{file_name}.txt", "w") as placeholder:
                placeholder.write("")

    # Previously the collected list was built and then dropped.
    return text_list

    

def text_to_csv(folder_path):
    """Collect every ``.txt`` file in ``folder_path`` into ``text_files.csv``.

    The CSV is written to the current working directory with the header
    ``ID, filename, text`` followed by one row per text file, ordered by
    file name.

    Args:
        folder_path: Directory whose ``.txt`` files are read (non-recursive).

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
            The folder is listed before the CSV is opened, so a bad path does
            not truncate an existing ``text_files.csv``.
    """

    def generate_id(filename):
        # md5 of the file name reduced modulo 100000 keeps the ID to at most
        # five digits; distinct file names can therefore share an ID.
        digest = hashlib.md5(filename.encode('utf-8')).hexdigest()
        return int(digest, 16) % 100000

    # Sorted so the row order does not depend on the file system.
    text_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".txt")
    )

    # Context managers guarantee the CSV is flushed and closed even when one
    # of the input files fails to read.
    with open('text_files.csv', 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['ID', 'filename', 'text'])

        for filename in text_names:
            file_path = os.path.join(folder_path, filename)
            # errors="replace" keeps one badly encoded resume from aborting
            # the whole export.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as text_file:
                text = text_file.read()
            writer.writerow([generate_id(filename), filename, text])

def load_csv(file_path):
    """Read the CSV at ``file_path`` with pandas and return the DataFrame."""
    return pd.read_csv(file_path)

from spacy.matcher import PhraseMatcher

# Default skills database shipped with skillNer.
from skillNer.general_params import SKILL_DB
# Skill extractor class from skillNer.
from skillNer.skill_extractor_class import SkillExtractor
# Module-level extractor shared by extract_skills(); built once from the
# spaCy pipeline loaded above, the default skill database and spaCy's
# PhraseMatcher.
# NOTE(review): construction presumably prints the "loading ..._matcher"
# lines seen in the cell output — confirm.
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

def extract_skills(text):
    """Return the distinct full-match skills found in ``text`` as one string.

    Args:
        text: Text handed to the module-level ``skill_extractor``.

    Returns:
        The unique ``doc_node_value`` entries of the annotation's
        ``results -> full_matches`` list, sorted alphabetically and joined
        with ``', '``. An empty string when there are no full matches.
    """
    annotations = skill_extractor.annotate(text)
    full_matches = annotations['results']['full_matches']
    unique_skills = {match['doc_node_value'] for match in full_matches}
    # Sorting makes the result reproducible; iterating a set of strings gives
    # a different order from one interpreter run to the next.
    return ', '.join(sorted(unique_skills))

def create_wordcloud(df, column_name):
    """Draw a circular word cloud from the strings in ``df[column_name]``.

    The column's values are concatenated (each followed by one space) and
    rendered on a new 8x8 matplotlib figure. The figure is left open;
    ``plt.show()`` is not called here.

    Args:
        df: DataFrame holding the text column.
        column_name: Name of the column whose string values are combined.
    """
    # A trailing space after every entry keeps neighbouring entries apart.
    combined_text = "".join(entry + " " for entry in df[column_name].values)

    plt.figure(figsize=(8, 8))

    # 300x300 mask holding 255 outside a radius-130 circle centred on
    # (150, 150) and 0 inside it.
    # NOTE(review): WordCloud presumably draws only where the mask is not
    # white (255) — confirm against the wordcloud docs.
    rows, cols = np.ogrid[:300, :300]
    outside_circle = (rows - 150) ** 2 + (cols - 150) ** 2 > 130 ** 2
    mask = 255 * outside_circle.astype(int)

    cloud_options = {
        "width": 800,
        "height": 800,
        "background_color": "white",
        "min_font_size": 6,
        "repeat": True,
        "mask": mask,
    }
    cloud = WordCloud(**cloud_options)
    cloud.generate(combined_text)

    plt.axis("off")
    plt.imshow(cloud, interpolation="bilinear")
    plt.title("Most Used Skill Words in Resume", fontsize=20)


def plot_skills_histogram(df):
    """Plot a bar chart of how many rows mention each skill.

    Args:
        df: DataFrame with a ``skills`` column of comma-separated skill
            names. The frame itself is not modified.

    Each skills string is split on commas and every piece is stripped of
    surrounding whitespace, so ``"sql, python"`` contributes ``"sql"`` and
    ``"python"`` rather than ``"sql"`` and ``" python"``. Empty pieces
    (from empty or missing skills) are ignored. When no skills remain, a
    message is printed and nothing is plotted.
    """
    skills = (
        df["skills"]
        .fillna("")       # missing skills behave like "no skills"
        .str.split(",")
        .explode()        # one row per individual skill
        .str.strip()      # the skills are joined with ", ", not ","
    )
    skill_counts = skills[skills != ""].value_counts()

    if skill_counts.empty:
        print("No skills to plot.")
        return

    skill_counts.plot(kind='bar')
    plt.xlabel("Skills")
    plt.ylabel("Number of Candidates")
    plt.title("Skills Histogram")
    plt.show()


# Google Drive folder holding the resumes to process.
folder_path = "/content/drive/MyDrive/Uploaded Resumes/Business Analytics"


# Run PDF text extraction over every PDF in the folder (the return value is
# not used here).
extract_pdf_text(folder_path)


# Gather every .txt file found in the folder into text_files.csv, which is
# written to the current working directory.
text_to_csv(folder_path)

# Load that CSV (columns: ID, filename, text) into a DataFrame.
file_path = 'text_files.csv'
df = load_csv(file_path)



# To preprocess the text in the csv file
# The stop-word set and the lemmatizer are identical for every resume, so
# they are built once instead of once per word / per row.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
# Raw string so the regex escapes (\w, \S, \/) reach the regex engine as
# written instead of relying on Python passing unknown string escapes through.
# Matches @mentions, any character other than ASCII letters, digits, space or
# tab, URLs, a leading "rt", and 'http...' runs ending in a double quote.
noise_pattern = re.compile(
    r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"'
)

clean = []
# An empty .txt file is read back from the CSV as NaN; treat it as empty text
# rather than letting the regex call fail on a float.
for raw_text in df["text"].fillna("").astype(str):
    words = noise_pattern.sub(" ", raw_text).lower().split()
    clean.append(
        " ".join(
            lemmatizer.lemmatize(word)
            for word in words
            if word not in stop_words
        )
    )

df["Clean_Resume"] = clean

# Extract a comma-separated skills string for every cleaned resume.
df['skills'] = df['Clean_Resume'].apply(extract_skills)
# NOTE(review): a notebook only displays the last expression of a cell, so
# this preview is presumably not shown in its current position.
df.head()

# Word cloud built from all extracted skills strings.
create_wordcloud(df, "skills")
def rank_resumes(df, keywords, scores):
    """Score each resume by the keywords found in its ``skills`` string.

    Args:
        df: DataFrame with a ``skills`` column of skill strings. A ``rank``
            column is added to (or overwritten on) this frame.
        keywords: Sequence of substrings to look for in each skills string.
        scores: Sequence of scores, paired with ``keywords`` by position.

    Returns:
        A copy of ``df`` sorted by ``rank`` in descending order; rows with
        equal rank keep their original relative order.

    Notes:
        Matching is a plain substring test, so ``"sql"`` also matches
        ``"mysql"``. Missing skills count as an empty string. When
        ``keywords`` and ``scores`` differ in length, a ``UserWarning`` is
        emitted and keywords without a score contribute nothing.
    """
    if len(keywords) != len(scores):
        warnings.warn(
            f"rank_resumes received {len(keywords)} keywords but "
            f"{len(scores)} scores; keywords without a score are ignored.",
            UserWarning,
            stacklevel=2,
        )

    scores_dict = dict(zip(keywords, scores))
    skills = df["skills"].fillna("").astype(str)

    # Accumulate on a standalone Series and assign it once. Updating
    # df["rank"].iloc[i] in place is a chained assignment, which pandas does
    # not guarantee will write through to the frame.
    rank = pd.Series(0, index=df.index)
    for keyword in keywords:
        if keyword not in scores_dict:
            continue
        has_keyword = skills.str.contains(keyword, regex=False)
        rank = rank + has_keyword * scores_dict[keyword]

    # Assign by position so a non-unique index cannot trigger realignment.
    df["rank"] = rank.to_numpy()
    return df.sort_values("rank", ascending=False, kind="stable")
# NOTE(review): three keywords are listed but only two scores, so
# 'time management' has no corresponding score — add the missing value.
keywords = ['business development','machine learn', 'time management']# keywords to search for in the extracted skills
scores = [2, 3]# scores for the keywords above, paired by position
# Rank the resumes and save the sorted table to Google Drive.
df = rank_resumes(df, keywords, scores)
df.to_csv('/content/drive/MyDrive/Uploaded Resumes/ranked_resumesBA.csv', index=False)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...
