In [1]:
import os
import re
import requests
import sys
import json
import google.generativeai as genai
import pandas as pd
from pdfminer.high_level import extract_text
from docx import Document

In [2]:
# Function to preprocess the extracted text
def preprocess_text(text):
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove any non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    return text

# Function to extract text from a PDF file
def extract_text_from_pdf(file_path):
    return extract_text(file_path)

# Function to extract text from a DOCX file
def extract_text_from_docx(file_path):
    doc = Document(file_path)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    return '\n'.join(full_text)

# Function to parse the extracted text
def parse_extracted_text(text):
    details = {}
    lines = text.strip().split('\n')
    for line in lines:
        if ':**' in line:
            key, value = line.split(':**', 1)
            key = key.replace('* **', '').strip()
            value = value.replace('**', '').strip()
            details[key] = value
    return details


In [3]:
# Directory containing the resumes
directory = "Resumes/"

# Initialize a list to hold all extracted details
all_extracted_details = []


In [4]:
# Configure Google Generative AI
genai.configure(api_key="AIzaSyA4YgbC_RYiUyeuXo-wOI5DOehZpZvjnJk")
model = genai.GenerativeModel(model_name='gemini-1.5-flash')

In [5]:
import os
import pandas as pd
import io

# Initialize an empty list to store all extracted details
all_extracted_details = []

# Loop through each file in the directory
for filename in os.listdir(directory):
    file_path = os.path.join(directory, filename)

    # Check if the file is a PDF or DOCX
    if filename.endswith('.pdf'):
        text = extract_text_from_pdf(file_path)
    elif filename.endswith('.docx'):
        text = extract_text_from_docx(file_path)
    else:
        print(f"Unsupported file type for file: {filename}. Skipping.")
        continue

    # Preprocess the extracted text
    processed_text = preprocess_text(text)

    prompt = f"""
    Extract the following information from the resume: 
    - Name
    - Email
    - Phone
    - Date of Birth (DOB)
    - College
    - City
    - Stream of Work
    - Top Skills, Tope most skill)
    - Experience
    - College passed out year

    If any field is unavailable, use 'NA'. Present the results in this uniform format:
    {{
        "Name": "<Name>",
        "Email": "<Email>",
        "Phone": "<Phone>",
        "DOB": "<DOB>",
        "College": "<College>",
        "City": "<City>",
        "Stream of Work": "<Stream of Work>",
        "Top Skills": "<Skill1>",
        "Experience": "<Experience>"
        "Passed Out Year": "<College passed out year>"
    }}
    """

    # Create a StringIO object to capture the model's response
    captured_output = io.StringIO()
    
    # Generate response from the model
    response = model.generate_content([processed_text, prompt])
    
    # Capture the model response
    print(response.text.strip(), file=captured_output)  # Capture output in the StringIO object
    captured_response_text = captured_output.getvalue()  # Get the captured text
    print(f"Raw model response for {filename}:\n{captured_response_text}\n")  # Print the raw response
    
    # Initialize default extracted details
    extracted_details = {
        "Name": "NA",
        "Email": "NA",
        "Phone": "NA",
        "DOB": "NA",
        "College": "NA",
        "City": "NA",
        "Stream of Work": "NA",
        "Top Skills": "NA",  # Store as a single string
        "Experience": "NA",
        "Passed Out Year": "NA"
    }
    
    # Only proceed if the captured response text is not empty
    if captured_response_text:
        # Split the captured text by lines and manually extract values
        lines = captured_response_text.strip().splitlines()
        for line in lines:
            if "Name" in line:
                extracted_details["Name"] = line.split(":")[-1].strip().strip('"')
            elif "Email" in line:
                extracted_details["Email"] = line.split(":")[-1].strip().strip('"')
            elif "Phone" in line:
                extracted_details["Phone"] = line.split(":")[-1].strip().strip('"')
            elif "DOB" in line:
                extracted_details["DOB"] = line.split(":")[-1].strip().strip('"')
            elif "College" in line:
                extracted_details["College"] = line.split(":")[-1].strip().strip('"')
            elif "City" in line:
                extracted_details["City"] = line.split(":")[-1].strip().strip('"')
            elif "Stream of Work" in line:
                extracted_details["Stream of Work"] = line.split(":")[-1].strip().strip('"')
            elif "Top Skills" in line:
                # Extract the skills as a single string
                extracted_details["Top Skills"] = line.split(":")[-1].strip().strip('"')
            elif "Experience" in line:
                extracted_details["Experience"] = line.split(":")[-1].strip().strip('"')
            elif "Passed Out Year" in line:
                extracted_details["Passed Out Year"] = line.split(":")[-1].strip().strip('"')
    
    # Append the extracted details to the list
    all_extracted_details.append(extracted_details)


Unsupported file type for file: .ipynb_checkpoints. Skipping.
Raw model response for Chitturi Prasad.pdf:
```json
{
    "Name": "Chitturi Prasad",
    "Email": "chprasad@kluniversity.in",
    "Phone": "9700674515",
    "DOB": "NA",
    "College": "KL University",
    "City": "NA",
    "Stream of Work": "Assistant Professor",
    "Top Skills": "Java",
    "Experience": "Working as Assistant Professor in KL University from July 2019.",
    "Passed Out Year": "2017"
}
```


Unsupported file type for file: extracted_details.csv. Skipping.
Unsupported file type for file: extracted_details.json. Skipping.
Unsupported file type for file: extracted_details_unique.csv. Skipping.
Unsupported file type for file: extracted_resume_details.csv. Skipping.
Unsupported file type for file: extracted_resume_details1.csv. Skipping.
Unsupported file type for file: extracted_resume_details_filtered.csv. Skipping.
Raw model response for Kumar Saurav.pdf:
```json
{
    "Name": "Kumar Saurav",
    "Email": "ku

In [18]:
# Convert the list of extracted details into a DataFrame
df = pd.DataFrame(all_extracted_details)

# Display the DataFrame
print(df)

# Optionally, save the DataFrame to a CSV file
df.to_csv("Resumes/Sample/extracted_resume_details.csv", index=False)

                 Name                             Email             Phone  \
0   Chitturi Prasad",        chprasad@kluniversity.in",      9700674515",   
1      Kumar Saurav",  kumarsauravsmart2010@gmail.com",  +91-8420538839",   
2  Rachelle Beaudry",       hello@reallygreatsite.com",    123-456-7890",   
3    SAGAR BHANDARI",     sbhandari1@stcloudstate.edu",              NA",   

    DOB                                            College           City  \
0  NA",                                    KL University",           NA",   
1  NA",  Indian Institute of Engineering Science and Te...      Kolkata",   
2  NA",                                     City College",     Any City",   
3  NA",                       St. Cloud State University",  St Cloud,MN",   

           Stream of Work            Top Skills  \
0   Assistant Professor",                Java",   
1  Software Development",                Java",   
2            Accounting",  Financial Analysis",   
3   Information Systems"

In [19]:
print(df.columns.tolist())

['Name', 'Email', 'Phone', 'DOB', 'College', 'City', 'Stream of Work', 'Top Skills', 'Experience', 'Passed Out Year']


In [20]:
df.columns = df.columns.str.strip()  # Removes leading and trailing spaces


In [21]:
# Extract unique nodes
unique_names = df["Name"].unique().tolist()
unique_skills = df["Top Skills"].unique().tolist()
unique_colleges = df["College"].unique().tolist()
unique_years = df["Passed Out Year"].unique().tolist()

# Find the maximum length for consistent DataFrame creation
max_length = max(len(unique_names), len(unique_skills), len(unique_colleges), len(unique_years))

# Create a dictionary with the maximum length and fill missing values with NaN
nodes_dict = {
    "name": unique_names + [None] * (max_length - len(unique_names)),
    "skills": unique_skills + [None] * (max_length - len(unique_skills)),
    "college": unique_colleges + [None] * (max_length - len(unique_colleges)),
    "year_of_pass_out": unique_years + [None] * (max_length - len(unique_years))
}


# Create the nodes DataFrame
nodes_df = pd.DataFrame(nodes_dict)

# Display the nodes DataFrame
print(nodes_df)

                 name                skills  \
0   Chitturi Prasad",                Java",   
1      Kumar Saurav",  Financial Analysis",   
2  Rachelle Beaudry",                 SQL",   
3    SAGAR BHANDARI",                  None   

                                             college year_of_pass_out  
0                                    KL University",             2017  
1  Indian Institute of Engineering Science and Te...             2018  
2                                     City College",             2020  
3                       St. Cloud State University",             None  


In [42]:
# Rename columns to match Dgraph schema
nodes_df.rename(columns={
    "Name": "name",
    "Skills": "skills",  # Rename to match the schema as it's a list
    "College": "college",
    "Passed Out Year": "year_of_pass_out"
}, inplace=True)

# Print DataFrame to check the structure
print("\nRevised DataFrame structure:")
print(nodes_df.head())
print(nodes_df.columns)


Revised DataFrame structure:
                 name                skills  \
0   Chitturi Prasad",                Java",   
1      Kumar Saurav",  Financial Analysis",   
2  Rachelle Beaudry",                 SQL",   
3    SAGAR BHANDARI",                  None   

                                             college year_of_pass_out  
0                                    KL University",             2017  
1  Indian Institute of Engineering Science and Te...             2018  
2                                     City College",             2020  
3                       St. Cloud State University",             None  
Index(['name', 'skills', 'college', 'year_of_pass_out'], dtype='object')


In [43]:
import pydgraph

In [44]:
import grpc
from pydgraph import DgraphClient, DgraphClientStub

In [45]:
DGRAPH_ENDPOINT = "https://green-feather-41421546.ap-south-1.aws.cloud.dgraph.io/graphql"
# Dgraph GraphQL endpoint

In [68]:
import requests

DGRAPH_ENDPOINT = "https://green-feather-41421546.ap-south-1.aws.cloud.dgraph.io/graphql"

# Function to insert candidate data
def mutate_candidate_data(name, college, year_of_pass_out, skills):
    # Construct the mutation string
    mutation = f'''
    mutation {{
        addCandidate(input: {{
            name: "{name.strip()}"
            college: {{
                name: "{college.strip()}"
            }}
            year_of_pass_out: {year_of_pass_out if year_of_pass_out is not None else 'null'}
            skills: [{', '.join([f'{{name: "{skill.strip()}"}}' for skill in skills])}]
        }}) {{
            # Note: Remove 'id' from the query if it causes an error
            candidate {{
                name
            }}
        }}
    }}
    '''

    print("Mutation query:", mutation)  # Debug output to see the mutation query

    # Send the mutation request
    response = requests.post(DGRAPH_ENDPOINT, json={'query': mutation})

    # Check the response and print success or error
    if response.status_code == 200:
        print(f"Successfully inserted data for {name.strip()}")
        print("Response data:", response.json())  # Print the response for debugging
    else:
        print(f"Error inserting data for {name.strip()}: {response.json()}")

# Example usage
mutate_candidate_data("Chitturi Prasad", "KL University", 2017, ["Java"])
mutate_candidate_data("Kumar Saurav", "Indian Institute of Engineering Science and Technology, Shibpur", 2018, ["Financial Analysis"])
mutate_candidate_data("Rachelle Beaudry", "City College", 2020, ["SQL"])


Mutation query: 
    mutation {
        addCandidate(input: {
            name: "Chitturi Prasad"
            college: {
                name: "KL University"
            }
            year_of_pass_out: 2017
            skills: [{name: "Java"}]
        }) {
            # Note: Remove 'id' from the query if it causes an error
            candidate {
                name
            }
        }
    }
    
Successfully inserted data for Chitturi Prasad
Response data: {'data': {'addCandidate': {'candidate': [{'name': 'Chitturi Prasad'}]}}, 'extensions': {'touched_uids': 15, 'tracing': {'version': 1, 'startTime': '2024-10-03T17:26:42.030380667Z', 'endTime': '2024-10-03T17:26:42.241710991Z', 'duration': 211330434, 'execution': {'resolvers': [{'path': ['addCandidate'], 'parentType': 'Mutation', 'fieldName': 'addCandidate', 'returnType': 'AddCandidatePayload', 'startOffset': 145971, 'duration': 211176343, 'dgraph': [{'label': 'preMutationQuery', 'startOffset': 0, 'duration': 0}, {'label': 'mut

In [69]:
# Insert data from DataFrame with data cleaning
for index, row in nodes_df.iterrows():
    # Adjust column names based on the renamed ones
    name = row['name'].strip().replace('"', '')  # Use 'name' after renaming
    college = row['college'].strip().replace('"', '')  # Use 'college' after renaming
    year_of_pass_out = str(row['year_of_pass_out']).strip()  # Use 'year_of_pass_out' after renaming
    
    # Check if skills is not None before splitting
    if row['skills']:
        skills = [skill.strip().replace('"', '') for skill in row['skills'].split(',') if skill.strip()]
    else:
        skills = []  # Handle missing skills
    
    # Insert only if all required fields are available
    if name and college and year_of_pass_out and skills:
        print(f"Inserting: Name={name}, College={college}, Year={year_of_pass_out}, Skills={skills}")
        mutate_candidate_data(name, college, year_of_pass_out, skills)
    else:
        print(f"Skipping row {index} due to missing data.")

Inserting: Name=Chitturi Prasad,, College=KL University,, Year=2017, Skills=['Java']
Mutation query: 
    mutation {
        addCandidate(input: {
            name: "Chitturi Prasad,"
            college: {
                name: "KL University,"
            }
            year_of_pass_out: 2017
            skills: [{name: "Java"}]
        }) {
            # Note: Remove 'id' from the query if it causes an error
            candidate {
                name
            }
        }
    }
    
Successfully inserted data for Chitturi Prasad,
Response data: {'data': {'addCandidate': {'candidate': [{'name': 'Chitturi Prasad,'}]}}, 'extensions': {'touched_uids': 15, 'tracing': {'version': 1, 'startTime': '2024-10-03T17:26:53.196974464Z', 'endTime': '2024-10-03T17:26:53.454447148Z', 'duration': 257472684, 'execution': {'resolvers': [{'path': ['addCandidate'], 'parentType': 'Mutation', 'fieldName': 'addCandidate', 'returnType': 'AddCandidatePayload', 'startOffset': 177892, 'duration': 257289152, 

In [75]:
# Function to query candidates from Dgraph
def query_candidate_data():
    # Construct the query string
    query = '''
    query {
        queryCandidate {
            name
            college {
                name
            }
            year_of_pass_out
            skills {
                name
            }
        }
    }
    '''

    print("Query:", query)  # Debug output to see the query

    # Send the query request
    response = requests.post(DGRAPH_ENDPOINT, json={'query': query})

    # Check the response and print success or error
    if response.status_code == 200:
        print("Successfully retrieved data")
        print("Response data:", response.json())  # Print the response for debugging
    else:
        print("Error retrieving data:", response.json())

In [76]:
query_candidates()

Successfully queried data:
{'data': {'queryCandidate': [{'name': 'Chitturi Prasad', 'college': {'name': 'KL University'}, 'year_of_pass_out': 2017, 'skills': [{'name': 'Java'}]}, {'name': 'Kumar Saurav', 'college': {'name': 'Indian Institute of Engineering Science and Technology, Shibpur'}, 'year_of_pass_out': 2018, 'skills': [{'name': 'Financial Analysis'}]}, {'name': 'Rachelle Beaudry', 'college': {'name': 'City College'}, 'year_of_pass_out': 2020, 'skills': [{'name': 'SQL'}]}, {'name': 'Chitturi Prasad,', 'college': {'name': 'KL University,'}, 'year_of_pass_out': 2017, 'skills': [{'name': 'Java'}]}, {'name': 'Kumar Saurav,', 'college': {'name': 'Indian Institute of Engineering Science and Technology, Shibpur,'}, 'year_of_pass_out': 2018, 'skills': [{'name': 'Financial Analysis'}]}, {'name': 'Rachelle Beaudry,', 'college': {'name': 'City College,'}, 'year_of_pass_out': 2020, 'skills': [{'name': 'SQL'}]}]}, 'extensions': {'touched_uids': 54, 'tracing': {'version': 1, 'startTime': '202