In [39]:
import numpy as np
import pandas as pd

In [40]:
# Load the raw job postings; the path is relative to the notebook's working directory.
job_df = pd.read_csv("job_descriptions.csv")

In [None]:
# Preview the first two rows of the raw frame.
job_df.head(2)

In [None]:
# List the column labels as loaded from the CSV.
job_df.columns

In [None]:
# Columns kept for the rest of the notebook
columns_to_include = [
    'Experience', 'Qualifications', 'Salary Range', 'location',
    'Country', 'Work Type', 'Company Size',
    'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
    'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
    'skills', 'Responsibilities', 'Company'
]

# Keep only those columns. The explicit .copy() gives job_df its own data, so
# the column assignments made in later cells do not write into a slice of the
# frame returned by read_csv (which triggers SettingWithCopyWarning).
job_df = job_df[columns_to_include].copy()

# Verify the result (bare expression -> rich notebook rendering)
job_df.head(2)


In [None]:
# Confirm which columns remain after the selection above.
job_df.columns

In [None]:
# Number of rows in job_df.
print(len(job_df))

In [None]:
# (rows, columns) of job_df.
job_df.shape

In [None]:
# Per-column dtype and non-null summary.
job_df.info()

In [None]:
# Read one full description (row labelled 20) to see what the raw text looks like.
job_df.loc[20, 'Job Description']

In [49]:
# Parse the 'Job Posting Date' values into datetimes and store them back
# in the same column.
parsed_posting_dates = pd.to_datetime(job_df['Job Posting Date'])
job_df['Job Posting Date'] = parsed_posting_dates

In [None]:
# Missing-value counts before filling, kept so both states can be shown together.
nulls_before = job_df.isnull().sum()

# Fill only the text-like columns with ''. Writing '' into the datetime
# ('Job Posting Date') or numeric ('Company Size') columns would turn them into
# object columns and break the later astype(int) conversion.
text_like_cols = job_df.select_dtypes(exclude=['number', 'datetime', 'datetimetz']).columns
job_df[text_like_cols] = job_df[text_like_cols].fillna('')

# Side-by-side missing-value counts (the cell's displayed output).
pd.DataFrame({'nulls_before': nulls_before, 'nulls_after': job_df.isnull().sum()})

# cleaning dataset
keeping all letters and digits                          
lower case                             
removing stopwords                            
tokenization                            
stemming                         

In [51]:
from nltk.corpus import stopwords
import nltk
import re
from nltk.stem.porter import PorterStemmer
# Single shared Porter stemmer instance, used by cleaning() below.
ps = PorterStemmer()

In [52]:
# Lazily-built cache of the English stop words (filled on the first cleaning() call).
_STOP_WORDS = None


def cleaning(txt):
    """Normalise a piece of text for vectorisation.

    Steps, in order:
      1. drop every character that is not a letter, digit or whitespace;
      2. lower-case and tokenize with ``nltk.word_tokenize``;
      3. discard English stop words;
      4. stem the remaining tokens with the shared Porter stemmer ``ps``.

    Parameters
    ----------
    txt : str
        Raw text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # stopwords.words() re-reads the corpus on every call; calling it once
        # per token (as before) dominated the runtime. A frozenset also makes
        # the membership test constant-time instead of a list scan.
        _STOP_WORDS = frozenset(stopwords.words('english'))

    # step 1: keep letters, digits and whitespace only
    txt = re.sub(r'[^a-zA-Z0-9\s]', '', txt)
    # step 2: lower-case, then tokenize
    tokens = nltk.word_tokenize(txt.lower())
    # steps 3 and 4: drop stop words, stem what is left
    return " ".join(ps.stem(w) for w in tokens if w not in _STOP_WORDS)
    

In [None]:
# Smoke-test cleaning() on a string with control characters, stop words,
# punctuation/symbols and digits.
cleaning("\n\rhelo the master piece is my loving moving cat @9032#%$")

In [54]:
# Random sample of (at most) 10,000 rows for testing.
# A fixed seed keeps the sample -- and therefore the similarity matrix built
# from it -- identical across kernel restarts; min() avoids the ValueError that
# sample() raises when n exceeds the number of rows.
SAMPLE_SIZE = 10_000
SAMPLE_SEED = 42
job_df_sample = job_df.sample(n=min(SAMPLE_SIZE, len(job_df)), random_state=SAMPLE_SEED)


In [56]:
# Convert 'Company Size' to integer.
# NOTE: job_df_sample was drawn before this cell, so only job_df gets the new
# dtype; the sample keeps whatever dtype the column had when it was taken.
job_df['Company Size'] = job_df['Company Size'].astype(int)




In [None]:
# Columns whose contents are normalised with cleaning() before vectorisation
text_columns = [
    'Experience', 'Qualifications', 'Salary Range', 'location',
    'Country', 'Work Type', 'Preference', 'Contact Person',
    'Contact', 'Job Title', 'Role', 'Job Portal',
    'Job Description', 'Benefits', 'skills', 'Responsibilities', 'Company',
]

# Cast each column to str and overwrite it (in the sample only) with its cleaned form
for column_name in text_columns:
    as_text = job_df_sample[column_name].astype(str)
    job_df_sample[column_name] = as_text.apply(cleaning)

In [None]:
# Join the cleaned text columns of every row, space-separated, into one string
combined_text = job_df_sample[text_columns].apply(' '.join, axis=1)
job_df_sample['clean_text'] = combined_text

# Look at the first few combined strings
print(job_df_sample['clean_text'].head())

# Vectorization

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
# Turn each combined posting text into a TF-IDF vector. scikit-learn's built-in
# English stop-word list is applied in addition to the NLTK stop-word removal
# already done inside cleaning().
tfidf = TfidfVectorizer(stop_words='english')
matrix = tfidf.fit_transform(job_df_sample['clean_text'])
# Pairwise cosine similarity between all sampled postings. Entry [i, j] refers
# to rows i and j of job_df_sample *by position* -- not to rows of job_df.
similarity = cosine_similarity(matrix)

In [None]:
# Inspect the similarity matrix.
similarity

In [None]:
# (position, score) pairs for the first sampled posting, best score first;
# the slice skips the leading entry and keeps the next 19.
sorted(enumerate(similarity[0]), key=lambda pair: pair[1], reverse=True)[1:20]

# Recommendation System

In [None]:
def recommend_jobs(job_title=None, work_type=None, preference=None, top_n=19):
    """Recommend postings similar to the first posting matching ``job_title``.

    Any of ``job_title``, ``work_type`` or ``preference`` left as ``None`` is
    asked for with ``input()``, so the original zero-argument call still works.
    Matching is a case-insensitive *literal* substring test (user text is not
    interpreted as a regular expression, so e.g. "C++" is safe). An empty
    ``work_type`` / ``preference`` disables that filter.

    ``similarity`` is computed from ``job_df_sample``, so every position used
    here is a position in the sample. The returned rows are read from
    ``job_df`` through the sample's index labels, which shows the original
    text instead of the cleaned text stored in ``job_df_sample``.

    Parameters
    ----------
    job_title : str, optional
        Text to look for in 'Job Title'; the first matching posting that also
        passes the filters becomes the query.
    work_type : str, optional
        Text to look for in 'Work Type'.
    preference : str, optional
        Text to look for in 'Preference'.
    top_n : int or None, default 19
        Maximum number of recommendations returned (``None`` = no limit).
        The filters are applied before this cut.

    Returns
    -------
    pandas.DataFrame or None
        The most similar postings that pass the filters (query excluded),
        best first; possibly empty. ``None`` when no posting passes the
        filters or none of them matches ``job_title``.

    Raises
    ------
    ValueError
        If ``similarity`` does not have one row per row of ``job_df_sample``.
    """
    display_cols = ['Experience', 'Qualifications', 'Salary Range', 'location',
                    'Country', 'Work Type', 'Preference', 'Contact Person',
                    'Contact', 'Job Title', 'Role', 'Job Portal',
                    'Job Description', 'Benefits', 'skills', 'Responsibilities', 'Company']

    # Prompt only for what the caller did not supply.
    if job_title is None:
        job_title = input("Enter the job title: ")
    if work_type is None:
        work_type = input("Enter the work type (e.g., Full-Time, Part-Time, etc.): ")
    if preference is None:
        preference = input("Enter your preference (e.g., Remote, Onsite, Hybrid): ")
    job_title = str(job_title).strip()
    work_type = str(work_type).strip()
    preference = str(preference).strip()

    # Original rows in sample order: position i here is row/column i of `similarity`.
    catalog = job_df.loc[job_df_sample.index]
    if len(similarity) != len(catalog):
        raise ValueError(
            f"similarity has {len(similarity)} rows but job_df_sample has "
            f"{len(catalog)}; rebuild the similarity matrix from the current sample."
        )

    def matches(column, text):
        # Boolean array: does `column` contain `text` (literal, case-insensitive)?
        hits = catalog[column].str.contains(text, case=False, na=False, regex=False)
        return hits.to_numpy(dtype=bool)

    # Filter jobs based on work type and preference
    keep = np.ones(len(catalog), dtype=bool)
    if work_type:
        keep &= matches('Work Type', work_type)
    if preference:
        keep &= matches('Preference', preference)

    if not keep.any():
        print("No jobs found matching the given criteria.")
        return None

    # The query is the first filtered posting whose title matches.
    title_hits = np.flatnonzero(keep & matches('Job Title', job_title))
    if title_hits.size == 0:
        print("The specified job title does not exist in the dataset.")
        return None
    query_pos = int(title_hits[0])

    # Rank every posting by similarity to the query (stable sort keeps ties in
    # sample order), drop the query itself explicitly rather than assuming it
    # sorts first, apply the filters, and only then cut to top_n.
    scores = np.asarray(similarity[query_pos], dtype=float)
    order = np.argsort(-scores, kind='stable')
    order = order[(order != query_pos) & keep[order]][:top_n]

    recommended_df = catalog.iloc[order][display_cols]

    # Display recommendations
    if not recommended_df.empty:
        print("\nRecommended Jobs:")
        print(recommended_df.head(10))  # Show top 10 recommendations
    else:
        print("No similar jobs found within the filtered criteria.")

    return recommended_df

# Example Usage
recommendations = recommend_jobs()


In [None]:
def recommend_jobs(job_title=None, work_type=None, preference=None, max_display=50):
    """Filter ``job_df`` by job title, work type and preference and show the matches.

    Any of the three criteria left as ``None`` is asked for with ``input()``,
    so the original zero-argument call still works. A blank criterion is
    skipped. Matching is a case-insensitive *literal* substring test (user text
    is not interpreted as a regular expression, so e.g. "C++" is safe).

    Parameters
    ----------
    job_title : str, optional
        Text to look for in 'Job Title'.
    work_type : str, optional
        Text to look for in 'Work Type'.
    preference : str, optional
        Text to look for in 'Preference'.
    max_display : int or None, default 50
        Maximum number of rows rendered in the styled table (``None`` renders
        all). Only the display is limited; the return value holds every match.

    Returns
    -------
    pandas.DataFrame or None
        All matching postings restricted to the display columns, with a fresh
        index starting at 1; ``None`` when nothing matches.
    """
    # Prompt only for what the caller did not supply.
    if job_title is None:
        job_title = input("Enter the job title (or leave blank to skip): ")
    if work_type is None:
        work_type = input("Enter the work type (e.g., Full-Time, Part-Time, etc.) (or leave blank to skip): ")
    if preference is None:
        preference = input("Enter your preference (e.g., Female,Male,Both) (or leave blank to skip): ")

    criteria = {
        'Job Title': str(job_title).strip(),
        'Work Type': str(work_type).strip(),
        'Preference': str(preference).strip(),
    }

    # Combine all non-blank criteria into one boolean mask, then index once
    # (avoids copying the whole frame up front).
    keep = np.ones(len(job_df), dtype=bool)
    for column, text in criteria.items():
        if text:
            hits = job_df[column].str.contains(text, case=False, na=False, regex=False)
            keep &= hits.to_numpy(dtype=bool)
    filtered_jobs = job_df[keep]

    # Ensure there are matching jobs
    if filtered_jobs.empty:
        print("No jobs found matching the given criteria.")
        return None

    # Display total number of results found
    print(f"\nTotal number of jobs found: {filtered_jobs.shape[0]}")

    display_cols = ['Job Title', 'Work Type', 'Preference', 'Experience', 'Qualifications',
                    'Salary Range', 'location', 'Country', 'Contact Person',
                    'Contact', 'Role', 'Job Portal', 'Job Description',
                    'Benefits', 'skills', 'Responsibilities', 'Company']

    # New frame with a clean index; no in-place mutation of a slice.
    filtered_jobs = filtered_jobs[display_cols].reset_index(drop=True)
    filtered_jobs.index += 1  # Start numbering from 1

    # Rendering a Styler for every match can freeze the notebook on a broad
    # filter, so only the first max_display rows are styled.
    shown = filtered_jobs if max_display is None else filtered_jobs.head(max_display)

    print("\nFiltered Jobs:")
    if len(shown) < len(filtered_jobs):
        print(f"(showing the first {len(shown)} of {len(filtered_jobs)} rows)")
    styled_df = shown.style.set_properties(**{
        'border': '1px solid black',
        'text-align': 'left',
        'background-color': '#f9f9f9'
    }).set_table_styles([{'selector': 'th', 'props': [('border', '1px solid black'),
                                                    ('background-color', '#d9d9d9'),
                                                    ('text-align', 'left')]}])

    # display() is provided by the Jupyter/IPython environment
    display(styled_df)

    return filtered_jobs

# Example Usage
recommendations = recommend_jobs()


In [33]:
import pickle

# NOTE(review): job_df holds every posting, while `similarity` only covers the
# rows of job_df_sample (by position). Anything that loads both files must not
# index similarity.pkl with row positions from df.pkl -- confirm what the
# consumer expects before relying on this pair.

# Context managers make sure both files are flushed and closed.
with open('df.pkl', 'wb') as df_file:
    pickle.dump(job_df, df_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

In [None]:
# Record the pandas version in use; df.pkl above is a pickled DataFrame.
import pandas as pd
print(pd.__version__)
