In [1]:
import pandas as pd 

In [2]:
# Load the jobs dataset from the project's data directory into a DataFrame.
df = pd.read_csv('../data/jobs_dataset_with_features.csv')


In [3]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,Role,Features
0,Social Media Manager,5 to 15 Years Digital Marketing Specialist M.T...
1,Frontend Web Developer,"2 to 12 Years Web Developer BCA HTML, CSS, Jav..."


In [4]:
# Inspect the column labels of the loaded frame.
df.columns

Index(['Role', 'Features'], dtype='object')

In [5]:
# Keep only the roles that occur at least `min_count` times
min_count = 4000
role_counts = df['Role'].value_counts()
dropped_classes = role_counts.loc[role_counts.lt(min_count)].index
filtered_df = df.loc[~df['Role'].isin(dropped_classes)].reset_index(drop=True)

# Show the per-role counts that remain after filtering
filtered_df['Role'].value_counts()

Role
Interaction Designer          20580
Network Administrator         17470
User Interface Designer       14036
Social Media Manager          13945
User Experience Designer      13935
                              ...  
Benefits Coordinator           6839
Research Analyst               6830
Administrative Coordinator     6803
IT Support Specialist          6799
UI/UX Designer                 6743
Name: count, Length: 61, dtype: int64

In [6]:
# NOTE: this expression is not the last line of the cell, so its value (the
# number of distinct roles left after filtering) is computed and then discarded.
len(filtered_df['Role'].value_counts())

# Down-sample to keep training fast. The fixed seed makes the sample, and hence
# every downstream split and metric, reproducible across kernel restarts.
# Capping n avoids the ValueError pandas raises when asked for more rows than
# exist (sampling is without replacement).
df = filtered_df.sample(n=min(10000, len(filtered_df)), random_state=42)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Separate the input text (X) from the role labels (y)
X, y = df['Features'], df['Role']

# Hold out 20% of the rows for evaluation; the seed fixes the partition
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vocabulary on the training text only, then reuse it for the test text
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [8]:
# RandomForestClassifier
# random_state pins the forest's internal randomness so the fitted model and
# the reported accuracy are the same on every run.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_tfidf, y_train)

# Predictions
y_pred = rf_classifier.predict(X_test_tfidf)

# Accuracy
# NOTE(review): a perfect score on held-out data is suspicious -- presumably the
# dataset contains duplicated Features rows that land in both the train and the
# test split; verify (e.g. with df.duplicated()) before trusting this number.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [9]:
import re
def cleanResume(txt):
    """Normalise free-form resume text before it is vectorised.

    Steps, in order:
      1. drop URLs (anything starting with ``http``),
      2. drop the stand-alone tokens ``RT`` and ``cc``,
      3. drop hashtags and @-mentions,
      4. replace ASCII punctuation with spaces,
      5. replace non-ASCII characters with spaces,
      6. collapse runs of whitespace and trim both ends.

    Args:
        txt: Raw resume text.

    Returns:
        The cleaned text as a single-spaced string with no leading or
        trailing whitespace.
    """
    # Raw strings keep `\S` / `\s` from being read as (invalid) string escapes.
    # No trailing `\s` is required, so a URL at the very end of the text is
    # removed as well.
    cleanText = re.sub(r'http\S+', ' ', txt)
    # Word boundaries stop this from eating the "cc" inside ordinary words
    # such as "accuracy" or "success".
    cleanText = re.sub(r'\b(?:RT|cc)\b', ' ', cleanText)
    cleanText = re.sub(r'#\S+', ' ', cleanText)
    cleanText = re.sub(r'@\S+', ' ', cleanText)
    cleanText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText)
    cleanText = re.sub(r'[^\x00-\x7f]', ' ', cleanText)
    cleanText = re.sub(r'\s+', ' ', cleanText).strip()
    return cleanText


# Prediction and Category Name
def job_recommendation(resume_text):
    """Predict a job role for a piece of resume text.

    The text is cleaned with ``cleanResume``, vectorised with the fitted
    ``tfidf_vectorizer`` and classified by the fitted ``rf_classifier``.

    Args:
        resume_text: Raw resume text.

    Returns:
        The label predicted by ``rf_classifier`` for the given text.
    """
    cleaned = cleanResume(resume_text)
    # The vectorizer works on a collection of documents, so wrap the single
    # resume in a list and unwrap the single prediction afterwards.
    features = tfidf_vectorizer.transform([cleaned])
    predictions = rf_classifier.predict(features)
    return predictions[0]

In [12]:
# Example Usage
resume_file = """
Data Scientist Summary
Highly analytical and detail-oriented data scientist with 5+ years of experience in leveraging data to drive business decisions. Proficient in machine learning, statistical analysis, and data visualization. Skilled in Python, R, SQL, and big data technologies like Hadoop and Spark. Passionate about solving complex problems and delivering actionable insights through data-driven solutions.

Highlights
- Developed predictive models that improved customer retention by 20% for an e-commerce company.
- Built and deployed machine learning pipelines using Python and Scikit-Learn, reducing processing time by 30%.
- Expertise in data visualization tools like Tableau and Power BI, creating dashboards that informed executive decision-making.
- Strong background in A/B testing and experimental design, leading to a 15% increase in conversion rates.
- Published 2 research papers on natural language processing (NLP) and deep learning applications.

Experience
Company Name, City, State
Senior Data Scientist | Jan 2020 – Present
- Led a team of 4 data scientists in developing machine learning models for fraud detection, reducing fraudulent transactions by 25%.
- Designed and implemented a recommendation system that increased cross-selling revenue by 18%.
- Collaborated with engineering teams to integrate machine learning models into production systems.
- Conducted workshops to upskill team members in advanced machine learning techniques.
- Presented data-driven insights to stakeholders, influencing key business strategies.

Company Name, City, State
Data Scientist | Jun 2017 – Dec 2019
- Analyzed large datasets to identify trends and patterns, providing actionable insights for marketing campaigns.
- Built and maintained ETL pipelines using Apache Spark, improving data processing efficiency by 40%.
- Developed NLP models for sentiment analysis, achieving 90% accuracy in customer feedback classification.
- Automated reporting processes using Python scripts, saving 10 hours per week for the analytics team.
- Partnered with product managers to define key performance indicators (KPIs) and track business metrics.

Company Name, City, State
Data Analyst | Sep 2015 – May 2017
- Performed exploratory data analysis (EDA) to uncover insights from customer behavior data.
- Created interactive dashboards in Tableau to visualize sales performance and market trends.
- Assisted in the development of predictive models for customer segmentation.
- Wrote SQL queries to extract and manipulate data from relational databases.
- Supported data-driven decision-making by providing ad-hoc reports and analyses.

Education
Master of Science in Data Science
University of Data Science, City, State | Graduated May 2015
- Thesis: "Applications of Deep Learning in Natural Language Processing"
- Relevant coursework: Machine Learning, Big Data Analytics, Statistical Modeling

Bachelor of Science in Computer Science
State University, City, State | Graduated May 2013
- Graduated with honors (GPA: 3.9/4.0)
- Relevant coursework: Data Structures, Algorithms, Database Systems

Skills
- Programming: Python, R, SQL, Java
- Machine Learning: Scikit-Learn, TensorFlow, Keras, PyTorch
- Big Data: Hadoop, Spark, Hive
- Data Visualization: Tableau, Power BI, Matplotlib, Seaborn
- Statistical Analysis: Hypothesis Testing, Regression, Bayesian Methods
- Soft Skills: Problem-Solving, Communication, Team Collaboration

Certifications
- Certified Data Scientist (CDS) – Data Science Council of America (DASCA)
- Google Professional Data Engineer
- AWS Certified Machine Learning – Specialty

Professional Affiliations
- Member, Association for Computing Machinery (ACM)
- Member, Data Science Association (DSA)
- Member, Institute of Electrical and Electronics Engineers (IEEE)

Publications
- "Deep Learning for Sentiment Analysis in Social Media," Journal of Data Science, 2019.
- "Applications of NLP in Customer Feedback Analysis," International Conference on Machine Learning, 2020.

Languages
- English (Fluent)
- French (Conversational)
"""
# Run the sample resume through the cleaning + TF-IDF + classifier pipeline
# and print the predicted role.
predicted_category = job_recommendation(resume_file)
print("Predicted Category:", predicted_category)

Predicted Category: Business Intelligence Analyst


In [14]:
import pickle
# Persist the fitted classifier and vectorizer. The context managers ensure
# each file is flushed and closed even if pickling raises, instead of leaving
# the handle open until garbage collection.
with open('../models/rf_classifier_job_recommendation.pkl', 'wb') as model_file:
    pickle.dump(rf_classifier, model_file)
with open('../models/tfidf_vectorizer_job_recommendation.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)