In [1]:
import pandas as pd
import re

# Load your CSV file
file_path = 'Indeed_10k.csv'
data = pd.read_csv(file_path)

# Check for missing values
print("Missing values before cleaning:")
print(data.isnull().sum())

Missing values before cleaning:
Name          0
Company       0
City          0
Ratings    3776
Summary       0
Date          0
dtype: int64


In [2]:
# Fill missing Ratings with the mean value, or you can drop them if not needed
data['Ratings'] = data['Ratings'].fillna(data['Ratings'].mean())

In [3]:
# List of tech skills to search for in job summaries
skills_list = ['Python', 'Java', 'C#', 'C++', 'SQL', 'JavaScript', 'HTML', 'CSS', 
               'Ruby', 'PHP', 'Go', 'Swift', 'Kotlin', 'R', 'Matlab', 'Perl', 
               'Scala', 'Rust', 'Django', 'Flask', 'Node.js', 'React', 'Angular', 
               'Vue.js', 'Docker', 'Kubernetes', 'AWS', 'Azure', 'GCP']

# Function to extract skills from the 'Summary' column
def extract_skills(summary):
    found_skills = [skill for skill in skills_list if re.search(rf'\b{skill}\b', summary, re.IGNORECASE)]
    return ', '.join(found_skills) if found_skills else None

# Apply the function to create a new 'Skills' column
data['Skills'] = data['Summary'].apply(extract_skills)

# Drop rows where 'Skills' is missing, or handle accordingly
data = data.dropna(subset=['Skills'])

# Show cleaned dataset
print(data[['Name', 'Company', 'City', 'Skills']].head())

# Save the cleaned data to a new CSV
data.to_csv('cleaned_Indeed_10k.csv', index=False)

                                                Name              Company  \
0                      Entry level Software Engineer  CapitalPlanHoldings   
1                           Junior Software Engineer            FormSwift   
4                         Software Engineer (Python)                Imgix   
6  AppD Software Engineer Bachelor's (Intern) Uni...        Cisco Systems   
7            Software Engineer - University Graduate               PayPal   

                                         City              Skills  
0                           San Francisco, CA           Java, C++  
1                           San Francisco, CA  Python, JavaScript  
4                           San Francisco, CA              Python  
6                           San Francisco, CA                Java  
7  San Francisco, CA 94107 (South Beach area)   Python, Java, C++  


In [4]:
# Load the cleaned dataset
file_path = 'cleaned_Indeed_10k.csv'
data = pd.read_csv(file_path)

data.head()

Unnamed: 0,Name,Company,City,Ratings,Summary,Date,Skills
0,Entry level Software Engineer,CapitalPlanHoldings,"San Francisco, CA",3.918493,"Programming experience using C#, C++, or Java ...",2 days ago,"Java, C++"
1,Junior Software Engineer,FormSwift,"San Francisco, CA",3.918493,Building new product features across the back ...,30+ days ago,"Python, JavaScript"
2,Software Engineer (Python),Imgix,"San Francisco, CA",3.918493,Comfortable developing in Python (or similar)....,5 days ago,Python
3,AppD Software Engineer Bachelor's (Intern) Uni...,Cisco Systems,"San Francisco, CA",4.1,"Java server side web frameworks (SpringMVC, St...",4 days ago,Java
4,Software Engineer - University Graduate,PayPal,"San Francisco, CA 94107 (South Beach area)",3.9,Solid experience working with or familiarity a...,30+ days ago,"Python, Java, C++"


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

# Features (Skills) and Target (Job Title)
X = data['Skills']
y = data['Name']  # Job title as the label

# Convert the Skills column into numerical data using TF-IDF
vectorizer = TfidfVectorizer()
X_transformed = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

In [6]:
# Model: Multinomial Naive Bayes (can also use Logistic Regression)
model = MultinomialNB()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 61.73%
Classification Report:
                                                                 precision    recall  f1-score   support

                                 AI / Machine Learning Engineer       0.00      0.00      0.00         1
                                  API Backend Software Engineer       0.00      0.00      0.00         1
                                               Android Engineer       0.13      1.00      0.23        21
                                           AppD DevOps Engineer       0.00      0.00      0.00        32
                                              Backend Developer       0.67      0.41      0.51        34
                                               Backend Engineer       0.00      0.00      0.00         3
                                      Backend Software Engineer       0.00      0.00      0.00         2
                                    CUSTOMER SUCCESS SPECIALIST       0.00      0.00      0.00         1
              

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
lg_model = LogisticRegression(max_iter=1000)
lg_model.fit(X_train, y_train)

# Predict on the test set
y_pred = lg_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 68.88%


In [8]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Convert the Skills column into numerical data using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer


# Initialize CatBoostClassifier
catboost_model = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, verbose=False)

# Train the model
catboost_model.fit(X_train, y_train)

# Predictions
y_pred = catboost_model.predict(X_test)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 70.92%
Classification Report:
                                                                 precision    recall  f1-score   support

                                 AI / Machine Learning Engineer       0.00      0.00      0.00         1
                                  API Backend Software Engineer       0.00      0.00      0.00         1
                                               Android Engineer       0.13      1.00      0.23        21
                                           AppD DevOps Engineer       0.00      0.00      0.00        32
                                  AppD Senior Software Engineer       0.00      0.00      0.00         0
                                              Backend Developer       0.83      1.00      0.91        34
                                               Backend Engineer       0.00      0.00      0.00         3
                                      Backend Software Engineer       0.00      0.00      0.00         2
              

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# Save the CatBoost model
catboost_model.save_model('catboost_model.cbm')

In [10]:
import pickle

# Save the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)