In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import joblib


In [2]:
df = pd.read_csv('career_path_original.csv')

In [3]:
df.head(50)

Unnamed: 0,skills_required,Skill_level,interests,interest_Levels,Career,assessment_score
0,Python,beginner,Analysis,0.5,Data Scientist,0.5
1,Python,beginner,Analysis,0.7,Data Scientist,0.6
2,Python,beginner,Analysis,0.9,Data Scientist,0.7
3,Machine Learning,intermediate,Artificial Intelligence,0.5,Data Scientist,0.8
4,Machine Learning,intermediate,Artificial Intelligence,0.7,Data Scientist,0.9
5,Machine Learning,intermediate,Artificial Intelligence,0.9,Data Scientist,0.5
6,Statistics,advanced,Predictive Modeling,0.5,Data Scientist,0.6
7,Statistics,advanced,Predictive Modeling,0.7,Data Scientist,0.7
8,Statistics,advanced,Predictive Modeling,0.9,Data Scientist,0.8
9,Java,beginner,Software Development,0.5,Software Engineer,0.5


In [4]:
df.describe(include = "all")

Unnamed: 0,skills_required,Skill_level,interests,interest_Levels,Career,assessment_score
count,3090,3090,3090,3090.0,3090,3090.0
unique,690,3,810,,337,
top,Analysis,beginner,Research,,Archaeologist,
freq,78,1089,21,,27,
mean,,,,0.7,,0.676149
std,,,,0.163326,,0.132672
min,,,,0.5,,0.5
25%,,,,0.5,,0.6
50%,,,,0.7,,0.7
75%,,,,0.9,,0.8


In [5]:
# Finding the unique values for the various columns

## Define a function to strip whitespaces from a single element
def strip_whitespaces(element):
    return str(element).strip()

df['interests'] = df['interests'].apply(strip_whitespaces)

df['interests'].unique()

array(['Analysis', 'Artificial Intelligence', 'Predictive Modeling',
       'Software Development', 'Coding', 'Problem-Solving',
       'Product Development', 'User Feedback', 'User-Centered Design',
       'Usability Testing', 'Interpretation', 'Trends Analysis',
       'Visualization', 'Machine Learning Algorithms', 'Deep Learning',
       'Model Optimization', 'Web Design', 'Frontend Frameworks',
       'User Interface', 'Server-Side Logic', 'base Management',
       'API Development', 'End-to-End Development',
       'Cross-Functional Skills', 'Code Integration', 'Visual Design',
       'Web Layout', 'User Experience', 'Process Improvement',
       'Business Efficiency', 'Security', 'base Optimization',
       'Network Infrastructure', 'Connectivity Solutions',
       'Continuous Integration', 'Deployment Automation',
       'Cloud Architecture', 'Scalable Solutions', 'Sales Strategies',
       'Customer Engagement', 'Market Analysis', 'Marketing Campaigns',
       'Brand Promotion

So we have 337 different classes for career

In [6]:
## Since there are no missing values, we move straight to encoding the categorical columns
### Now we encode the categorical values since the machine cannot understand it
# Label encoding
le_skills_required = LabelEncoder()
le_skill_level = LabelEncoder()
le_int = LabelEncoder()
le_career = LabelEncoder()

df["interests"] = le_int.fit_transform(df["interests"])
df["skills_required"] = le_skills_required.fit_transform(df["skills_required"])
df["Skill_level"] = le_skill_level.fit_transform(df["Skill_level"])
df["Career"] = le_career.fit_transform(df["Career"])

In [7]:
df.head()

Unnamed: 0,skills_required,Skill_level,interests,interest_Levels,Career,assessment_score
0,503,1,33,0.5,84,0.5
1,503,1,33,0.7,84,0.6
2,503,1,33,0.9,84,0.7
3,364,2,46,0.5,84,0.8
4,364,2,46,0.7,84,0.9


There is no need to standardize the numeric data since they are all in between 0 and 1

In [8]:
y = df['Career']
X= df.drop("Career",axis =1)


In [9]:
y

0       84
1       84
2       84
3       84
4       84
        ..
3085    46
3086    46
3087    46
3088    46
3089    46
Name: Career, Length: 3090, dtype: int32

In [10]:
X

Unnamed: 0,skills_required,Skill_level,interests,interest_Levels,assessment_score
0,503,1,33,0.5,0.5
1,503,1,33,0.7,0.6
2,503,1,33,0.9,0.7
3,364,2,46,0.5,0.8
4,364,2,46,0.7,0.9
...,...,...,...,...,...
3085,107,2,201,0.7,0.9
3086,107,2,201,0.9,0.5
3087,526,0,627,0.5,0.6
3088,526,0,627,0.7,0.7


In [11]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)


In [12]:
# Define models to test
models = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'kNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(),
    'Naive Bayes': GaussianNB()
}

In [13]:
# Dictionary to store results
results = {}

# Perform grid search for each model
for model_name, model in models.items():
    print(f"\nTesting {model_name}...")
    grid_search = GridSearchCV(estimator=model, param_grid={}, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Use the best model for predictions
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Evaluate the best model
    accuracy = accuracy_score(y_test, y_pred)
    results[model_name] = {
        'best_model': best_model,
        'accuracy': accuracy
    }
    print(f'Accuracy: {accuracy}')
    print(classification_report(y_test, y_pred))


Testing Decision Tree...




Accuracy: 0.8964401294498382
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         2
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         2
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         2
           9       1.00      1.00      1.00         4
          10       1.00      1.00      1.00         3
          11       1.00      1.00      1.00         2
          14       1.00      1.00      1.00         1
          15       1.00      1.00      1.00         2
          16       1.00      1.00      1.00         2
          17       1.00      1.00      1.00         3
          18       1.00      1.00      1.00         6
          20       1.00      1.00      1.00         1
          22       1.00      1.00      1.00         2
          23       1.00      1.00      1.00         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.10032362459546926
              precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       0.50      0.50      0.50         2
           2       1.00      1.00      1.00         1
           3       0.00      0.00      0.00         2
           6       1.00      1.00      1.00         1
           7       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         4
          10       0.67      0.67      0.67         3
          11       0.00      0.00      0.00         2
          12       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         2
          16       0.00      0.00      0.00         2
          17       0.00      0.00      0.00         3
          18       1.00      0.50      0.67         6
          19       0.00      0.00      0.00         0
          20       0.25      1.00      0.40        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.008090614886731391
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         4
          10       0.00      0.00      0.00         3
          11       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         2
          16       0.00      0.00      0.00         2
          17       0.03      0.67      0.07         3
          18       0.00      0.00      0.00         6
          20       0.00      0.00      0.00         1
          22       0.00      0.00      0.00         2
          23       0.00      0.00      0.00       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.01779935275080906
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         2
           2       1.00      1.00      1.00         1
           3       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         4
          10       0.00      0.00      0.00         3
          11       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         2
          16       0.00      0.00      0.00         2
          17       0.00      0.00      0.00         3
          18       0.00      0.00      0.00         6
          19       0.00      0.00      0.00         0
          20       0.00      0.00      0.00        

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average,

### Using the grid search, I found that the best model for the prediction is the decision tree model which comes with an accuracy of 89


In [14]:
# Now to show the results for the testing
print(results)
#Identify the model with the highest accuracy
best_model_name = max(results, key=lambda k: results[k]['accuracy'])
best_model = results[best_model_name]['best_model']

{'Decision Tree': {'best_model': DecisionTreeClassifier(), 'accuracy': 0.8964401294498382}, 'Random Forest': {'best_model': RandomForestClassifier(), 'accuracy': 0.10032362459546926}, 'SVM': {'best_model': SVC(), 'accuracy': 0.008090614886731391}, 'kNN': {'best_model': KNeighborsClassifier(), 'accuracy': 0.22653721682847897}, 'Logistic Regression': {'best_model': LogisticRegression(), 'accuracy': 0.01779935275080906}, 'Naive Bayes': {'best_model': GaussianNB(), 'accuracy': 0.032362459546925564}}


In [15]:
best_model

# Testing on manual data

In [16]:
# Manually input values for each feature
new_data_manual = pd.DataFrame({
    'skills_required': ['Python'],  
    'Skill_level': ['beginner'],  
    'interests': ['Analysis'], 
    'interest_Levels': [0.3],
    'assessment_score': [0.7]
})

# Label encoding for the manual data input
new_data_manual["skills_required"] = le_skills_required.transform(new_data_manual["skills_required"])
new_data_manual["Skill_level"] = le_skill_level.transform(new_data_manual["Skill_level"])
new_data_manual["interests"] = le_int.transform(new_data_manual["interests"])


In [17]:

X_manual = new_data_manual[['skills_required', 'Skill_level', 'interests', 'interest_Levels','assessment_score']]

# Making predictions using the best model
predictions_encoded_manual = best_model.predict(X_manual)

# Inverse transform the predicted labels to get the original labels
predictions_original_manual = le_career.inverse_transform(predictions_encoded_manual)

# Display the original labels
print(predictions_original_manual)


['Data Scientist']


# Save the model to a file

In [18]:
joblib.dump(best_model, 'career_model.joblib')

['career_model.joblib']

# Loading and using the model

In [19]:
# Load the saved model from the file
loaded_model = joblib.load('career_model.joblib')


In [20]:
# Manually input values for each feature
new_data_manual = pd.DataFrame({
    'skills_required': ['Python'],  
    'Skill_level': ['beginner'],  
    'interests': ['Analysis'], 
    'interest_Levels': [0.3],
    'assessment_score': [0.7]
})

# Label encoding for the manual data input
new_data_manual["skills_required"] = le_skills_required.transform(new_data_manual["skills_required"])
new_data_manual["Skill_level"] = le_skill_level.transform(new_data_manual["Skill_level"])
new_data_manual["interests"] = le_int.transform(new_data_manual["interests"])
X_manual = new_data_manual[['skills_required', 'Skill_level', 'interests', 'interest_Levels','assessment_score']]

# Making predictions using the career model
predictions_encoded_manual = loaded_model.predict(X_manual)

# Inverse transform the predicted labels to get the original labels
predictions_original_manual = le_career.inverse_transform(predictions_encoded_manual)

# Display the original labels
print(predictions_original_manual)


['Data Scientist']


In [22]:
%pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
