# Career Recommender

## Importing Libraries

In [1]:
# Getting Numpy and Pandas to load and view data 
import pandas as pd 
import numpy as np

# For splitting into training and testing 
from sklearn.model_selection import train_test_split

# Getting Decision tree classifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV

from sklearn.model_selection import GridSearchCV, cross_val_score

from sklearn.preprocessing import LabelEncoder

from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler

import pickle as pk

## Loading data 

In [3]:
oip_df = pd.read_csv('/Users/mukulhooda/Desktop/SIH/model_and_data/oip_transformed.csv')
occupations_df=pd.read_excel('/Users/mukulhooda/Desktop/SIH/model_and_data/Occupation Data.xlsx')

In [4]:
occupations_df

Unnamed: 0,O*NET-SOC Code,Title,Description
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
3,11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."
...,...,...,...
1011,55-3014.00,Artillery and Missile Crew Members,"Target, fire, and maintain weapons used to des..."
1012,55-3015.00,Command and Control Center Specialists,"Operate and monitor communications, detection,..."
1013,55-3016.00,Infantry,Operate weapons and equipment in ground combat...
1014,55-3018.00,Special Forces,"Implement unconventional operations by air, la..."


In [5]:
oip_df

Unnamed: 0,O*NET-SOC Code,Artistic,Conventional,Enterprising,Investigative,Realistic,Social
0,11-1011.00,2.67,5.33,7.00,2.00,1.33,3.67
1,11-1011.03,2.67,4.33,7.00,4.33,1.00,2.33
2,11-1021.00,1.00,3.67,7.00,1.33,1.33,3.33
3,11-1031.00,3.67,3.00,7.00,3.67,1.00,4.67
4,11-2011.00,5.33,4.67,7.00,2.00,1.67,2.33
...,...,...,...,...,...,...,...
869,53-7071.00,1.00,3.00,2.00,1.33,7.00,1.00
870,53-7072.00,1.00,4.67,2.00,4.00,7.00,1.33
871,53-7073.00,1.00,5.00,1.33,3.67,7.00,1.00
872,53-7081.00,1.00,3.67,2.33,1.33,7.00,1.00


## Performing data augmentation on our dataset to diversify our data

In [6]:
data = oip_df

# Number of additional entries to generate
num_entries_to_generate = 7

# Initialising new DataFrame
generated_data = pd.DataFrame(columns=data.columns)

# Generate new entries around the mean of existing data
for index, row in data.iterrows():
    for _ in range(num_entries_to_generate):
        new_row = row.copy()
        new_row['Artistic'] = np.round(np.abs(np.random.normal(row['Artistic'], 0.2)), 2)
        new_row['Conventional'] = np.round(np.abs(np.random.normal(row['Conventional'], 0.2)), 2)
        new_row['Enterprising'] = np.round(np.abs(np.random.normal(row['Enterprising'], 0.2)), 2)
        new_row['Investigative'] = np.round(np.abs(np.random.normal(row['Investigative'], 0.2)), 2)
        new_row['Realistic'] = np.round(np.abs(np.random.normal(row['Realistic'], 0.2)), 2)
        new_row['Social'] = np.round(np.abs(np.random.normal(row['Social'], 0.2)), 2)
        generated_data = pd.concat([generated_data, new_row.to_frame().T], ignore_index=True)

# Display the generated data
generated_data

Unnamed: 0,O*NET-SOC Code,Artistic,Conventional,Enterprising,Investigative,Realistic,Social
0,11-1011.00,2.6,5.34,6.82,1.73,1.61,3.53
1,11-1011.00,2.45,5.18,7.42,1.8,1.53,3.68
2,11-1011.00,2.39,5.47,7.14,2.31,1.36,3.91
3,11-1011.00,2.92,5.29,7.21,2.06,1.39,3.57
4,11-1011.00,2.67,5.08,7.15,1.96,1.21,3.67
...,...,...,...,...,...,...,...
6113,53-7121.00,0.45,4.6,1.65,2.81,6.99,1.55
6114,53-7121.00,1.23,4.99,2.06,2.81,6.8,0.91
6115,53-7121.00,1.05,5.27,1.94,2.9,7.07,1.49
6116,53-7121.00,0.98,5.07,2.34,2.63,6.82,1.27


In [7]:
display(data)

Unnamed: 0,O*NET-SOC Code,Artistic,Conventional,Enterprising,Investigative,Realistic,Social
0,11-1011.00,2.67,5.33,7.00,2.00,1.33,3.67
1,11-1011.03,2.67,4.33,7.00,4.33,1.00,2.33
2,11-1021.00,1.00,3.67,7.00,1.33,1.33,3.33
3,11-1031.00,3.67,3.00,7.00,3.67,1.00,4.67
4,11-2011.00,5.33,4.67,7.00,2.00,1.67,2.33
...,...,...,...,...,...,...,...
869,53-7071.00,1.00,3.00,2.00,1.33,7.00,1.00
870,53-7072.00,1.00,4.67,2.00,4.00,7.00,1.33
871,53-7073.00,1.00,5.00,1.33,3.67,7.00,1.00
872,53-7081.00,1.00,3.67,2.33,1.33,7.00,1.00


In [8]:
result = pd.concat([generated_data, data], axis=0, join='inner')
display(result)

Unnamed: 0,O*NET-SOC Code,Artistic,Conventional,Enterprising,Investigative,Realistic,Social
0,11-1011.00,2.6,5.34,6.82,1.73,1.61,3.53
1,11-1011.00,2.45,5.18,7.42,1.8,1.53,3.68
2,11-1011.00,2.39,5.47,7.14,2.31,1.36,3.91
3,11-1011.00,2.92,5.29,7.21,2.06,1.39,3.57
4,11-1011.00,2.67,5.08,7.15,1.96,1.21,3.67
...,...,...,...,...,...,...,...
869,53-7071.00,1.0,3.0,2.0,1.33,7.0,1.0
870,53-7072.00,1.0,4.67,2.0,4.0,7.0,1.33
871,53-7073.00,1.0,5.0,1.33,3.67,7.0,1.0
872,53-7081.00,1.0,3.67,2.33,1.33,7.0,1.0


# Building Model

In [9]:
# KNN 

career = result

y = career["O*NET-SOC Code"]
x = career.drop('O*NET-SOC Code', axis=1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

knn = KNeighborsClassifier(n_neighbors=7)
scores = {}
knn.fit(x, y)

y_pred = knn.predict(x_train)
# print('y_pred', y_pred)

score=accuracy_score(y_train, y_pred)
print('Accuracy=', score)


Accuracy= 0.9006483600305111


In [10]:
# For a single output , that is the top neighbour 

a=pd.DataFrame([6.6,5,7,5.6,8,9]).transpose()

b=knn.predict(a)
b[0]
# occupations_df[occupations_df['O*NET-SOC Code']=='29-1129.02']
occupations_df[occupations_df['O*NET-SOC Code']==b[0]]



Unnamed: 0,O*NET-SOC Code,Title,Description
359,25-9021.00,Farm and Home Management Educators,Instruct and advise individuals and families e...


In [11]:
y = career["O*NET-SOC Code"]
x = career.drop('O*NET-SOC Code', axis=1)


# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# Scale the features
# scaler = StandardScaler()
# x_train_scaled = scaler.fit_transform(x_train)
# x_test_scaled = scaler.transform(x_test)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_neighbors': [5, 7, 9],  # K
    'weights': ['uniform', 'distance'],  # distance weighting
    'metric': ['euclidean', 'manhattan']  # distance metrics
}

grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=7)

# grid_search.fit(x_train_scaled, y_train)
grid_search.fit(x_train, y_train)


best_knn = grid_search.best_estimator_

best_score = grid_search.best_score_

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", best_score)

# Fit the best model on the training data
# best_knn.fit(x_train_scaled, y_train)
best_knn.fit(x_train, y_train)


# Predict on the test data
y_pred = best_knn.predict(x_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", accuracy)

# Additional evaluation metrics
# print(classification_report(y_test, y_pred))

k=best_knn.get_params()['n_neighbors']



Best Parameters: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
Best Cross-Validation Accuracy: 0.8478232564053659
Test Set Accuracy: 0.8249427917620137


In [12]:
# Find the top k nearest neighbors for each data point in the training set
distances, indices = best_knn.kneighbors(x_test)

# Create a dictionary to store the top k predictions for each sample
top_k_predictions = {}

# Get the top k predictions for each data point
for i, neighbors in enumerate(indices):
    neighbor_predictions = y_train.iloc[neighbors].tolist()
    unique_predictions = list(set(neighbor_predictions))  # Get unique predictions
    if len(unique_predictions) < k:
        # If there are fewer than k unique predictions, fill the rest with duplicates
        unique_predictions.extend([unique_predictions[0]] * (k - len(unique_predictions)))
    top_k_predictions[i] = unique_predictions[:k]

# Print the top 5 predictions for the first data point as an example
print("Top 5 predictions for the first data point:", top_k_predictions[0])

Top 5 predictions for the first data point: ['45-2093.00', '53-7063.00', '47-2141.00', '51-3022.00', '45-2093.00']


In [13]:
# List of values you want to compare
values_to_compare = top_k_predictions[0]
df=occupations_df
# Extract rows from the DataFrame where any column matches the values_to_compare
filtered_df = df[df.isin(values_to_compare).any(axis=1)]

display(filtered_df)

Unnamed: 0,O*NET-SOC Code,Title,Description
701,45-2093.00,"Farmworkers, Farm, Ranch, and Aquacultural Ani...","Attend to live farm, ranch, open range or aqua..."
731,47-2141.00,"Painters, Construction and Maintenance","Paint walls, equipment, buildings, bridges, an..."
839,51-3022.00,"Meat, Poultry, and Fish Cutters and Trimmers",Use hands or hand tools to perform routine cut...
988,53-7063.00,Machine Feeders and Offbearers,Feed materials into or remove materials from m...


In [13]:
# Dumping our model in a pkl(pikel) file for future use

with open('KNN.pkl','wb') as file:
    pk.dump(best_knn,file)