In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import pickle

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

import datetime

In [3]:
df = pd.read_csv("../dataset/numerified_resume_data.csv")
df.head(2)

Unnamed: 0,num_skills,required_skill_count,skills_matching,years_of_experience,experience_gap,education_duration_years,degree_level,num_certifications,num_languages,num_companies,num_responsibilities,job_responsibility_count,education_match,age_requirement_met,matched_score
0,21,0,0,7,6,0,1,0,0,1,1,1,1,0,0.85
1,10,0,0,7,2,3,2,0,0,1,1,1,1,0,0.75


In [4]:
X = df.drop(["matched_score"], axis=1)
X

Unnamed: 0,num_skills,required_skill_count,skills_matching,years_of_experience,experience_gap,education_duration_years,degree_level,num_certifications,num_languages,num_companies,num_responsibilities,job_responsibility_count,education_match,age_requirement_met
0,21,0,0,7,6,0,1,0,0,1,1,1,1,0
1,10,0,0,7,2,3,2,0,0,1,1,1,1,0
2,14,1,0,8,5,0,1,0,0,1,1,1,1,0
3,36,1,0,15,14,0,0,0,0,5,1,1,0,1
4,32,1,0,10,6,0,1,0,0,5,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9455,8,1,0,6,1,0,1,0,0,1,1,1,1,0
9456,16,0,0,6,3,2,0,0,0,1,1,1,1,1
9457,10,1,0,7,6,0,0,0,0,1,1,1,1,0
9458,12,1,0,7,2,0,1,0,0,1,1,1,1,1


In [5]:
y = df["matched_score"]
y

0       0.850000
1       0.750000
2       0.416667
3       0.760000
4       0.650000
          ...   
9455    0.683333
9456    0.650000
9457    0.650000
9458    0.650000
9459    0.650000
Name: matched_score, Length: 9460, dtype: float64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.33, 
    random_state=77
)

In [7]:
X_train.shape

(6338, 14)

In [8]:
X_train.head(1)

Unnamed: 0,num_skills,required_skill_count,skills_matching,years_of_experience,experience_gap,education_duration_years,degree_level,num_certifications,num_languages,num_companies,num_responsibilities,job_responsibility_count,education_match,age_requirement_met
2729,47,1,0,25,20,0,2,0,0,7,1,1,0,1


In [9]:
X_test.shape

(3122, 14)

In [10]:
X_test.head(1)

Unnamed: 0,num_skills,required_skill_count,skills_matching,years_of_experience,experience_gap,education_duration_years,degree_level,num_certifications,num_languages,num_companies,num_responsibilities,job_responsibility_count,education_match,age_requirement_met
461,35,1,0,10,9,0,2,0,0,3,1,1,1,1


In [11]:
y_train.shape

(6338,)

In [12]:
y_train

2729    0.783333
8224    0.693333
1131    0.350000
7390    0.550000
6942    0.793333
          ...   
4832    0.650000
9119    0.623333
7832    0.683333
2283    0.683333
8799    0.793333
Name: matched_score, Length: 6338, dtype: float64

In [13]:
y_test.shape

(3122,)

In [14]:
y_test

461     0.476667
3407    0.626667
921     0.750000
9317    0.600000
4554    0.783333
          ...   
2653    0.380000
5291    0.383333
3967    0.850000
6056    0.550000
8498    0.826667
Name: matched_score, Length: 3122, dtype: float64

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
with open('../pickled_data/scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [17]:
model = Sequential([
    # First hidden layer (connected to input)
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    
    # Second hidden layer
    Dense(32, activation='relu'),
    
    # Output layer
    Dense(1, activation='sigmoid')
])




In [18]:
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae']
)




In [19]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                960       
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 3073 (12.00 KB)
Trainable params: 3073 (12.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
# Create log directory with timestamp
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Initialize TensorBoard callback
tensorboard_callback = TensorBoard(
    log_dir=log_dir,
    histogram_freq=1
)

In [21]:
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

In [22]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    callbacks=[tensorboard_callback, early_stopping_callback]
)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100


In [23]:
# Load TensorBoard extension
%load_ext tensorboard

# Launch TensorBoard
%tensorboard --logdir logs/fit

In [24]:
y_pred = model.predict(X_test_scaled).flatten()

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("MAE :", mae)
print("MSE :", mse)
print("RMSE:", rmse)

MAE : 0.1629160425450113
MSE : 0.038179355836851454
RMSE: 0.1953953833560339


In [25]:
X_test_scaled[0]

array([ 0.73693949,  0.46454993, -0.01256198, -0.02855184,  0.30403827,
       -0.07268793,  1.53601404, -0.16628665, -0.26441354,  0.1481508 ,
        0.        ,  0.        ,  0.46203771,  0.85954657])

In [26]:
y_test.iloc[0]

0.47666666

In [27]:
model.predict(
    X_test_scaled[0].reshape(1,-1)
)[0][0]



0.77271175

In [28]:
def actual_to_predict(model, index):
    x = X_test_scaled[index]
    y = y_test.iloc[index]
    y_pred = model.predict(X_test_scaled[index].reshape(1,-1))[0][0]
    print(f"For x: {x}")
    print(f"Actual: {y}")
    print(f"Predicted: {y_pred}")

In [29]:
actual_to_predict(model, 1)

For x: [-0.76367702  0.46454993 -0.01256198  0.3434144   0.63304537  0.03823853
 -1.31346846 -0.16628665 -0.26441354 -1.37864556  0.          0.
  0.46203771 -1.1634041 ]
Actual: 0.62666667
Predicted: 0.4527641832828522


In [30]:
y_test

461     0.476667
3407    0.626667
921     0.750000
9317    0.600000
4554    0.783333
          ...   
2653    0.380000
5291    0.383333
3967    0.850000
6056    0.550000
8498    0.826667
Name: matched_score, Length: 3122, dtype: float64

In [31]:
y_pred

array([0.77271175, 0.45276418, 0.51575834, ..., 0.531865  , 0.8364052 ,
       0.6090272 ], dtype=float32)

In [32]:
y_test - y_pred

461    -0.296045
3407    0.173902
921     0.234242
9317    0.024643
4554   -0.084567
          ...   
2653   -0.186953
5291   -0.190513
3967    0.318135
6056   -0.286405
8498    0.217639
Name: matched_score, Length: 3122, dtype: float64

In [33]:
error_df = pd.DataFrame({
    "y_test": y_test,
    "y_pred": y_pred,
    "abs_error": np.abs(y_test - y_pred),
    "percent_error": np.abs(y_test - y_pred) * 100
})
error_df

Unnamed: 0,y_test,y_pred,abs_error,percent_error
461,0.476667,0.772712,0.296045,29.604509
3407,0.626667,0.452764,0.173902,17.390249
921,0.750000,0.515758,0.234242,23.424166
9317,0.600000,0.575357,0.024643,2.464322
4554,0.783333,0.867900,0.084567,8.456653
...,...,...,...,...
2653,0.380000,0.566953,0.186953,18.695336
5291,0.383333,0.573846,0.190513,19.051289
3967,0.850000,0.531865,0.318135,31.813500
6056,0.550000,0.836405,0.286405,28.640522


In [34]:
model.save('../pickled_data/model.h5')

  saving_api.save_model(
