In [375]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import pickle

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

import datetime

In [376]:
df = pd.read_csv("../dataset/numerified_resume_data.csv")
df.head(2)

Unnamed: 0,num_skills,required_skill_count,skills_matching,years_of_experience,experience_gap,education_duration_years,degree_level,num_certifications,num_languages,num_companies,num_responsibilities,job_responsibility_count,education_match,age_requirement_met,matched_score
0,21,0,0,7,6,0,1,0,0,1,1,1,1,0,0.85
1,10,0,0,7,2,3,2,0,0,1,1,1,1,0,0.75


In [377]:
X = df.drop(["matched_score"], axis=1)
X

Unnamed: 0,num_skills,required_skill_count,skills_matching,years_of_experience,experience_gap,education_duration_years,degree_level,num_certifications,num_languages,num_companies,num_responsibilities,job_responsibility_count,education_match,age_requirement_met
0,21,0,0,7,6,0,1,0,0,1,1,1,1,0
1,10,0,0,7,2,3,2,0,0,1,1,1,1,0
2,14,1,0,8,5,0,1,0,0,1,1,1,1,0
3,36,1,0,15,14,0,0,0,0,5,1,1,0,1
4,32,1,0,10,6,0,1,0,0,5,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9455,8,1,0,6,1,0,1,0,0,1,1,1,1,0
9456,16,0,0,6,3,2,0,0,0,1,1,1,1,1
9457,10,1,0,7,6,0,0,0,0,1,1,1,1,0
9458,12,1,0,7,2,0,1,0,0,1,1,1,1,1


In [378]:
y = df["matched_score"]
y

0       0.850000
1       0.750000
2       0.416667
3       0.760000
4       0.650000
          ...   
9455    0.683333
9456    0.650000
9457    0.650000
9458    0.650000
9459    0.650000
Name: matched_score, Length: 9460, dtype: float64

In [379]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.33, 
    random_state=77
)

In [380]:
X_train.shape

(6338, 14)

In [381]:
X_train.head(1)

Unnamed: 0,num_skills,required_skill_count,skills_matching,years_of_experience,experience_gap,education_duration_years,degree_level,num_certifications,num_languages,num_companies,num_responsibilities,job_responsibility_count,education_match,age_requirement_met
2729,47,1,0,25,20,0,2,0,0,7,1,1,0,1


In [382]:
X_test.shape

(3122, 14)

In [383]:
X_test.head(1)

Unnamed: 0,num_skills,required_skill_count,skills_matching,years_of_experience,experience_gap,education_duration_years,degree_level,num_certifications,num_languages,num_companies,num_responsibilities,job_responsibility_count,education_match,age_requirement_met
461,35,1,0,10,9,0,2,0,0,3,1,1,1,1


In [384]:
y_train.shape

(6338,)

In [385]:
y_train

2729    0.783333
8224    0.693333
1131    0.350000
7390    0.550000
6942    0.793333
          ...   
4832    0.650000
9119    0.623333
7832    0.683333
2283    0.683333
8799    0.793333
Name: matched_score, Length: 6338, dtype: float64

In [386]:
y_test.shape

(3122,)

In [387]:
y_test

461     0.476667
3407    0.626667
921     0.750000
9317    0.600000
4554    0.783333
          ...   
2653    0.380000
5291    0.383333
3967    0.850000
6056    0.550000
8498    0.826667
Name: matched_score, Length: 3122, dtype: float64

In [388]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
with open('../pickled_data/scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [390]:
model = Sequential([
    # First hidden layer (connected to input)
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    
    # Second hidden layer
    Dense(32, activation='relu'),
    
    # Output layer
    Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [391]:
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae']
)

In [392]:
model.summary()

In [393]:
# Create log directory with timestamp
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Initialize TensorBoard callback
tensorboard_callback = TensorBoard(
    log_dir=log_dir,
    histogram_freq=1
)

In [394]:
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

In [395]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    callbacks=[tensorboard_callback, early_stopping_callback]
)

Epoch 1/100
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0360 - mae: 0.1490 - val_loss: 0.0266 - val_mae: 0.1282
Epoch 2/100
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0276 - mae: 0.1296 - val_loss: 0.0254 - val_mae: 0.1263
Epoch 3/100
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0269 - mae: 0.1276 - val_loss: 0.0395 - val_mae: 0.1661
Epoch 4/100
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0273 - mae: 0.1287 - val_loss: 0.0244 - val_mae: 0.1231
Epoch 5/100
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0252 - mae: 0.1235 - val_loss: 0.0255 - val_mae: 0.1239
Epoch 6/100
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0261 - mae: 0.1257 - val_loss: 0.0260 - val_mae: 0.1215
Epoch 7/100
[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/

In [396]:
# Load TensorBoard extension
%load_ext tensorboard

# Launch TensorBoard
%tensorboard --logdir logs/fit

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 20064), started 0:30:08 ago. (Use '!kill 20064' to kill it.)

In [397]:
y_pred = model.predict(X_test_scaled).flatten()

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("MAE :", mae)
print("MSE :", mse)
print("RMSE:", rmse)

[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 688us/step
MAE : 0.16406926617378195
MSE : 0.03827615387826225
RMSE: 0.19564292442677872


In [398]:
X_test_scaled[0]

array([ 0.73693949,  0.46454993, -0.01256198, -0.02855184,  0.30403827,
       -0.07268793,  1.53601404, -0.16628665, -0.26441354,  0.1481508 ,
        0.        ,  0.        ,  0.46203771,  0.85954657])

In [399]:
y_test.iloc[0]

np.float64(0.47666666)

In [400]:
model.predict(
    X_test_scaled[0].reshape(1,-1)
)[0][0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


np.float32(0.5925788)

In [401]:
def actual_to_predict(model, index):
    x = X_test_scaled[index]
    y = y_test.iloc[index]
    y_pred = model.predict(X_test_scaled[index].reshape(1,-1))[0][0]
    print(f"For x: {x}")
    print(f"Actual: {y}")
    print(f"Predicted: {y_pred}")

In [408]:
actual_to_predict(model, 1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
For x: [-0.76367702  0.46454993 -0.01256198  0.3434144   0.63304537  0.03823853
 -1.31346846 -0.16628665 -0.26441354 -1.37864556  0.          0.
  0.46203771 -1.1634041 ]
Actual: 0.62666667
Predicted: 0.6104281544685364


In [403]:
y_test

461     0.476667
3407    0.626667
921     0.750000
9317    0.600000
4554    0.783333
          ...   
2653    0.380000
5291    0.383333
3967    0.850000
6056    0.550000
8498    0.826667
Name: matched_score, Length: 3122, dtype: float64

In [404]:
y_pred

array([0.5925788 , 0.61042815, 0.6312284 , ..., 0.58277833, 0.72662395,
       0.6009309 ], shape=(3122,), dtype=float32)

In [405]:
y_test - y_pred

461    -0.115912
3407    0.016239
921     0.118772
9317    0.047820
4554    0.129404
          ...   
2653   -0.149991
5291   -0.136015
3967    0.267222
6056   -0.176624
8498    0.225736
Name: matched_score, Length: 3122, dtype: float64

In [406]:
error_df = pd.DataFrame({
    "y_test": y_test,
    "y_pred": y_pred,
    "abs_error": np.abs(y_test - y_pred),
    "percent_error": np.abs(y_test - y_pred) * 100
})
error_df

Unnamed: 0,y_test,y_pred,abs_error,percent_error
461,0.476667,0.592579,0.115912,11.591217
3407,0.626667,0.610428,0.016239,1.623852
921,0.750000,0.631228,0.118772,11.877161
9317,0.600000,0.552180,0.047820,4.782037
4554,0.783333,0.653930,0.129404,12.940377
...,...,...,...,...
2653,0.380000,0.529991,0.149991,14.999103
5291,0.383333,0.519348,0.136015,13.601493
3967,0.850000,0.582778,0.267222,26.722167
6056,0.550000,0.726624,0.176624,17.662395


In [None]:
model.save('../pickled_data/model.h5')

