In [15]:
%pip install scikit-learn pandas joblib





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


### KNN Model

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load the data
df = pd.read_csv('../DataEngineering/reporting/merged.csv')

# Group by 'emp_id' and 'learning_path_id' for aggregation
grouped_df = df.groupby(['emp_id', 'learning_path_id']).agg(
    avg_completion_rate=('completion_rate', 'mean'),
    avg_test_score_normalized=('test_score_normalized', 'mean'),
    avg_success_rate=('success_rate', 'mean'),
    total_time_spent=('time_spent_in_sec', 'sum')
).reset_index()

# Calculate the combined score for each learning path
grouped_df['combined_score'] = (
    (grouped_df['avg_completion_rate'] * 0.2) +
    (grouped_df['avg_success_rate'] * 0.3) +
    (grouped_df['avg_test_score_normalized'] * 0.5)
)

# Drop the intermediate columns now that we have combined_score
grouped_df.drop(columns=['avg_completion_rate', 'avg_success_rate', 'avg_test_score_normalized'], inplace=True)

# Pivot the data to get combined_score and other metrics in columns by learning_path_id
pivot_df = grouped_df.pivot(index='emp_id', columns='learning_path_id', values=['combined_score'])

# Flatten the MultiIndex columns for easier access
pivot_df.columns = [f'{col[0]}_{col[1]}' for col in pivot_df.columns]
pivot_df.reset_index(inplace=True)

# Aggregate other features (like total_time_spent)
emp_df = grouped_df.groupby(['emp_id']).agg({
    'total_time_spent': 'mean'
}).reset_index()

# Merge with pivot_df to get combined scores
df = emp_df.merge(pivot_df, on='emp_id', how='left')

# Fill NaN values with 0 (if there are any missing values after the merge)
df.fillna(0, inplace=True)

# Load the best learning paths data and join on emp_id
best_learning_paths = pd.read_csv('../DataEngineering/reporting/best_learning_paths.csv')
df = df.merge(best_learning_paths, on='emp_id')
df.drop(columns=['learning_path_id','combined_score'], inplace=True)

# Define the feature columns (exclude emp_id and target)
features = [col for col in df.columns if col not in ['emp_id', 'learning_path_name']]
# Use MinMaxScaler to scale features between 0 and 1
scaler = MinMaxScaler()
df[features] = scaler.fit_transform(df[features])

target = 'learning_path_name'

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df['learning_path_name'], test_size=0.2, random_state=42)

# Train the KNN model
knn = KNeighborsClassifier(n_neighbors=15, weights='distance')
knn.fit(X_train, y_train)

# Predict on the test set
y_pred = knn.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Add predictions to the dataframe
df['predicted_learning_path'] = knn.predict(df[features])

# Save the final dataframe with predictions
df.to_csv('knn_predicted_best_learning_paths.csv', index=False)

# Save the trained KNN model to a file
model_filename = 'knn_learning_path_model.pkl'
joblib.dump(knn, model_filename)

print(f"Model saved to {model_filename}")


Accuracy: 0.19047619047619047
                          precision    recall  f1-score   support

 AI and Machine Learning       0.00      0.00      0.00         0
         Cloud Computing       0.00      0.00      0.00         3
           Cybersecurity       0.00      0.00      0.00         1
            Data Science       0.00      0.00      0.00         5
                  DevOps       0.00      0.00      0.00         1
                Frontend       0.00      0.00      0.00         1
              Full Stack       0.00      0.00      0.00         0
Internet of Things (IoT)       0.00      0.00      0.00         1
      Mobile Development       0.00      0.00      0.00         1
      Project Management       0.00      0.00      0.00         2
       Quality Assurance       0.00      0.00      0.00         0
   Software Architecture       0.00      0.00      0.00         1
    Software Engineering       0.40      1.00      0.57         2
         Web Development       0.67      0.67

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Random Forest model

In [66]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load the data
df = pd.read_csv('../DataEngineering/reporting/merged.csv')

# Group by 'emp_id' and 'learning_path_id' for aggregation
grouped_df = df.groupby(['emp_id', 'learning_path_id']).agg(
    avg_completion_rate=('completion_rate', 'mean'),
    avg_test_score_normalized=('test_score_normalized', 'mean'),
    avg_success_rate=('success_rate', 'mean'),
    total_time_spent=('time_spent_in_sec', 'mean')
).reset_index()

# Min-Max normalization
min_time_spent = grouped_df['total_time_spent'].min()
max_time_spent = grouped_df['total_time_spent'].max()

# Calculate the combined score for each learning path
grouped_df['combined_score'] = (
    (grouped_df['avg_completion_rate'] * 0.2) +
    ((
        (grouped_df['total_time_spent'] - min_time_spent) /
        (max_time_spent - min_time_spent)
    )* 0.2) +
    (grouped_df['avg_success_rate'] * 0.15) +
    (grouped_df['avg_test_score_normalized'] * 0.45)
)

# Drop the intermediate columns now that we have combined_score
grouped_df.drop(columns=['avg_completion_rate','total_time_spent', 'avg_success_rate', 'avg_test_score_normalized'], inplace=True)

# Pivot the data to get combined_score and other metrics in columns by learning_path_id
df = grouped_df.pivot(index='emp_id', columns='learning_path_id', values=['combined_score'])

# Flatten the MultiIndex columns for easier access
df.columns = [f'{col[0]}_{col[1]}' for col in df.columns]
df.reset_index(inplace=True)

# Fill NaN values with 0 (if there are any missing values after the merge)
df.fillna(0, inplace=True)

# Load the best learning paths data and join on emp_id
best_learning_paths = pd.read_csv('../DataEngineering/reporting/best_learning_paths.csv')
df = df.merge(best_learning_paths, on='emp_id')
df.drop(columns=['completion_rate','time_spent_in_sec','success_rate','test_score_normalized','combined_score','learning_path_name'], inplace=True)

# Define the feature columns (exclude emp_id and target)
features = [col for col in df.columns if col not in ['emp_id', 'learning_path_id']]
# Use MinMaxScaler to scale features between 0 and 1
scaler = MinMaxScaler()
df[features] = scaler.fit_transform(df[features])

target = 'learning_path_id'

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.3, random_state=71)

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Predict on the test set
y_pred = rf.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Add predictions to the dataframe
df['predicted_learning_path'] = rf.predict(df[features])

# Save the final dataframe with predictions
df.to_csv('rf_predicted_best_learning_paths.csv', index=False)

# Save the trained Random Forest model to a file
model_filename = 'rf_learning_path_model.pkl'
joblib.dump(rf, model_filename)

print(f"Model saved to {model_filename}")


Accuracy: 0.4838709677419355
              precision    recall  f1-score   support

           3       0.50      0.50      0.50         2
           4       0.00      0.00      0.00         0
         100       0.43      1.00      0.60         3
         101       0.50      0.75      0.60         4
         102       0.40      0.50      0.44         4
         103       0.50      0.50      0.50         2
         104       0.75      1.00      0.86         3
         105       0.50      0.50      0.50         2
         107       0.00      0.00      0.00         1
         108       0.00      0.00      0.00         2
         110       1.00      0.50      0.67         2
         112       0.00      0.00      0.00         4
         113       0.00      0.00      0.00         1
         114       0.00      0.00      0.00         1

    accuracy                           0.48        31
   macro avg       0.33      0.38      0.33        31
weighted avg       0.39      0.48      0.42        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model saved to rf_learning_path_model.pkl


In [18]:
# # Load the trained KNN model from file
# knn_loaded = joblib.load('knn_learning_path_model.pkl')

# # Example: Predict on new data (make sure new data is in the same format as training)
# new_employee_data = [
#     [3600, 0.90, 6, 0.85, 2, 1, 0]  # Replace with actual new employee data
# ]

# predicted_learning_path = knn_loaded.predict(new_employee_data)
# print("Predicted Learning Path:", predicted_learning_path)