In [None]:
# EMPLOYEE - TABLE

# Step 1: Load the data from the CSV file (assumed to be already extracted)
employee_data = pd.read_csv('./staging/Employee.csv')   # change thiss---------------------

# Step 2: Extract relevant columns
cleaned_employee_data = employee_data[['emp_id', 'designation']]

# Step 3: Remove duplicates
cleaned_employee_data = cleaned_employee_data.drop_duplicates(subset='emp_id')

# Step 4: One-Hot Encoding for the designation column
cleaned_employee_data = pd.get_dummies(cleaned_employee_data, columns=['designation'], prefix='designation', drop_first=True)

# Step 5: Convert boolean columns to integers (0, 1)
bool_columns = cleaned_employee_data.columns[1:]  # Assuming the first column is emp_id
cleaned_employee_data[bool_columns] = cleaned_employee_data[bool_columns].astype(int)

# Step 6: Provide information about the cleaned table
print(cleaned_employee_data.info())
print(cleaned_employee_data.head())  # Show the first few rows of the cleaned data

# Optionally, save the cleaned data to a new CSV file
cleaned_employee_data.to_csv('./prep/cleaned_employee_data.csv', index=False)

In [None]:
# COURSE - TABLE

# Step 1: Load the data from the CSV file
courses_data = pd.read_csv('./staging/Course.csv')

# Step 2: Extract relevant columns
cleaned_courses_data = courses_data[['course_id', 'duration', 'difficulty_level']]

# Step 3: Perform ordinal encoding for difficulty_level
difficulty_order = ['BASIC', 'BEGINNER', 'INTERMEDIATE', 'EXPERT']  # Define the order of difficulty levels
cleaned_courses_data['difficulty_level'] = pd.Categorical(cleaned_courses_data['difficulty_level'], 
                                                            categories=difficulty_order, 
                                                            ordered=True)
cleaned_courses_data['difficulty_level'] = cleaned_courses_data['difficulty_level'].cat.codes  # Convert to codes

# Step 4: Convert duration to weeks
def duration_to_weeks(duration):
    if 'months' in duration:
        return int(duration.split()[0]) * 4  # Assuming 1 month = 4 weeks
    elif 'years' in duration:
        return int(duration.split()[0]) * 52  # Assuming 1 year = 52 weeks
    elif 'weeks' in duration:
        return int(duration.split()[0])
    else:
        return 1  # Handle any unexpected format

cleaned_courses_data['duration_in_weeks'] = cleaned_courses_data['duration'].apply(duration_to_weeks)

# Step 1: Calculate the mean and standard deviation
mean_duration = cleaned_courses_data['duration_in_weeks'].mean()
std_duration = cleaned_courses_data['duration_in_weeks'].std()

# Step 2: Apply Z-score normalization
cleaned_courses_data['standardized_duration'] = (cleaned_courses_data['duration_in_weeks'] - mean_duration) / std_duration


# Step 5: Clean the DataFrame by dropping the original duration column
cleaned_courses_data = cleaned_courses_data.drop(columns=['duration'])

# Step 6: Provide information about the cleaned table
print(cleaned_courses_data.info())
print(cleaned_courses_data.head())  # Show the first few rows of the cleaned data

# Optionally, save the cleaned data to a new CSV file
cleaned_courses_data.to_csv('./prep/cleaned_courses_data.csv', index=False)

In [1]:
%pip install scikit-learn pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Step 1: Load Data
data = pd.read_csv('../DataEngineering/reporting/merged.csv')

# Step 2: Data Cleaning
data.dropna(subset=['emp_id', 'course_id', 'learning_path_name'], inplace=True)

# Step 3: Feature Engineering
# Aggregate courses completed per employee
emp_courses = (
    data.groupby('emp_id')
    .agg({
        'course_id': 'nunique',  # Number of unique courses completed
        'completion_rate': 'mean',  # Average completion rate
        'test_score_normalized': 'mean',  # Average test score
        'time_spent_in_sec': 'sum',  # Total time spent
    })
    .reset_index()
)

# Step 4: Encoding categorical variables
encoder = OneHotEncoder(sparse=False)
learning_path_encoded = encoder.fit_transform(data[['learning_path_name']])

# Create DataFrame for encoded features
learning_path_df = pd.DataFrame(learning_path_encoded, columns=encoder.get_feature_names_out())
emp_courses = pd.concat([emp_courses, learning_path_df], axis=1)

# Define features and target
X = emp_courses.drop(['emp_id'], axis=1)
y = data.groupby('emp_id')['learning_path_name'].first()  # Assuming we take the first for simplicity

# Normalize the numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 5: Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# KNN Implementation
knn = KNeighborsClassifier(n_neighbors=5)  # You can tune n_neighbors
knn.fit(X_train, y_train)

# Predict on test set
y_pred = knn.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

# Get predictions for all employees
emp_courses['predicted_learning_path'] = knn.predict(X_scaled)

# Final Output
best_learning_paths = emp_courses[['emp_id', 'predicted_learning_path']].drop_duplicates()
best_learning_paths.to_csv('./best_learning_paths.csv', index=False)

print(best_learning_paths)

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'

In [10]:
# %%
import random
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score
df = pd.read_csv('../DataEngineering/reporting/merged.csv')
 
 
# %%
df.columns
 
# %% [markdown]
# FEATURE ENGINEERING AND NORMALIZING
 
 
# %%
df['raw_score'] = df['completion_rate'] * df['test_score_normalized']
 
# Step 2: Normalize the score
max_score = df['raw_score'].max()
min_score = df['raw_score'].min()
 
# Normalization formula: (score - min) / (max - min) * new_max
df['normalized_score'] = (df['raw_score'] - min_score) / (max_score - min_score)
df.head()
 
# %%
required_columns= [
    'emp_id','emp_name','course_id','course_name','normalized_score'
]
 
df_final=df[required_columns]
df_final.head()
 
len(df_final)
 
# %%
df_final.drop_duplicates
len(df_final)
 
# %%
df_final_agg = df_final.groupby(['emp_id', 'course_name'], as_index=False)['normalized_score'].mean()
 
# Create a pivot table: emp_id as rows, course_name as columns, and normalized_score as values
df_pivot = df_final_agg.pivot(index='emp_id', columns='course_name', values='normalized_score').fillna(0)
 
# Convert the pivot table to a sparse matrix
df_matrix = csr_matrix(df_pivot.values)
 
# Fit the NearestNeighbors model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(df_matrix)
 
# Randomly choose an employee for recommendations
query_index = np.random.choice(df_pivot.shape[0])
query_emp_id = df_pivot.index[query_index]
 
# Get the name of the querying user
query_user_name = df_final[df_final['emp_id'] == query_emp_id]['emp_name'].values[0]
print(f"Query user ID: {query_emp_id} (Name: {query_user_name})")
 
# Get recommendations
distances, indices = model_knn.kneighbors(df_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors=6)
 
# Display nearest employees
print(f'\nNearest Employees for User ID {query_emp_id} (Name: {query_user_name}):\n')
recommended_ids = []
for i in range(len(distances.flatten())):
    if i == 0:
        print('Self Match (Distance = 0):')
    else:
        recommended_id = df_pivot.index[indices.flatten()[i]]
        recommended_ids.append(recommended_id)
        user_name = df_final[df_final['emp_id'] == recommended_id]['emp_name'].values[0]  # Get the name
        print(f'User ID {recommended_id} (Name: {user_name}), with distance of {distances.flatten()[i]:.4f}')
 
# Gather courses from nearest employees
all_courses = set()
user_courses = set(df_final[df_final['emp_id'] == query_emp_id]['course_name'])
 
for emp_id in recommended_ids:
    courses_taken = df_final[df_final['emp_id'] == emp_id]['course_name'].unique()
    all_courses.update(courses_taken)
 
# Determine courses to recommend
unique_courses = all_courses.difference(user_courses)
 
if unique_courses:
    print(f'\nRecommended Courses for User ID {query_emp_id} (Name: {query_user_name}) (not previously taken):\n')
    for course in unique_courses:
        print(course)
else:
    # If no unique courses, suggest any course from the nearest employees
    print(f'\nAll courses have been taken by User ID {query_emp_id} (Name: {query_user_name}). Suggesting courses from nearest employees:\n')
    suggested_courses = list(all_courses)
    for course in suggested_courses:
        print(course)
 
# Optionally calculate RMSE (if needed)
def calculate_rmse(recommended_ids, actual_scores):
    relevant_scores = df_final[df_final['emp_id'].isin(recommended_ids)]
 
    if relevant_scores.empty:
        return float('nan')  # Return NaN if no relevant scores are found
 
    y_true = relevant_scores['normalized_score']
    y_pred = relevant_scores['normalized_score'].mean()  # Using the mean as a simple prediction
 
    rmse = np.sqrt(mean_squared_error(y_true, [y_pred] * len(y_true)))
    return rmse
 
# Calculate RMSE (if desired)
rmse_value = calculate_rmse(recommended_ids, df_final)
print(f'\nRMSE: {rmse_value:.4f}')

Query user ID: JMD001 (Name: Harsha)

Nearest Employees for User ID JMD001 (Name: Harsha):

Self Match (Distance = 0):
User ID JMD196 (Name: Juana Upton), with distance of 0.3378
User ID JMD199 (Name: Colin Friesen PhD), with distance of 0.3956
User ID JMD130 (Name: Jim Crist Sr.), with distance of 0.6333
User ID JMD122 (Name: Mrs. Sharon Jones), with distance of 0.6362
User ID JMD197 (Name: Dr. Duane Rutherford), with distance of 0.6378

Recommended Courses for User ID JMD001 (Name: Harsha) (not previously taken):

IT trainer Fundamentals
Clinical research associate Fundamentals
Pathologist Fundamentals
Commissioning editor Fundamentals
Designer, ceramics/pottery Fundamentals
Chief Financial Officer Fundamentals
Industrial/product designer Fundamentals
Senior tax professional/tax inspector Fundamentals
Aeronautical engineer Fundamentals
Engineer, automotive Fundamentals
Waste management officer Fundamentals

RMSE: 0.1895


In [14]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('../DataEngineering/reporting/merged.csv')
# Select relevant columns
features = ['completion_rate', 'test_score_normalized', 'success_rate']
target = 'learning_path_name'

# Drop rows with missing target values
df.dropna(subset=[target], inplace=True)
# Group by emp_id and learning path, calculating mean scores
learning_path_interest = df.groupby(['emp_id', 'learning_path_name'])[features].mean().reset_index()
# Example: Calculating a simple interest score
learning_path_interest['interest_score'] = (
    learning_path_interest['completion_rate'] * 0.5 +
    learning_path_interest['test_score_normalized'] * 0.3 +
    learning_path_interest['success_rate'] * 0.2
)
# Get the learning path with the highest interest score for each employee
best_learning_paths = learning_path_interest.loc[
    learning_path_interest.groupby('emp_id')['interest_score'].idxmax()
]
print(best_learning_paths[['emp_id', 'learning_path_name', 'interest_score']])
best_learning_paths.to_csv('best_learning_paths.csv', index=False)
# One-hot encode the learning_path_name
# df_encoded = pd.get_dummies(df, columns=['learning_path_name'])
from sklearn.model_selection import train_test_split

X = df[features]  # Features
y = df['learning_path_name']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict on the test set
y_pred = knn.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


     emp_id        learning_path_name  interest_score
0    JMD001                  Frontend        0.670000
2    JMD002                Full Stack        0.765714
3    JMD003                Full Stack        0.278997
11   JMD100      Software Engineering        0.435167
17   JMD101  Internet of Things (IoT)        0.376000
..      ...                       ...             ...
597  JMD195              Data Science        0.603000
600  JMD196       Agile Methodologies        0.705000
613  JMD197     Software Architecture        0.531667
617  JMD198        Project Management        0.830000
622  JMD199                  Big Data        0.611000

[103 rows x 3 columns]
Accuracy: 0.013071895424836602
                          precision    recall  f1-score   support

 AI and Machine Learning       0.04      0.11      0.06         9
     Agile Methodologies       0.00      0.00      0.00        11
                Big Data       0.00      0.00      0.00         4
              Blockchain       0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
