In [1]:
import pandas as pd

# Load the teacher dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('teachers_recommendation_dataset.csv')

# Preview only the first rows instead of rendering the whole 300-row frame.
data.head(10)

Unnamed: 0,Teacher_ID,Primary_Subject,Secondary_Subject,Education_Level,Years_of_Experience,Teaching_Style,Certifications,Availability,Language,Student_Rating,Courses_Taught,Is_Research_Active
0,T001,Biology,Biology,Masters,22,Hybrid,TESOL,Full-time,French,4.42,48,False
1,T002,Mathematics,History,Masters,15,Hybrid,PGCE,Full-time,French,4.39,17,True
2,T003,Geography,English,PhD,1,Lecture-based,PGCE,Visiting,German,4.65,36,False
3,T004,Computer Science,Computer Science,Bachelors,3,Interactive,PGCE,Part-time,English,3.37,37,True
4,T005,Philosophy,History,PhD,2,Project-based,M.Ed,Part-time,English,4.44,49,True
5,T006,Geography,Biology,Masters,25,Interactive,PGCE,Full-time,Mandarin,3.06,39,False
6,T007,Mathematics,Biology,Masters,19,Interactive,TESOL,Visiting,French,4.99,43,True
7,T008,Chemistry,History,Bachelors,7,Hybrid,B.Ed,Full-time,Mandarin,4.98,47,False
8,T009,Physics,Geography,PhD,18,Flipped Classroom,M.Ed,Visiting,Spanish,3.44,33,True
9,T010,Physics,Physics,Masters,21,Hybrid,B.Ed,Full-time,French,3.33,21,True


In [3]:
# (rows, columns) of the loaded frame — a quick sanity check on the read.
data.shape

(300, 12)

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Years_of_Experience,Student_Rating,Courses_Taught
count,300.0,300.0,300.0
mean,16.296667,4.018867,28.196667
std,8.636974,0.606958,13.240179
min,1.0,3.0,5.0
25%,9.75,3.49,16.0
50%,16.0,3.99,29.0
75%,24.0,4.56,40.0
max,30.0,4.99,50.0


In [4]:
# Count missing values per column to decide which columns need imputation.
data.isna().sum()

Teacher_ID              0
Primary_Subject         0
Secondary_Subject       0
Education_Level         0
Years_of_Experience     0
Teaching_Style          0
Certifications         58
Availability            0
Language                0
Student_Rating          0
Courses_Taught          0
Is_Research_Active      0
dtype: int64

In [5]:
# 'Certifications' is the only column with missing values (see the null counts
# above). Fill just that column so the string 'Unknown' can never leak into a
# numeric column that later feeds the scaler, and rebind instead of mutating in
# place so re-running this cell is harmless.
data = data.fillna({'Certifications': 'Unknown'})

In [6]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns that are expanded into one indicator column per category.
categorical_cols = ['Primary_Subject', 'Secondary_Subject', 'Education_Level', 
                    'Teaching_Style', 'Certifications', 'Availability', 'Language']

# dtype=int pins the indicator columns to 0/1 integers. Recent pandas versions
# default to bool, and a frame mixing bool with float columns converts to an
# object-dtype array, which np.save would have to pickle.
data_encoded = pd.get_dummies(data, columns=categorical_cols, dtype=int)

# Boolean flag -> 0/1 so it can take part in the numeric feature vector.
data_encoded['Is_Research_Active'] = data_encoded['Is_Research_Active'].astype(int)

In [7]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
num_cols = ['Years_of_Experience', 'Student_Rating', 'Courses_Taught']

# Fit on the raw values in `data`, not on `data_encoded`, which this cell
# overwrites. Otherwise re-running the cell would refit the scaler on
# already-scaled [0, 1] data, and scaler.transform() on a raw query (e.g. 10
# years of experience) would no longer map onto the same scale as the teachers.
# `data` and `data_encoded` share the same rows in the same order.
data_encoded[num_cols] = scaler.fit_transform(data[num_cols])

In [14]:
import pandas as pd

# Raw numeric requirements of the course, as a one-row frame whose column names
# and order match the columns the scaler was fitted on (passing named columns is
# what avoids the feature-name warning from the scaler).
course_numeric_input = pd.DataFrame({
    'Years_of_Experience': [10],
    'Student_Rating': [4.5],
    'Courses_Taught': [20],
})

# Bring the raw numbers onto the same 0-1 scale as the teacher features.
scaled_values = scaler.transform(course_numeric_input)
years_scaled, rating_scaled, courses_scaled = scaled_values[0]

# Desired course profile, keyed by encoded feature name. Features not listed
# here are treated as 0 when the query vector is built.
course_profile = {
    'Primary_Subject_Physics': 1,
    'Secondary_Subject_Mathematics': 1,
    'Education_Level_PhD': 1,
    'Teaching_Style_Lecture-based': 1,
    'Certifications_PGCE': 1,
    'Availability_Full-time': 1,
    'Language_English': 1,
    'Years_of_Experience': years_scaled,
    'Student_Rating': rating_scaled,
    'Courses_Taught': courses_scaled,
    'Is_Research_Active': 1
}


In [15]:
import numpy as np


# Every encoded column except the identifier is a feature dimension.
all_features = data_encoded.columns.drop('Teacher_ID')

# Lay the profile out in feature order; anything the profile omits becomes 0.
profile_values = [course_profile.get(feature, 0) for feature in all_features]
query_vector = np.array(profile_values)

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# Feature matrix: one row per teacher, identifier column removed.
teacher_vectors = data_encoded.drop(columns=['Teacher_ID']).to_numpy()
similarities = cosine_similarity([query_vector], teacher_vectors)

# Row positions of the top_n highest similarity scores, best match first.
top_n = 5
ranked_positions = np.argsort(similarities[0])[::-1]
top_indices = ranked_positions[:top_n]

# Look the matches up in the un-encoded frame and show a compact summary.
summary_columns = ['Teacher_ID', 'Primary_Subject', 'Years_of_Experience', 'Student_Rating']
recommended_teachers = data.iloc[top_indices][summary_columns]
print(recommended_teachers)


    Teacher_ID Primary_Subject  Years_of_Experience  Student_Rating
104       T105         Biology                    8            4.68
71        T072         Physics                    3            4.50
264       T265         Physics                    6            4.69
73        T074         Physics                   29            4.95
168       T169       Chemistry                   16            4.70


In [18]:
def recommend_teachers(course_profile_dict, data_encoded, scaler, data, top_n=5):
    """Return the rows of ``data`` for the teachers most similar to a course profile.

    The profile is laid out as a vector over every column of ``data_encoded``
    except ``'Teacher_ID'`` (features missing from the profile count as 0) and
    compared with each teacher row by cosine similarity.

    Parameters
    ----------
    course_profile_dict : dict
        Maps encoded feature names to values. Values are used as given, so
        numeric features must already be scaled by the caller.
    data_encoded : pandas.DataFrame
        Encoded teacher features plus a ``'Teacher_ID'`` column.
    scaler : object
        Not used by this function; kept so existing calls keep working.
    data : pandas.DataFrame
        Frame the result rows are taken from. Rows are selected by position,
        so it must be in the same row order as ``data_encoded``.
    top_n : int, default 5
        Maximum number of teachers to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``data``, most similar first. Empty when
        ``top_n`` is not positive or there are no teachers.
    """
    import warnings

    feature_frame = data_encoded.drop(columns=['Teacher_ID'])
    all_features = feature_frame.columns

    # A misspelled or unseen feature name would otherwise be dropped silently
    # and quietly change the ranking.
    ignored = [key for key in course_profile_dict if key not in all_features]
    if ignored:
        warnings.warn(
            f"Profile keys not present in the encoded features are ignored: {ignored}"
        )

    # Guard the slice below: `[-0:]` selects the whole array, so top_n=0 used
    # to return every teacher instead of none.
    top_n = min(int(top_n), len(feature_frame))
    if top_n <= 0:
        return data.iloc[:0]

    query_vector = np.array(
        [course_profile_dict.get(col, 0) for col in all_features], dtype=float
    )
    # Explicit float dtype: a frame mixing bool and float columns would
    # otherwise convert to an object array.
    teacher_vectors = feature_frame.to_numpy(dtype=float)
    similarities = cosine_similarity([query_vector], teacher_vectors)
    top_indices = similarities[0].argsort()[-top_n:][::-1]
    return data.iloc[top_indices]

In [19]:
# Same query as the step-by-step cells above, now through the reusable function;
# returns the full (un-encoded) rows of the five best matches.
recommend_teachers(course_profile, data_encoded, scaler, data, top_n=5)

Unnamed: 0,Teacher_ID,Primary_Subject,Secondary_Subject,Education_Level,Years_of_Experience,Teaching_Style,Certifications,Availability,Language,Student_Rating,Courses_Taught,Is_Research_Active
104,T105,Biology,Mathematics,PhD,8,Lecture-based,M.Ed,Full-time,English,4.68,29,True
71,T072,Physics,Mathematics,Masters,3,Lecture-based,B.Ed,Full-time,English,4.5,9,True
264,T265,Physics,Mathematics,Bachelors,6,Lecture-based,PGCE,Part-time,French,4.69,40,True
73,T074,Physics,Economics,PhD,29,Lecture-based,Unknown,Full-time,French,4.95,37,True
168,T169,Chemistry,Mathematics,PhD,16,Lecture-based,B.Ed,Part-time,English,4.7,48,True


In [20]:
import joblib

# Persist the fitted scaler so new queries can be scaled the same way.
# (joblib files are pickles: only load them from a trusted source.)
joblib.dump(scaler, 'scaler.pkl')

# NOTE(review): this list still contains 'Teacher_ID', while the matrix saved
# below does not. A consumer must drop 'Teacher_ID' before aligning feature
# names with the matrix columns.
joblib.dump(data_encoded.columns.tolist(), 'features.pkl')

# Force a plain float matrix: an object-dtype array (which a frame mixing bool
# and float columns converts to) would be pickled by np.save and then refuse to
# load with np.load's default allow_pickle=False.
np.save('teacher_vectors.npy', data_encoded.drop(columns=['Teacher_ID']).to_numpy(dtype=float))

# Despite the file name, `data` here already has missing values replaced by 'Unknown'.
data.to_csv('original_teachers_df.csv', index=False)

['scaler.pkl']

['features.pkl']