# Loading Data and handling missing values

In [4]:
import pandas as pd

# Load the teacher dataset from a relative path into a DataFrame.
data = pd.read_csv('assets/synthetic_teachers_dataset.csv')
# Preview the first rows to confirm the columns were parsed as expected.
data.head()

Unnamed: 0,Teacher_ID,Full_Name,Gender,Country,Primary_Subject,Secondary_Subject,Education_Level,Years_of_Experience,Teaching_Style,Certifications,Availability,Language,Student_Rating,Courses_Taught,Is_Research_Active,Email
0,T00001,Brittany Rivera,Male,Germany,Economics,Biology,PhD,4,Project-based,PGCE,Part-time,Mandarin,4.94,82,True,brittany.rivera@example.edu
1,T00002,Ryan Smith,Non-Binary,India,Computer Science,History,Masters,9,Flipped Classroom,Online Certifications,Part-time,Spanish,4.53,5,False,ryan.smith@example.edu
2,T00003,Nicole Miranda,Male,France,Economics,English,Bachelors,23,Flipped Classroom,M.Ed,Part-time,French,3.22,38,False,nicole.miranda@example.edu
3,T00004,Eric Hayes,Non-Binary,USA,History,Chemistry,Bachelors,24,Lecture-based,,Full-time,German,3.42,70,True,eric.hayes@example.edu
4,T00005,Sarah Hernandez,Female,Canada,Computer Science,Physics,PhD,40,Interactive,TESOL,Part-time,French,2.84,75,False,sarah.hernandez@example.edu


In [5]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(10000, 16)

In [12]:
# Number of rows per country, most frequent first.
data.value_counts('Country')

Country
Canada       1492
Australia    1470
Germany      1461
UK           1427
India        1402
USA          1394
France       1354
Name: count, dtype: int64

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,Years_of_Experience,Student_Rating,Courses_Taught
count,10000.0,10000.0,10000.0
mean,20.282,3.746494,52.1288
std,11.630967,0.720949,27.347742
min,1.0,2.5,5.0
25%,10.0,3.13,29.0
50%,20.0,3.74,52.0
75%,30.0,4.37,76.0
max,40.0,5.0,99.0


In [9]:
# Count missing values per column.
# NOTE(review): the execution counters suggest this cell (In [9]) was run after
# the fillna cell (In [8]), so an all-zero result appears to describe the
# already-filled frame rather than the raw CSV - re-run on a fresh kernel to confirm.
data.isnull().sum()

Teacher_ID             0
Full_Name              0
Gender                 0
Country                0
Primary_Subject        0
Secondary_Subject      0
Education_Level        0
Years_of_Experience    0
Teaching_Style         0
Certifications         0
Availability           0
Language               0
Student_Rating         0
Courses_Taught         0
Is_Research_Active     0
Email                  0
dtype: int64

In [8]:
# Fill gaps only in the non-numeric columns. A blanket fillna('Unknown') would
# also write the string into any numeric column containing NaNs, silently
# turning it into an object column. Rebinding `data` (instead of inplace=True)
# keeps the cell safe to re-run and avoids mutating a frame shown earlier.
text_cols = data.select_dtypes(exclude=['number', 'bool']).columns
data = data.fillna({col: 'Unknown' for col in text_cols})

In [13]:
from sklearn.preprocessing import OneHotEncoder

# Columns holding category labels; each one is expanded into one indicator
# column per distinct label.
categorical_cols = [
    'Gender',
    'Country',
    'Primary_Subject',
    'Secondary_Subject',
    'Education_Level',
    'Teaching_Style',
    'Certifications',
    'Availability',
    'Language',
]

# One-hot encode the label columns, then store the research flag as integers.
data_encoded = pd.get_dummies(data, columns=categorical_cols).assign(
    Is_Research_Active=lambda frame: frame['Is_Research_Active'].astype(int)
)

In [14]:
from sklearn.preprocessing import MinMaxScaler

# Numeric columns to rescale with MinMaxScaler (default settings).
num_cols = ['Years_of_Experience', 'Student_Rating', 'Courses_Taught']

# Fit the scaler on the dataset, then overwrite the columns with their scaled
# values. The fitted scaler is kept so query inputs can be scaled the same way.
scaler = MinMaxScaler().fit(data_encoded[num_cols])
data_encoded[num_cols] = scaler.transform(data_encoded[num_cols])

In [15]:
import pandas as pd

# Example request: numeric expectations plus the preferred gender and country.
input_data = dict(
    Years_of_Experience=10,
    Student_Rating=4.5,
    Courses_Taught=20,
    Gender='Male',
    Country='Canada',
)

# One-row frame with just the numeric fields, in the order
# Years_of_Experience, Student_Rating, Courses_Taught.
course_numeric_input = pd.DataFrame([
    {
        field: input_data[field]
        for field in ('Years_of_Experience', 'Student_Rating', 'Courses_Taught')
    }
])

# Reuse the already-fitted scaler so the query is scaled like the dataset.
scaled_values = scaler.transform(course_numeric_input)

# Hand-built indicator columns for gender: 1 for the requested label, else 0.
gender_ohe = {
    f'Gender_{label}': int(input_data['Gender'] == label)
    for label in ('Male', 'Female', 'Non-Binary')
}

# Same for country, over the hard-coded list of countries.
countries = ['Canada', 'Australia', 'Germany', 'UK', 'India', 'USA', 'France']
country_ohe = {
    f'Country_{name}': int(input_data['Country'] == name)
    for name in countries
}

# Full profile: fixed categorical choices, scaled numeric values, the research
# flag, followed by the gender and country indicators.
course_profile = {
    'Primary_Subject_Physics': 1,
    'Secondary_Subject_Mathematics': 1,
    'Education_Level_PhD': 1,
    'Teaching_Style_Lecture-based': 1,
    'Certifications_PGCE': 1,
    'Availability_Full-time': 1,
    'Language_English': 1,
    'Years_of_Experience': scaled_values[0, 0],
    'Student_Rating': scaled_values[0, 1],
    'Courses_Taught': scaled_values[0, 2],
    'Is_Research_Active': 1,
    **gender_ohe,
    **country_ohe,
}

In [17]:
import numpy as np

# Feature columns are every encoded column except the three identifier columns.
all_features = data_encoded.columns.drop(['Teacher_ID', 'Full_Name', 'Email'])

# Lay the profile out in feature-column order; any feature the profile does
# not mention contributes 0.
query_vector = np.array(
    [course_profile[col] if col in course_profile else 0 for col in all_features]
)

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# Matrix of teacher features (identifier columns removed), one row per teacher.
teacher_vectors = data_encoded.drop(columns=['Teacher_ID', 'Full_Name', 'Email']).values

# Cosine similarity of the single query vector against every teacher row.
similarities = cosine_similarity([query_vector], teacher_vectors)

top_n = 10
# Sort ascending, flip to descending, then keep the first top_n positions.
top_indices = similarities[0].argsort()[::-1][:top_n]

# Look the winners up by position in the original (un-encoded) frame.
recommended_teachers = data.iloc[top_indices][[
    'Teacher_ID',
    'Full_Name',
    'Email',
    'Primary_Subject',
    'Years_of_Experience',
    'Student_Rating',
]]
print(recommended_teachers)

     Teacher_ID              Full_Name                              Email  \
8193     T08194         Ashley Johnson         ashley.johnson@example.edu   
1269     T01270           Jacob Taylor           jacob.taylor@example.edu   
1581     T01582       Brittany Johnson       brittany.johnson@example.edu   
3857     T03858       George Henderson       george.henderson@example.edu   
4576     T04577  Dr. Jeffrey Myers PhD  dr..jeffrey.myers.phd@example.edu   
7059     T07060           Sheri Spears           sheri.spears@example.edu   
9797     T09798      Sharon Mccullough      sharon.mccullough@example.edu   
7516     T07517         Andrew Spencer         andrew.spencer@example.edu   
1249     T01250          Taylor Carter          taylor.carter@example.edu   
7655     T07656          Michael Davis          michael.davis@example.edu   

     Primary_Subject  Years_of_Experience  Student_Rating  
8193         Physics                   17            3.97  
1269         Physics            

In [19]:
def recommend_teachers(course_profile_dict, data_encoded, scaler, data, top_n=5):
    """Return the ``top_n`` rows of ``data`` most similar to a course profile.

    Args:
        course_profile_dict: Mapping of encoded feature name to value. Features
            missing from the mapping are treated as 0; keys that are not
            feature columns are ignored.
        data_encoded: Encoded teacher frame. Must contain the identifier
            columns 'Teacher_ID', 'Full_Name' and 'Email'; every other column
            is used as a feature and must be convertible to float.
        scaler: Unused here; kept so existing callers do not break.
        data: Original teacher frame. Rows are selected by position, so it
            must be row-aligned with ``data_encoded``.
        top_n: Number of rows to return. Values <= 0 yield an empty frame.

    Returns:
        A DataFrame with the selected rows of ``data``, best match first.
    """
    # Guard first: with top_n == 0 the slice [-0:] selects *every* row, and a
    # negative top_n selects an arbitrary tail, so both would return far more
    # than was asked for.
    if top_n <= 0:
        return data.iloc[:0]

    # Drop the identifier columns once and reuse the result for both the
    # feature order and the teacher matrix so the two cannot drift apart.
    feature_frame = data_encoded.drop(columns=['Teacher_ID', 'Full_Name', 'Email'])
    query_vector = np.array(
        [course_profile_dict.get(col, 0) for col in feature_frame.columns],
        dtype=float,
    )
    # Explicit float cast: mixed bool/int/float columns would otherwise come
    # back as an object array.
    teacher_vectors = feature_frame.to_numpy(dtype=float)
    similarities = cosine_similarity([query_vector], teacher_vectors)
    # Ascending argsort: take the last top_n positions and reverse them.
    top_indices = similarities[0].argsort()[-top_n:][::-1]
    return data.iloc[top_indices]

In [20]:
# Ten best matches for the example profile, returned as rows of the original frame.
recommend_teachers(course_profile, data_encoded, scaler, data, top_n=10)

Unnamed: 0,Teacher_ID,Full_Name,Gender,Country,Primary_Subject,Secondary_Subject,Education_Level,Years_of_Experience,Teaching_Style,Certifications,Availability,Language,Student_Rating,Courses_Taught,Is_Research_Active,Email
8193,T08194,Ashley Johnson,Non-Binary,Canada,Physics,Mathematics,Bachelors,17,Lecture-based,PGCE,Full-time,English,3.97,85,False,ashley.johnson@example.edu
1269,T01270,Jacob Taylor,Male,Canada,Physics,Mathematics,Masters,15,Lecture-based,M.Ed,Full-time,English,4.26,99,False,jacob.taylor@example.edu
1581,T01582,Brittany Johnson,Male,Canada,English,Mathematics,Masters,4,Lecture-based,PGCE,Full-time,Mandarin,4.41,55,True,brittany.johnson@example.edu
3857,T03858,George Henderson,Male,Canada,History,Mathematics,PhD,16,Interactive,PGCE,Full-time,French,3.51,40,True,george.henderson@example.edu
4576,T04577,Dr. Jeffrey Myers PhD,Male,Germany,Physics,Physics,PhD,1,Lecture-based,M.Ed,Full-time,English,4.74,88,True,dr..jeffrey.myers.phd@example.edu
7059,T07060,Sheri Spears,Non-Binary,Canada,Physics,Mathematics,Bachelors,17,Lecture-based,PGCE,Full-time,French,3.1,51,True,sheri.spears@example.edu
9797,T09798,Sharon Mccullough,Male,Canada,Physics,Economics,PhD,13,Lecture-based,TESOL,Full-time,Mandarin,2.97,31,True,sharon.mccullough@example.edu
7516,T07517,Andrew Spencer,Male,Germany,Physics,Biology,PhD,32,Lecture-based,PGCE,Full-time,Spanish,3.22,14,True,andrew.spencer@example.edu
1249,T01250,Taylor Carter,Non-Binary,Canada,Physics,Mathematics,PhD,13,Lecture-based,PGCE,Visiting,Spanish,3.3,82,True,taylor.carter@example.edu
7655,T07656,Michael Davis,Female,Australia,Physics,Geography,PhD,25,Lecture-based,PGCE,Full-time,English,3.2,84,True,michael.davis@example.edu


In [21]:
import joblib

# Persist the fitted scaler so new inputs can be scaled the same way later.
joblib.dump(scaler, 'scaler.pkl')

# Column names of the encoded frame. This list still includes the identifier
# columns ('Teacher_ID', 'Full_Name', 'Email'); drop them to get the column
# order of the saved vectors.
joblib.dump(data_encoded.columns.tolist(), 'features.pkl')

# Save only the numeric feature matrix. Dropping just 'Teacher_ID' left the
# 'Full_Name' and 'Email' strings inside the matrix, which produced an object
# array (saved via pickle) that no longer lined up with the feature columns
# used for the similarity search.
np.save(
    'teacher_vectors.npy',
    data_encoded.drop(columns=['Teacher_ID', 'Full_Name', 'Email']).to_numpy(dtype=float),
)

# Keep the un-encoded rows so matches can be looked up by position.
data.to_csv('original_teachers_df.csv', index=False)

['scaler.pkl']

['features.pkl']