# AcademicConnect

The goal of this project is to create a recommendation system that recommend students to students, students to professors, professors to professors, and professors to students. 

### Designing The Model

In [197]:
### Academic Conntect
import pandas as pd
import tensorflow as tf
import numpy as np
from transformers import TFAutoModel, AutoTokenizer
import pickle


In [198]:
# Read university dataset 
uni_data = pd.read_excel("university_data.xlsx")
uni_data.head()

Unnamed: 0,Student GUID,Name,Research Interests,University Field
0,c1c5fc27-048a-4fad-b32a-57b6613f5c6d,Daniel Cain,"Photonics, Cosmology, Theoretical Physics, Exp...",Physics
1,ead3d7a5-bddc-4ad1-ab55-4db006731802,Amy Potter,"Cognitive Psychology, Developmental Psychology...",Psychology
2,c6f1e6d1-21fe-4daa-a022-ff9e0f4fd957,Jessica Collins,"Materials Science, Physical Chemistry, Inorgan...",Chemistry
3,3e19f76f-46b4-46c4-a489-36053fd8d79e,Maria Singh,"Economic History, History of Science, Military...",History
4,31bbb063-8dae-4e81-97c4-456e8df9af33,James Thomas,"Geometry, Mathematical Physics, Statistics, Al...",Mathematics


In [199]:
# Create a raw version on university_data to be used later for testing single input
uni_data_raw = pd.read_excel("university_data.xlsx")


In [200]:
from sklearn.preprocessing import LabelEncoder

# Cleaning up Research Interests feature
uni_data['Research Interests'] = uni_data['Research Interests'].str.lower().str.replace(r'[^\w\s]+', '')

# Cleaning up "University Field" feature
# uni_data["University Field"] = uni_data["University Field"].str.lower().str.replace(r'[^\w\s]+', '')
# uni_data.head()
encoder = LabelEncoder()
uni_data['University Field'] = encoder.fit_transform(uni_data['University Field'])

In [201]:
# Save the encoder
with open('models/encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

In [202]:
from transformers import TFAutoModel, AutoTokenizer
import tensorflow as tf

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = TFAutoModel.from_pretrained('bert-base-uncased')

# Define a function to tokenize the "Research Interests" and run the tokens through the model
def embed_text(texts):
    inputs = tokenizer(texts.tolist(), padding=True, truncation=True, max_length=512, return_tensors="tf")
    outputs = model(inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()

# Embed 'Research Interests'
embeddings = embed_text(uni_data['Research Interests'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [203]:
# Save tokenizer
with open('models/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

# Save model
with open('models/model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save enbeddings
with open('models/embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings, f)



In [204]:
embeddings

array([[ 0.35122058,  0.2672667 , -0.17760639, ..., -0.17172086,
        -0.18274336,  0.6307969 ],
       [ 0.13852608,  0.29572952, -0.7052915 , ..., -0.30475768,
        -0.17106837,  0.39814553],
       [ 0.34270325,  0.21112193, -0.3461482 , ..., -0.23749661,
        -0.01269638,  0.6580549 ],
       ...,
       [-0.03102134, -0.03761458, -0.58762026, ..., -0.3957855 ,
        -0.20575732,  0.42250288],
       [ 0.49557954,  0.16372317, -0.3434162 , ..., -0.19094762,
         0.10214731,  0.82497257],
       [ 0.49353784,  0.18878661, -0.33193555, ..., -0.21652126,
        -0.00542787,  0.8533199 ]], dtype=float32)

In [205]:
from sklearn.preprocessing import StandardScaler

# Scale embeddings and the encoded field
e_scaler = StandardScaler()
f_scaler = StandardScaler()

embeddings_scaled = e_scaler.fit_transform(embeddings)
field_scaled = f_scaler.fit_transform(uni_data[['University Field']])

# Combine embeddings with the university field
combined_features = np.hstack((embeddings_scaled, field_scaled))

In [206]:
# Look at combined_features
combined_features

array([[ 0.75538945,  0.65054816,  1.11638272, ..., -0.42444006,
        -0.2310688 ,  1.20325223],
       [-0.3273685 ,  0.83481163, -1.62190056, ..., -0.35198435,
        -1.22975802,  1.55066157],
       [ 0.71203053,  0.28707665,  0.2417793 , ...,  0.63088262,
        -0.11406002, -1.22861311],
       ...,
       [-1.19047892, -1.32320082, -1.01127648, ..., -0.56726623,
        -1.12520063, -0.88120378],
       [ 1.49027371, -0.01977478,  0.25595617, ...,  1.34361005,
         0.602458  , -0.18638511],
       [ 1.47988009,  0.14248163,  0.31553206, ...,  0.67599148,
         0.72414291, -0.18638511]])

In [207]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate the cosine similarity matrix
similarity_matrix = cosine_similarity(combined_features)

# Create a DataFrame to view similarity scores
similarity_df = pd.DataFrame(similarity_matrix, index=uni_data['Student GUID'], columns=uni_data['Student GUID'])

In [208]:
# Create a function to get the 5 top matches, excluding the student being examined
def get_top_matches(student_guid, similarity_df, uni_data, top_n=5):
    scores = similarity_df.loc[student_guid]
    top_matches = scores.sort_values(ascending=False)[1:top_n+1]  # exclude matching with themselves
    df = pd.DataFrame(uni_data)
    results = df[df['Student GUID'].isin(top_matches.index)]
    results.loc[:, "Match"] = top_matches.values
    return results

In [210]:
# Get top matches for a student/professor
top_5 = get_top_matches(uni_data["Student GUID"][0], similarity_df, uni_data)
top_5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results.loc[:, "Match"] = top_matches.values


Unnamed: 0,Student GUID,Name,Research Interests,University Field,Match
137,ac2087b4-ef74-4b5e-abad-5302f9d7e368,Michael West,"photonics, nuclear physics, astrophysics, expe...",8,0.98804
1002,adb2fcac-7d96-47ec-8f89-29a6ec92b599,Kurt Jones,"photonics, condensed matter physics, cosmology...",8,0.987624
1837,7aeaf1f2-d2a4-4176-abdf-96212da216d4,Lisa Hogan,"photonics, particle physics, condensed matter ...",8,0.984967
5562,149b3317-e913-4008-bd23-7a2f445b6f0b,Devon Davis,"photonics, theoretical physics, astrophysics, ...",8,0.984064
6853,a70f1a6e-2bc8-4060-b3a4-f0c3045e769c,Samuel Norton,"photonics, quantum mechanics, cosmology, astro...",8,0.982235


In [211]:
uni_data["Student GUID"][0]

'c1c5fc27-048a-4fad-b32a-57b6613f5c6d'

### Testing comparing a "new" student to the dataset

In [212]:
# Make a clone of the first student
test_student = uni_data_raw.loc[0,:]
# Change the `Student GUID` to a random UID
test_student['Student GUID'] = "a2e9dc4f-0b8c-4b3d-a6ae-8fbb8836e6e8"
test_student

Student GUID                       a2e9dc4f-0b8c-4b3d-a6ae-8fbb8836e6e8
Name                                                        Daniel Cain
Research Interests    Photonics, Cosmology, Theoretical Physics, Exp...
University Field                                                Physics
Name: 0, dtype: object

In [213]:
# Clean up the data in test_student
test_student['Research Interests'] = test_student["Research Interests"].lower().replace(r'[^\w\s]+', '')
test_student['University Field'] = encoder.transform([test_student['University Field']])

In [214]:
# Check test_student, "Physics" should have been encoded as "8"
test_student

Student GUID                       a2e9dc4f-0b8c-4b3d-a6ae-8fbb8836e6e8
Name                                                        Daniel Cain
Research Interests    photonics, cosmology, theoretical physics, exp...
University Field                                                      8
Name: 0, dtype: object

In [215]:
# Tokenize and embed the test_student data
test_input = tokenizer(test_student["Research Interests"], padding=True, truncation=True, max_length=512, return_tensors="tf")
test_output = model(test_input)
test_embedding = test_output.last_hidden_state[:, 0, :].numpy()


In [216]:
# Scale the embedding and the field features
test_embedding_scaled = e_scaler.transform(test_embedding)
test_field_scaled = f_scaler.transform([test_student[['University Field']]])

# Combine embeddings with scaled field
test_student_combined_features = np.hstack((test_embedding_scaled, test_field_scaled))



In [219]:
# Calculate cosine similarity between test_student_combined_features and combined_features
cos_similarities = cosine_similarity(test_student_combined_features.reshape(1, -1), combined_features)

In [222]:
len(cos_similarities)

1

In [223]:
# Find the index of the row with the highest cosine similarity
best_match_index = np.argmax(cos_similarities)

# Get the best match from combined_features
best_match_features = combined_features[best_match_index]

# You can also calculate the similarity score with the best match
best_similarity_score = cos_similarities[0, best_match_index]



In [231]:
# Check that the best match lines up
uni_data.iloc[best_match_index], test_student

(Student GUID                       c1c5fc27-048a-4fad-b32a-57b6613f5c6d
 Name                                                        Daniel Cain
 Research Interests    photonics, cosmology, theoretical physics, exp...
 University Field                                                      8
 Name: 0, dtype: object,
 Student GUID                       a2e9dc4f-0b8c-4b3d-a6ae-8fbb8836e6e8
 Name                                                        Daniel Cain
 Research Interests    photonics, cosmology, theoretical physics, exp...
 University Field                                                      8
 Name: 0, dtype: object)