In [1]:
# Import libraries
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from nltk.corpus import stopwords
import pandas as pd
import re

In [2]:
# Data Preprocessing Function
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, loading it only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def clean_text(text):
    """Normalise free text before TF-IDF vectorization.

    Steps, in order: remove every character that is neither a word character
    nor whitespace, lower-case the result, then drop NLTK English stop words.

    Args:
        text: The string to clean.

    Returns:
        The remaining words joined by single spaces (an empty string if no
        words remain).
    """
    # Look the stop words up once per call (and load them once per session)
    # instead of calling stopwords.words('english') for every single word.
    stop_words = _english_stopwords()
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return ' '.join(word for word in text.split() if word not in stop_words)

In [6]:
# Read Data
# Loads the raw table from data.csv; the path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv('data.csv')

In [4]:
# NOTE(review): the output saved under this cell lists UnitCode/UnitName/...
# columns, which do not match the df.columns output of the following cell
# (code, name, ...). The execution counts are also out of order (In [6] loads
# the data, In [4] previews it), so this preview looks stale -- restart the
# kernel and run all cells to refresh it.
df.head()

Unnamed: 0,UnitCode,UnitName,LearningOutcomes,Description,UnitContent
0,ICT501,Business Analysis and Systems Development Appr...,Demonstrate an understanding of business analy...,This unit aims to further develop the knowledg...,This unit covers the following topics: * Syste...
1,ICT502,Applied Information Security Management,Communicate the importance of security of both...,"Information security managers design, build an...",1. Introduction to the Management of Informati...
2,ICT505,Knowledge Management,Critically discuss the role and importance of ...,This unit examines the role of knowledge manag...,The unit will investigate knowledge management...
3,ICT508,Information Technology Project Management,Demonstrate an understanding of the concepts o...,This unit will cover the principles and practi...,The aim of the unit is to provide students wit...
4,ICT513,Data Analytics,Perform a variety of statistical analyses with...,This unit examines topics relevant to data sci...,Topics covered in this unit include: · Descrip...


In [7]:
df.columns

Index(['code', 'name', 'credits', 'masterStatus', 'offerings',
       'administrativeContacts', 'unitCoordinators', 'teachingTeam',
       'fieldOfEducation', 'repeatable', 'unitCategory', 'assessments',
       'owningCollege', 'owningDiscpline', 'quotaEnrolmentRequirements',
       'assessmentOverview', 'unitLevel', 'unitLearningOutcomes',
       'timetableThisUnit', 'specialTopic', 'prerequisites',
       'publishTuitionFees', 'exclusions', 'workIntegratedLearning',
       'otherLearningActivity', 'collaboratingDisciplines', 'quotaRationale',
       'gradingSchema', 'specialUnitType', 'placementProportion',
       'enrolmentRules', 'unitDescriptions', 'timetabledLearningActivities',
       'requisites', 'sourceSystemId', 'id', 'versionNumber', 'startDate',
       'endDate', 'status', 'courseLoopVersionNumber'],
      dtype='object')

In [8]:
# Select the columns: the identifying fields (code, name, fieldOfEducation)
# plus the two text fields (unitLearningOutcomes, unitDescriptions) that are
# later combined and vectorized.
new_df = df[['code', 'name', 'fieldOfEducation', 'unitLearningOutcomes', 'unitDescriptions']]

In [9]:
new_df.head()

Unnamed: 0,code,name,fieldOfEducation,unitLearningOutcomes,unitDescriptions
0,LLB583,Advanced Supervised Legal Research,090000 Society and Culture|090900 Law|090900 L...,UNLO1|1|Define a legal research question and i...,null|null|null|This unit offers students the o...
1,LLB353,Trusts,090000 Society and Culture|090900 Law|090999 L...,UNLO4|4|<p>Demonstrate intellectual skills nee...,Lectures will provide a framework for studying...
2,BIO280,Special Topics in Biological Sciences and Biot...,010000 Natural and Physical Sciences|010900 Bi...,,null|null|null|This unit comprises study in pa...
3,EDN204,Catholic Education: Understanding Jesus and hi...,090000 Society and Culture|091700 Philosophy a...,UNLO3|3|<p>Apply a gospel teaching of Jesus as...,<p>8 x half day workshops </p>|null|null|<p>Th...
4,POL337,Terrorism and Political Violence in South Asia,090000 Society and Culture|090100 Political Sc...,UNLO2|2|Explain and analyse the nature of poli...,The approach to learning in this unit is a com...


In [10]:
#Check for missing values
new_df.isna().any().any()

True

In [11]:
#Missing Values per column
new_df.isna().sum()

code                      0
name                      0
fieldOfEducation          0
unitLearningOutcomes    686
unitDescriptions         51
dtype: int64

In [12]:
# Remove rows with a missing value in any selected column.
# NOTE(review): new_df_clean is reassigned from new_df in a later cell, so the
# rows dropped here do not stay dropped in the frame that gets vectorized.
new_df_clean = new_df.dropna()

In [13]:
new_df_clean.isna().sum()

code                    0
name                    0
fieldOfEducation        0
unitLearningOutcomes    0
unitDescriptions        0
dtype: int64

In [15]:
# Work on an independent copy of the selected columns so that adding columns
# below never writes into a slice of df. Every row of new_df is kept, in the
# same order, so row positions here line up with row positions in new_df.
new_df_clean = new_df.copy()

# Combine and Clean Text
# Missing outcomes/descriptions are replaced with empty strings first: a bare
# astype(str) would turn NaN into the literal word "nan", which would then be
# indexed by the TF-IDF vectorizer as if it were real unit text.
new_df_clean['CombinedText'] = (
    new_df_clean['unitLearningOutcomes'].fillna('').astype(str)
    + ' '
    + new_df_clean['unitDescriptions'].fillna('').astype(str)
)
new_df_clean['CleanedText'] = new_df_clean['CombinedText'].apply(clean_text)

In [16]:
# Feature Engineering: TF-IDF Vectorization
# The vectorizer is fitted on the cleaned unit text; row i of tfidf_matrix
# corresponds to the i-th row (by position) of new_df_clean. The same fitted
# vectorizer is reused by recommend_units to transform the user's query.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(new_df_clean['CleanedText'])

In [17]:
# Model Building: k-NN with Cosine Similarity
# Brute-force nearest-neighbour search over the TF-IDF rows using the cosine
# metric, configured with n_neighbors=5. recommend_units converts each returned
# distance d into a similarity percentage via (1 - d) * 100.
model = NearestNeighbors(n_neighbors=5, algorithm='brute', metric='cosine')
model.fit(tfidf_matrix)

In [24]:
# Minimum similarity, in percent (0-100 scale), that a unit must reach to be
# recommended. The value in effect is 20 (i.e. 20%); recommend_units compares
# it against (1 - cosine distance) * 100.
MIN_SIMILARITY_THRESHOLD = 20

In [29]:
# Recommendation Function
def recommend_units(work_experience):
    """Recommend units whose text is most similar to a work-experience description.

    Args:
        work_experience: Free-text description of the user's work experience.

    Returns:
        A list of (unit_code, unit_name, similarity_percent) tuples, in the
        order returned by model.kneighbors, for every neighbour whose
        similarity is at least MIN_SIMILARITY_THRESHOLD. A message string is
        returned instead when the cleaned input has fewer than 10 words or
        when no neighbour reaches the threshold.
    """
    cleaned_experience = clean_text(work_experience)

    # Reject short input before doing any vectorization or neighbour search.
    if len(cleaned_experience.split()) < 10:
        return "Please provide a more detailed work experience for better recommendations."

    query_vector = vectorizer.transform([cleaned_experience])
    distances, indices = model.kneighbors(query_vector)

    recommendations = []
    for distance, index in zip(distances[0], indices[0]):
        # Cosine distance -> similarity expressed as a percentage.
        similarity = (1 - distance) * 100

        if similarity < MIN_SIMILARITY_THRESHOLD:
            continue

        # Neighbour indices are row positions in the matrix the model was
        # fitted on, which was built from new_df_clean -- so look rows up
        # there rather than in a different frame.
        unit = new_df_clean.iloc[index]
        recommendations.append((unit['code'], unit['name'], round(similarity, 2)))

    return recommendations if recommendations else "No units found that match your work experience."

# Ask for the work-experience description interactively. For a non-interactive
# run (e.g. Restart & Run All), swap input() for a hardcoded string.
user_experience = input("Please enter your work experience: ")
recommended_units = recommend_units(user_experience)

# recommend_units returns either a list of (code, name, similarity) tuples or
# a plain message string; handle the list case first, the message otherwise.
if not isinstance(recommended_units, str):
    print("Units recommended based on your work experience:")
    for unit_code, unit_name, similarity in recommended_units:
        print(f"Unit Code: {unit_code}, Unit Name: {unit_name}, Similarity: {similarity}%")
else:
    print(recommended_units)

Please enter your work experience: 
Please provide a more detailed work experience for better recommendations.
