In [1]:
import pandas as pd

# Load the course catalog exported from the spreadsheet
df = pd.read_csv("/content/FINAL COURSE CATALOG - Sheet1.csv")

# Display first few rows
print(df.head())

# Strip the textual suffix from the units column so only the number/range
# remains (e.g. "1-5"). The pattern is a real regex that removes both the
# singular and the plural form; the previous literal " unit" would have turned
# "4 units" into "4s".
# NOTE(review): assumes raw values look like "4 units" / "1 unit" - confirm
# against the CSV.
df["units"] = df["units"].str.replace(r" units?", "", regex=True)



  course_id                                title units  \
0  MST 020A               Early Medieval Culture     4   
1  MST 020B  The Culture of the High Middle Ages     4   
2   MST 098                 Directed Group Study   1-5   
3  MST 098F           Student Facilitated Course   1-4   
4   MST 099     Special Study for Undergraduates   1-5   

                                         description           prerequisites  \
0  Learning Activities: Lecture 3 hour(s), Extens...                     NaN   
1  Learning Activities: Lecture 3 hour(s), Extens...                     NaN   
2  Learning Activities: Variable.\nGrade Mode: Pa...                     NaN   
3  Learning Activities: Variable 1-4 hour(s).\nGr...  Consent of instructor.   
4  Learning Activities: Variable.\nGrade Mode: Pa...                     NaN   

     course_level major  
0  Lower Division   MST  
1  Lower Division   MST  
2  Lower Division   MST  
3  Lower Division   MST  
4  Lower Division   MST  


In [2]:
# Report how many entries are missing in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

# Blank out every NaN in place so the text columns can be handled as plain strings
df.fillna(value="", inplace=True)


course_id           0
title               0
units               0
description        26
prerequisites    2881
course_level        0
major               0
dtype: int64


In [3]:
# Lowercase the free-text columns so later matching is case-insensitive
for text_column in ("title", "description"):
    df[text_column] = df[text_column].str.lower()


In [4]:
from sklearn.preprocessing import LabelEncoder

# Use a dedicated encoder per column. A LabelEncoder only remembers the classes
# from its most recent fit, so sharing one instance would make the
# course_level codes impossible to map back with inverse_transform.
course_level_encoder = LabelEncoder()
df["course_level_encoded"] = course_level_encoder.fit_transform(df["course_level"])

major_encoder = LabelEncoder()
df["major_encoded"] = major_encoder.fit_transform(df["major"])

# Backward-compatible alias: `encoder` previously ended up fitted on "major".
encoder = major_encoder


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Set up a TF-IDF model that ignores common English stop words
vectorizer = TfidfVectorizer(stop_words="english")

# Learn the vocabulary and encode each course description as one TF-IDF row
course_descriptions = df["description"]
tfidf_matrix = vectorizer.fit_transform(course_descriptions)

# Shape is (num_courses, num_words)
print(tfidf_matrix.shape)


(10274, 4675)


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all course vectors; with a single
# argument the matrix is compared against itself
cosine_sim = cosine_similarity(tfidf_matrix)

# Preview the top-left 5x5 corner of the similarity matrix
print(cosine_sim[0:5, 0:5])


[[1.         0.7520084  0.0199826  0.02578652 0.0199826 ]
 [0.7520084  1.         0.02180319 0.0281359  0.02180319]
 [0.0199826  0.02180319 1.         0.98563529 1.        ]
 [0.02578652 0.0281359  0.98563529 1.         0.98563529]
 [0.0199826  0.02180319 1.         0.98563529 1.        ]]


In [7]:
def recommend_courses(course_title, df, cosine_sim, top_n=5):
    """Return the titles of the courses most similar to ``course_title``.

    Parameters
    ----------
    course_title : str
        Title to look up; matching against ``df["title"]`` is case-insensitive.
    df : pandas.DataFrame
        Frame with a ``"title"`` column whose row order matches the rows of
        ``cosine_sim``.
    cosine_sim : 2-D array-like
        Square similarity matrix; ``cosine_sim[i][j]`` is the similarity
        between the courses at row positions ``i`` and ``j`` of ``df``.
    top_n : int, optional
        Maximum number of recommendations to return (default 5).

    Returns
    -------
    pandas.Series or str
        The titles of the ``top_n`` most similar courses, best first, or the
        message ``"Course '<title>' not found."`` when no title matches.
    """
    # Convert input to lowercase
    course_title = course_title.lower()

    # Lowercase the titles once and reuse the mask for both the existence
    # check and the lookup.
    title_matches = (df["title"].str.lower() == course_title).to_numpy()
    if not title_matches.any():
        return f"Course '{course_title}' not found."

    # Row *position* of the first match. cosine_sim is positional, so using an
    # index label here would be wrong for any non-default DataFrame index.
    query_pos = int(title_matches.argmax())

    # Rank every course by similarity to the query (stable, highest first)
    ranked = sorted(
        enumerate(cosine_sim[query_pos]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Drop the query course by position rather than assuming it sorts first:
    # courses with identical descriptions tie at 1.0 and may precede it.
    limit = max(int(top_n), 0)
    recommended_positions = [pos for pos, _ in ranked if pos != query_pos][:limit]

    # Return course titles
    return df["title"].iloc[recommended_positions]

# Demo: list the courses most similar to one known catalog title
example_title = "acting: the basics: history & practice"
print(recommend_courses(example_title, df, cosine_sim))


6103            experimental digital cinema i
7842       from german fiction to german film
8509                           ethical eating
9274                iranian society & culture
7862    multiculturalism in german literature
Name: title, dtype: object


In [None]:
from google.colab import files

# Write and download through one shared path so the exported file is always the
# file offered for download. Previously the CSV was written relative to the
# working directory but downloaded from an absolute path, which only matched
# when the working directory happened to be /content.
export_path = "/content/course_data.csv"
df.to_csv(export_path, index=False)
files.download(export_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Sanity check: show the title column in lowercase form
lowercase_titles = df["title"].str.lower()
print(lowercase_titles)


0                     early medieval culture
1        the culture of the high middle ages
2                       directed group study
3                 student facilitated course
4           special study for undergraduates
                        ...                 
10269                   intermediate punjabi
10270                   intermediate punjabi
10271                   intermediate punjabi
10272                    tutoring in punjabi
10273                    tutoring in punjabi
Name: title, Length: 10274, dtype: object


In [8]:
!pip install pandas scikit-learn rapidfuzz sentence-transformers flask flask-cors


Collecting rapidfuzz
  Downloading rapidfuzz-3.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting flask-cors
  Downloading flask_cors-5.0.1-py3-none-any.whl.metadata (961 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (fr