# Ngram Identification

This notebook further preprocesses the syllabus text, applies TF-IDF vectorization over 1- to 3-word n-grams, and stores the top-scoring n-grams for each document in a list so that meaningful terms and phrases are captured per syllabus.


### Input:  InstructorRatingsCSV.csv, TextFiles_Combo folder
### Output: NgramIdentification.csv

In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re

In [None]:
def preprocess_text(text, custom_stopwords):
    """Normalize raw text into a space-separated string of filtered tokens.

    Steps, in order:
      1. Lower-case the text.
      2. Drop every character that is not a word character, whitespace,
         ``#`` or ``-``.
      3. Drop all digit runs.
      4. Split on whitespace and discard stopwords and one-character tokens.
      5. Rewrite the token ``dl`` as ``d2l`` (digit removal turns ``d2l``
         into ``dl``; this restores it).
      6. Collapse runs of identical adjacent tokens into a single token.

    Args:
        text: Raw document text.
        custom_stopwords: Container of lower-case tokens to discard
            (membership is tested with ``in``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    without_punct = re.sub(r'[^\w\s#-]', '', lowered)   # '#' and '-' survive
    without_digits = re.sub(r'\d+', '', without_punct)

    kept = []
    for word in without_digits.split():
        # Skip single-character leftovers and anything on the stop list.
        if len(word) <= 1 or word in custom_stopwords:
            continue
        # Restore the meaning of "d2l" after its digit was stripped.
        kept.append("d2l" if word == "dl" else word)

    # Remove back-to-back repeats by comparing with the last emitted token.
    collapsed = []
    for word in kept:
        if not collapsed or collapsed[-1] != word:
            collapsed.append(word)

    return " ".join(collapsed)

In [3]:
# Define custom stopwords (add repetitive terms here).
# Start from NLTK's English stop list and extend it with boilerplate terms.
custom_stopwords = set(stopwords.words('english'))
custom_stopwords.update([
    # Course / administrative boilerplate.
    # NOTE: every entry needs a trailing comma. Python silently concatenates
    # adjacent string literals, so a missing comma after "isbn" would produce
    # the single useless entry "isbnam".
    "academic", "misconduct", "syllabus", "university", "class", "csci", "course", "grade",
    "instructor", "plagiarism", "na", "page", "cs", "office", "isbn",
    # Time-of-day fragments (e.g. what remains of a time range once digits are stripped)
    "am", "pm", "am-am", "pm-pm", "-pm", "am-", "eastern",
    # Months and terms
    "sep", "sept", "oct", "nov", "dec", "fall", "spring", "summer",
    # Days of the week
    "mon", "tue", "wed", "thu", "monday", "tuesday", "wednesday", "thursday", "friday",
    # Titles
    "dr", "mr", "mrs", "ms",
])

# Add naive plural forms (word + "s") of every stopword
plural_stopwords = {word + "s" for word in custom_stopwords}
more_stopwords = custom_stopwords.union(plural_stopwords)

In [4]:
# Step 1: Load documents
csv_path = "InstructorRatingsCSV.csv"
folder_path = "TextFiles_Combo"

# Load CSV
df = pd.read_csv(csv_path)

names = df[["INSTRUCTOR"]]
# Take an explicit copy: a column is added to this frame below, and assigning
# into a bare column selection can trigger pandas' SettingWithCopyWarning.
df = df[["ID"]].copy()

# Lower-case instructor names so they can match the lower-cased tokens.
# Missing values are skipped (NaN is a float and has no .lower()).
# NOTE(review): each name is added as one whole string, so it only filters a
# token that equals the entire INSTRUCTOR value — confirm the column holds
# single-word names.
namesList = [str(name).lower() for name in names["INSTRUCTOR"].dropna()]

full_stopwords = more_stopwords.union(namesList)

# Load text documents: one "<ID>.txt" file per CSV row, in CSV order
documents = []
for file_name in df["ID"]:
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(os.path.join(folder_path, file_name + ".txt"), 'r', encoding="utf-8") as file:
        raw_text = file.read()
    # Processing happens after the file is closed; only the read needs the handle.
    processed_text = preprocess_text(raw_text, full_stopwords)
    documents.append(processed_text)

df["ProcessedDocument"] = documents

In [5]:
# Step 2: TF-IDF Vectorization
# Score unigrams, bigrams and trigrams of the preprocessed documents, with
# scikit-learn's built-in English stop list applied by the vectorizer.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
)
# One row per document, one column per n-gram
tfidf_matrix = vectorizer.fit_transform(df["ProcessedDocument"])
# Column index -> n-gram string
terms = vectorizer.get_feature_names_out()

In [6]:
# Step 3: Identify top n-grams for each document
top_n = 25  # Maximum number of n-grams kept per document; adjust as needed
top_ngrams_per_document = []

for i, document_id in enumerate(df["ID"]):  # Use document ID from the DataFrame
    # Dense vector of TF-IDF scores for the current document
    row = tfidf_matrix[i].toarray().flatten()
    # Indices of the top_n highest scores, best first
    top_indices = row.argsort()[-top_n:][::-1]
    # Keep only n-grams that actually occur in this document. argsort always
    # returns top_n indices, so a short or empty document would otherwise be
    # padded with zero-score n-grams taken from other documents' vocabulary.
    top_ngrams = [terms[index] for index in top_indices if row[index] > 0]
    top_ngrams_per_document.append(top_ngrams)
    print(f"Document ID {document_id}: {top_ngrams}")  # Print with document ID

Document ID CSCI-1100-003-Haas: ['digital', 'lab', 'literacy', 'audit', 'work', 'digital citizenship', 'citizenship', 'help', 'delivery', 'assignments', 'technology', 'online', 'delivery modality', 'bucs', 'learning labs', 'brown hall', 'personal learning', 'exit', 'reflect', 'students', 'brown', 'appeal', 'lecture', 'time', 'information']
Document ID CSCI-1100-901-Haas: ['digital', 'lab', 'literacy', 'audit', 'citizenship', 'digital citizenship', 'work', 'assignments', 'help', 'delivery', 'exit', 'technology', 'online', 'personal learning', 'brown hall', 'learning labs', 'delivery modality', 'bucs', 'exit ticket', 'ticket', 'reflect', 'students', 'brown', 'appeal', 'lesson']
Document ID CSCI-1120-901-Hendrix: ['satisfactory', 'discussion', 'students', 'tokens', 'level satisfactory', 'assign letter', 'discussion posts', 'posts', 'student', 'project', 'token', 'letter', 'quiz', 'achieving', 'vba', 'spreadsheet', 'satisfactory completion', 'macros', 'worksheet', 'module students', 'rubri

In [7]:
# Document IDs from the CSV as a plain Python list
document_ids = df["ID"].tolist()
# Alias for the per-document n-gram lists (same list object, not a copy)
transactions = top_ngrams_per_document

In [8]:
# Create a DataFrame for inspection: one row per document, pairing each
# document ID with its list of n-grams.
expanded_terms_df = pd.DataFrame(
    data={"ID": df["ID"], "Terms": transactions},
)

In [9]:
# Write the ID/Terms table to NgramIdentification.csv without the DataFrame index column.
expanded_terms_df.to_csv("NgramIdentification.csv", index=False)