# ConceptNet Expansion

This file preprocesses syllabi text, applies tf/idf vectorization, identifies n-grams, submits n-grams to the ConceptNet API to extract additional terms, and stores the results in lists for every document.

### Input:      InstructorRatingsCSV.csv, TextFiles_Combo folder
### Output:     Expansion.csv

In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re
import requests
import time

In [2]:
def preprocess_text(text, custom_stopwords):
    # Convert to lowercase
    text = text.lower()
    # Remove all punctuation except # and -, and remove digits
    text = re.sub(r'[^\w\s#-]', '', text)  # Keep # and -
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Tokenize
    tokens = text.split()
    # Remove stopwords and tokens of length 1
    tokens = [word for word in tokens if word not in custom_stopwords and len(word) > 1]
    # Lemmatize tokens
    # lemmatizer = WordNetLemmatizer()
    # tokens = [lemmatizer.lemmatize(token) for token in tokens]

    deduplicated_tokens = []
    iterator = ""
    for i, token in enumerate(tokens):
        if token == "dl":
            token = "d2l"

        if token != iterator:
            deduplicated_tokens.append(token)
            iterator = token


    return " ".join(deduplicated_tokens)

In [3]:
# Define custom stopwords (add repetitive terms here)
custom_stopwords = set(stopwords.words('english'))
custom_stopwords.update(["academic", "misconduct", "syllabus", "university", "class", "csci","course", "grade",
                         "instructor", "plagiarism", "na", "page", "cs", "office", "isbn"
                         "am", "pm","am-am", "pm-pm", "-pm", "am-", "eastern", "sep", "sept", "oct", "nov", "dec", "fall", "spring", "summer",
                         "mon", "tue", "wed", "thu", "monday", "tuesday", "wednesday", "thursday", "friday", "dr", "mr", "mrs", "ms"])

# Add plural forms of the custom stopwords
plural_stopwords = {word + "s" for word in custom_stopwords}
more_stopwords = custom_stopwords.union(plural_stopwords)

In [None]:
# Step 1: Load documents
csv_path = "InstructorRatingsCSV.csv"
folder_path = "TextFiles_Combo"

# Load CSV
df = pd.read_csv(csv_path)

names = df[["INSTRUCTOR"]]
df = df[["ID"]]

namesList = []

for name in names["INSTRUCTOR"]:
    namesList.append(name.lower())

full_stopwords = more_stopwords.union(namesList)

# Load text documents
documents = []
for file_name in df["ID"]:
    with open(os.path.join(folder_path, file_name + ".txt"), 'r') as file:
        raw_text = file.read()
        processed_text = preprocess_text(raw_text, full_stopwords)
        documents.append(processed_text)

df["ProcessedDocument"] = documents

In [5]:
# Step 2: TF-IDF Vectorization
vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df["ProcessedDocument"])
terms = vectorizer.get_feature_names_out()

In [6]:
# Step 3: Identify top n-grams for each document
top_n = 25  # Adjust as needed
top_ngrams_per_document = []

for i, document_id in enumerate(df["ID"]):  # Use document ID from the DataFrame
    # Get TF-IDF scores for the current document
    row = tfidf_matrix[i].toarray().flatten()
    top_indices = row.argsort()[-top_n:][::-1]  # Indices of top n-grams
    top_ngrams = [terms[index] for index in top_indices]
    top_ngrams_per_document.append(top_ngrams)
    print(f"Document ID {document_id}: {top_ngrams}")  # Print with document ID

Document ID CSCI-1100-003-Haas: ['digital', 'lab', 'literacy', 'audit', 'work', 'digital citizenship', 'citizenship', 'help', 'delivery', 'assignments', 'technology', 'online', 'delivery modality', 'bucs', 'learning labs', 'brown hall', 'personal learning', 'exit', 'reflect', 'students', 'brown', 'appeal', 'lecture', 'time', 'information']
Document ID CSCI-1100-901-Haas: ['digital', 'lab', 'literacy', 'audit', 'citizenship', 'digital citizenship', 'work', 'assignments', 'help', 'delivery', 'exit', 'technology', 'online', 'personal learning', 'brown hall', 'learning labs', 'delivery modality', 'bucs', 'exit ticket', 'ticket', 'reflect', 'students', 'brown', 'appeal', 'lesson']
Document ID CSCI-1120-901-Hendrix: ['satisfactory', 'discussion', 'students', 'tokens', 'level satisfactory', 'assign letter', 'discussion posts', 'posts', 'student', 'project', 'token', 'letter', 'quiz', 'achieving', 'vba', 'spreadsheet', 'satisfactory completion', 'macros', 'worksheet', 'module students', 'rubri

In [7]:
transactions = top_ngrams_per_document
document_ids = df["ID"].tolist()  # List of document IDs from the CSV

In [8]:
# Function to query ConceptNet
def query_conceptnet(term, relation="RelatedTo", limit=5):
    url = f"https://api.conceptnet.io/query?node=/c/en/{term}&rel=/r/{relation}&limit={limit}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        related_terms = set()
        # Extract only terms in English
        for edge in data['edges']:
            end_term = edge['end']['term']
            
            # Check if the term is English and not the original term
            if end_term.startswith('/c/en/') and end_term != f"/c/en/{term}":
                clean_term = end_term.replace("/c/en/", "").replace(" ", "_")
                related_terms.add(clean_term)
        
        return list(related_terms)
    else:
        print(f"Failed to retrieve data for {term}. Status Code: {response.status_code}")
        return []

In [9]:
# Expand n-grams using ConceptNet
def expand_ngrams(top_ngrams_per_document, relations, limit=5):
    expanded_ngrams_per_document = []
    for i, ngrams in enumerate(top_ngrams_per_document):
        expanded_terms = set(ngrams)
        print(f"Processing Document     {ngrams}")

        # Query ConceptNet for each n-gram
        for term in ngrams:
            print(f"Term: {term}")
            for rel in relations:
                print(rel)
                related_terms = query_conceptnet(term, relation=rel, limit=limit)
                expanded_terms.update(related_terms)
                time.sleep(0.2)  # Prevent API rate-limiting
                print(related_terms)

        expanded_ngrams_per_document.append(list(expanded_terms))
        print(f"Document {i + 1} expanded terms: {expanded_terms}\n\n\n\n\n")
    return expanded_ngrams_per_document

In [10]:
# Example: Expand terms using top n-grams
relations_to_query = ["RelatedTo", "Synonym", "PartOf", "UsedFor", "CapableOf", "Causes", "HasProperty", "DefinedAs", "MannerOf", "HasContext", "SimilarTo", "ReceivesAction"]
expanded_terms = expand_ngrams(top_ngrams_per_document, relations=relations_to_query, limit=5)

# Create a DataFrame for inspection
expanded_terms_df = pd.DataFrame({
    "ID": df["ID"],
    "Expanded_Ngrams": expanded_terms
})

Processing Document     ['digital', 'lab', 'literacy', 'audit', 'work', 'digital citizenship', 'citizenship', 'help', 'delivery', 'assignments', 'technology', 'online', 'delivery modality', 'bucs', 'learning labs', 'brown hall', 'personal learning', 'exit', 'reflect', 'students', 'brown', 'appeal', 'lecture', 'time', 'information']
Term: digital
RelatedTo
['digit']
Synonym
[]
PartOf
[]
UsedFor
[]
CapableOf
[]
Causes
[]
HasProperty
[]
DefinedAs
[]
MannerOf
[]
HasContext
['music', 'finance', 'humorous', 'electronics']
SimilarTo
[]
ReceivesAction
[]
Term: lab
RelatedTo
[]
Synonym
[]
PartOf
[]
UsedFor
[]
CapableOf
[]
Causes
[]
HasProperty
[]
DefinedAs
[]
MannerOf
[]
HasContext
['astronomy', 'british', 'legal', 'politics', 'galaxy']
SimilarTo
[]
ReceivesAction
[]
Term: literacy
RelatedTo
['literate', 'literary', 'literature']
Synonym
[]
PartOf
[]
UsedFor
[]
CapableOf
[]
Causes
[]
HasProperty
[]
DefinedAs
[]
MannerOf
[]
HasContext
[]
SimilarTo
[]
ReceivesAction
[]
Term: audit
RelatedTo
[]
Sy

In [11]:
# Display the expanded terms
print(expanded_terms_df)

                           ID  \
0          CSCI-1100-003-Haas   
1          CSCI-1100-901-Haas   
2       CSCI-1120-901-Hendrix   
3    CSCI-1200-001-Desjardins   
4        CSCI-1210-001-Ramsey   
..                        ...   
60  CSCI-5607-001-Bajracharya   
61    CSCI-5757-001-Battleson   
62      CSCI-5927-201-Rezwana   
63  CSCI-5957-002-Bajracharya   
64      CSCI-5989-001-Bennett   

                                      Expanded_Ngrams  
0   [taught_by_educational_system, humorous, gaine...  
1   [taught_by_educational_system, humorous, citiz...  
2   [indicate_position, taught_by_educational_syst...  
3   [taught_by_educational_system, joyful_or_painf...  
4   [lab exercises, taught_by_educational_system, ...  
..                                                ...  
60  [taught_by_educational_system, made_up_of_days...  
61  [pharmaceutical, taught_by_educational_system,...  
62  [interior, taught_by_educational_system, indiv...  
63  [made_up_of_days, taught_by_educational

In [12]:
expanded_terms_df.to_csv("Expansion.csv", index=False)