Testing whether a simple NLP toolkit can be used to check answers to extended questions, where users submit long pieces of text.

# Simple Way using Jaccard Similarity

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def calculate_similarity(answer1, answer2):
    """Return the Jaccard similarity of two free-text answers.

    Each answer is tokenized, lower-cased, filtered down to alphanumeric
    tokens that are not English stop words, and Porter-stemmed. The score
    is the number of stems the answers share divided by the number of
    distinct stems across both answers.

    Args:
        answer1: First answer text.
        answer2: Second answer text.

    Returns:
        A float between 0.0 and 1.0. When neither answer contains a usable
        token (for example both are empty or consist only of stop words and
        punctuation) the result is 0.0 rather than a ZeroDivisionError.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    def stem_set(text):
        # Unique stems of the alphanumeric, non-stop-word tokens in ``text``.
        return {
            stemmer.stem(word.lower())
            for word in word_tokenize(text)
            if word.isalnum() and word.lower() not in stop_words
        }

    stems1 = stem_set(answer1)
    stems2 = stem_set(answer2)

    union = stems1 | stems2
    if not union:
        # Nothing to compare: avoid dividing by zero on an empty union.
        return 0.0

    return len(stems1 & stems2) / len(union)

# Example usage: score a paraphrased answer against the reference answer.
correct_answer = "Django is a web framework for Python."
user_answer = "Python is used to create web applications through the Django framework."

similarity = calculate_similarity(answer1=correct_answer, answer2=user_answer)
print(f"Similarity: {similarity}")


Similarity: 0.375


# Using Cosine Similarity

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

def preprocess_text(text):
    """Normalize ``text`` into a space-separated string of word stems.

    The text is tokenized, tokens that are not purely alphanumeric or that
    are English stop words are dropped, the rest are lower-cased and
    Porter-stemmed, and the stems are joined with single spaces.

    Args:
        text: Raw answer text.

    Returns:
        The stems joined by spaces, in their original order; an empty
        string when no token survives the filtering.
    """
    # Build the stop-word set once instead of reloading the list (and doing
    # a linear search through it) for every token.
    stop_words = set(stopwords.words('english'))
    tokens = [
        word.lower()
        for word in word_tokenize(text)
        if word.isalnum() and word.lower() not in stop_words
    ]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]

    return " ".join(stemmed_tokens)

def calculate_cosine_similarity(answer1, answer2):
    """Return the cosine similarity between two free-text answers.

    Both answers are normalized with ``preprocess_text``, converted to
    term-count vectors over their combined vocabulary, and compared by the
    cosine of the angle between those vectors.

    Args:
        answer1: First answer text.
        answer2: Second answer text.

    Returns:
        The similarity score taken from the pairwise cosine-similarity
        matrix, or 0.0 when the vectorizer cannot build a vocabulary from
        the processed answers (e.g. both are empty after preprocessing).
    """
    # Preprocess the answers
    processed_answer1 = preprocess_text(answer1)
    processed_answer2 = preprocess_text(answer2)

    # Convert the answers into term-count vectors (one row per answer).
    try:
        count_matrix = CountVectorizer().fit_transform(
            [processed_answer1, processed_answer2]
        )
    except ValueError:
        # fit_transform raises ValueError on an empty vocabulary, i.e. when
        # neither processed answer contains a token it can count. There is
        # nothing to compare, so report no similarity instead of crashing.
        return 0.0

    # Entry [0, 1] of the pairwise matrix is answer1 versus answer2.
    similarity_matrix = cosine_similarity(count_matrix)
    return similarity_matrix[0, 1]

# Example usage: one on-topic answer and one unrelated answer, both scored
# against the same reference answer.
correct_answer = "Divide and Conquer, or sometimes called Divide, Conquer and Combine is a problem solving technique where a problem is firstly divided or broken or decomposed into simpler sub-problems. The sub-problems are then solved independently, often by recursively calling the function to further divide them. Once solved, the sub-problems are combined to solve the original problem."
user_answer1 = "a problem-solving strategy that involves breaking down a complex problem into simpler, more manageable subproblems, solving them independently, and then combining their solutions to solve the original problem."
user_answer2 = "Django is a framework for developing web applications with Python."

similarity1, similarity2 = (
    calculate_cosine_similarity(correct_answer, answer)
    for answer in (user_answer1, user_answer2)
)

print(f"Similarity (User Answer 1): {similarity1}")
print(f"Similarity (User Answer 2): {similarity2}")

Similarity (User Answer 1): 0.5448623679425841
Similarity (User Answer 2): 0.0


In [8]:
def sort(arr):
    """Sort ``arr`` in place in ascending order using insertion sort.

    Each element is taken in turn and moved left past every element that
    is strictly greater than it, so equal elements keep their relative
    order. Nothing is returned; the input sequence itself is modified.
    """
    for index in range(1, len(arr)):
        current = arr[index]
        slot = index
        # Shift larger neighbours one place right to open a slot for
        # ``current``.
        while slot > 0 and current < arr[slot - 1]:
            arr[slot] = arr[slot - 1]
            slot -= 1
        arr[slot] = current

# Demo: the list is sorted in place, so print the same object afterwards.
x = [1, 5, 7, 13, 4, 10]
sort(x)
print(x)

[1, 4, 5, 7, 10, 13]


# What is going on here?

## Preprocessing the text

### Tokenizing and removing stop words

This breaks the string into a list of words (tokens) and removes common stop words ("filler words") such as "a", "the" and "from", which carry little meaning on their own.

### Stemming

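Reduces each word to a common stem by stripping suffixes, so that variants such as "chocolate" and "chocolates" are treated as the same token. Note that a stem is not necessarily a dictionary word (the Porter stemmer yields something like "chocol"), and abbreviations such as "choco" are not expanded.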

## Calculating the Similarity

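I've used cosine similarity. This essentially turns each of the strings into a vector of word counts, and the similarity is the cosine of the angle between these two vectors: 1 means the vectors point in the same direction, and 0 means the answers share no words.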