In [10]:
import os
import sys
import platform
import torch
import pandas as pd
import sklearn as sk
from sentence_transformers import SentenceTransformer, util
import language_tool_python
import pandas as pd
import numpy as np
from collections import Counter
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from textstat import textstat
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


# Fetch the NLTK data packages this notebook relies on (tokenizer models,
# stopword list, POS tagger, VADER lexicon), in the same order as before.
for nltk_resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'vader_lexicon'):
    nltk.download(nltk_resource)


# Detect usable accelerators.
# `torch.backends.mps.is_built()` only says whether this PyTorch binary was compiled
# with MPS support; `is_available()` also checks that the current machine can use it,
# which is what device selection needs.
has_gpu = torch.cuda.is_available()
has_mps = torch.backends.mps.is_available()
# Preference order: Apple Metal, then CUDA, then CPU.
device = "mps" if has_mps else "cuda" if has_gpu else "cpu"

# Report the environment so results can be tied to library versions and hardware.
print(f"Python Platform: {platform.platform()}")
print(f"PyTorch Version: {torch.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
print("NVIDIA/CUDA GPU is", "available" if has_gpu else "NOT AVAILABLE")
print("MPS (Apple Metal) is", "AVAILABLE" if has_mps else "NOT AVAILABLE")
print(f"Target device is {device}")

Python Platform: Windows-10-10.0.22631-SP0
PyTorch Version: 2.3.0+cu118

Python 3.9.19 (main, May  6 2024, 20:12:36) [MSC v.1916 64 bit (AMD64)]
Pandas 2.2.2
Scikit-Learn 1.5.0
NVIDIA/CUDA GPU is available
MPS (Apple Metal) is NOT AVAILABLE
Target device is cuda


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jerry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jerry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jerry\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jerry\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


#### Functions

In [11]:
# Holds the loaded sentence-transformer so repeated calls do not reload the weights.
_MPNET_MODEL_CACHE = {}


def _get_mpnet_model():
    """Return the shared all-mpnet-base-v2 encoder, loading it on first use."""
    if 'model' not in _MPNET_MODEL_CACHE:
        _MPNET_MODEL_CACHE['model'] = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    return _MPNET_MODEL_CACHE['model']


def sematic_similarity(query, essays):
    """Score how semantically close each essay is to the query prompt.

    Parameters
    ----------
    query : str
        The essay prompt. If a list of prompts is given, only the results for
        the first prompt are returned (same as before).
    essays : iterable of str
        The essay texts. Any iterable is accepted (list, pandas Series, ...);
        it is materialised into a list so positional order is what counts.

    Returns
    -------
    list of float
        One similarity score per essay, in the same order as ``essays``.
        An empty input yields an empty list.
    """
    # A plain list avoids label-based lookups when a pandas Series with a
    # non-default index is passed in.
    essays = list(essays)
    if not essays:
        return []

    mpnet_model = _get_mpnet_model()
    essay_embeddings = mpnet_model.encode(essays, convert_to_tensor=True, device=device)
    query_embeddings = mpnet_model.encode(query, convert_to_tensor=True, device=device)

    # semantic_search keeps only the 10 best hits by default; request every essay
    # so the result always has exactly len(essays) scores.
    similarity_result = util.semantic_search(
        query_embeddings, essay_embeddings, top_k=len(essays)
    )[0]

    # Hits come back ranked by score; restore the original essay order.
    similarity_result.sort(key=lambda hit: hit['corpus_id'])
    return [hit['score'] for hit in similarity_result]

In [12]:
# One LanguageTool instance per language, created lazily and reused for every essay.
_LANGUAGE_TOOLS = {}


def _get_language_tool(language='en-US'):
    """Return a cached LanguageTool checker for ``language``, creating it on first use."""
    if language not in _LANGUAGE_TOOLS:
        _LANGUAGE_TOOLS[language] = language_tool_python.LanguageTool(language)
    return _LANGUAGE_TOOLS[language]


# Function to count grammar and spelling errors
def count_errors(text):
    """Count spelling and grammar issues reported by LanguageTool for ``text``.

    Matches whose rule id starts with ``MORFOLOGIK_RULE`` are counted as spelling
    errors; every other match is counted as a grammar error.

    Parameters
    ----------
    text : str
        The essay to check.

    Returns
    -------
    tuple of (int, int)
        ``(spelling_error_count, grammar_error_count)``.
    """
    # Reuse the cached checker instead of constructing a new one for every essay.
    tool = _get_language_tool('en-US')
    matches = tool.check(text)
    spelling_count = sum(1 for match in matches if match.ruleId.startswith('MORFOLOGIK_RULE'))
    grammar_count = len(matches) - spelling_count
    return spelling_count, grammar_count

In [13]:
def extract_features(text):
    """Compute hand-crafted linguistic features for one essay.

    Parameters
    ----------
    text : str
        The raw essay text.

    Returns
    -------
    dict
        Feature name -> numeric value. Word-level features (``avg_word_length``,
        ``ttr``, POS counts, ``word_count``, n-gram counts) are computed on
        lowercased alphanumeric tokens with English stopwords removed.
        Sentence-level, sentiment, readability and character features use the
        original text. Averages fall back to 0.0 when there is nothing to
        average, instead of NaN.
    """
    # Tokenization (each sentence is tokenised once and reused below)
    sentences = sent_tokenize(text)
    sentence_tokens = [word_tokenize(sentence) for sentence in sentences]
    raw_tokens = word_tokenize(text.lower())

    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    words = [token for token in raw_tokens if token.isalnum() and token not in stop_words]

    # Linguistic features; guard against empty input so np.mean never sees an empty list
    avg_word_length = np.mean([len(word) for word in words]) if words else 0.0
    avg_sentence_length = (
        np.mean([len(tokens) for tokens in sentence_tokens]) if sentence_tokens else 0.0
    )

    # Lexical diversity (TTR)
    ttr = len(set(words)) / len(words) if words else 0

    # Parts of speech
    # NOTE(review): tagging runs on lowercased, stopword-filtered tokens, which
    # presumably affects tag quality (e.g. proper nouns) - confirm if that matters.
    pos_tags = nltk.pos_tag(words)
    pos_counts = Counter(tag for _, tag in pos_tags)

    def tag_total(*tags):
        # Sum the counts of several related POS tags.
        return sum(pos_counts.get(tag, 0) for tag in tags)

    # Sentiment analysis
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)

    # Structural features
    paragraph_count = text.count('\n\n') + 1
    sentence_count = len(sentences)
    word_count = len(words)

    # Readability scores
    flesch_reading_ease = textstat.flesch_reading_ease(text)
    smog_index = textstat.smog_index(text)

    # Calculate n-grams
    bigrams = list(ngrams(words, 2))
    trigrams = list(ngrams(words, 3))

    # Transition words (expanded list)
    transition_words = {
        'however', 'therefore', 'thus', 'hence', 'consequently', 'moreover', 'furthermore',
        'additionally', 'alternatively', 'similarly', 'likewise', 'nevertheless', 'nonetheless',
        'for example', 'for instance', 'in addition', 'on the other hand', 'by contrast',
        'in conclusion', 'to summarize', 'in summary', 'accordingly', 'as a result', 'in fact',
        'in other words', 'namely', 'that is', 'such as', 'including', 'particularly', 'especially',
        'specifically', 'notably', 'above all', 'in particular'
    }
    # Multi-word entries can never equal a single token, and most of their words are
    # stopwords, so match n-grams of the unfiltered lowercased tokens instead.
    # Punctuation tokens are kept, so a phrase is not matched across a comma or period.
    phrase_lengths = {len(phrase.split()) for phrase in transition_words}
    transition_word_count = sum(
        1
        for size in phrase_lengths
        for gram in ngrams(raw_tokens, size)
        if ' '.join(gram) in transition_words
    )

    # Additional features
    # A sentence counts as "complex" when commas + semicolons + and/or/but tokens exceed 1.
    conjunctions = ('and', 'or', 'but')
    complex_sentence_count = sum(
        1
        for sentence, tokens in zip(sentences, sentence_tokens)
        if sentence.count(',') + sentence.count(';')
        + sum(1 for token in tokens if token in conjunctions) > 1
    )
    character_count = len(text)

    return {
        'avg_word_length': avg_word_length,
        'avg_sentence_length': avg_sentence_length,
        'ttr': ttr,
        'noun_count': tag_total('NN', 'NNS', 'NNP', 'NNPS'),
        'verb_count': tag_total('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
        'adjective_count': tag_total('JJ', 'JJR', 'JJS'),
        'adverb_count': tag_total('RB', 'RBR', 'RBS'),
        'paragraph_count': paragraph_count,
        'sentence_count': sentence_count,
        'word_count': word_count,
        'flesch_reading_ease': flesch_reading_ease,
        'smog_index': smog_index,
        'unique_bigram_count': len(set(bigrams)),
        'unique_trigram_count': len(set(trigrams)),
        'transition_word_count': transition_word_count,
        'positive_sentiment': sentiment_scores['pos'],
        'neutral_sentiment': sentiment_scores['neu'],
        'negative_sentiment': sentiment_scores['neg'],
        'complex_sentence_count': complex_sentence_count,
        'character_count': character_count
    }


In [18]:
# Extract features for each essay and count grammar and spelling errors
def extract_all_features(df):
    """Build the feature matrix for every essay in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``essays`` column (essay text) and a ``query`` column
        (the prompt each essay answers).

    Returns
    -------
    pandas.DataFrame
        One row of features per essay, including ``spelling_errors``,
        ``grammar_errors`` and ``Semantic_Score``. Rows are in the same order
        as ``df`` and carry ``df``'s index labels, so they line up with other
        columns of ``df`` such as the scores. An empty ``df`` yields an empty
        frame.
    """
    # Group on positional labels: groupby visits the prompts in sorted order, so the
    # positions are needed to put the rows back in the caller's order afterwards
    # (this also works when df.index is non-default or has duplicates).
    positional = df.reset_index(drop=True)

    per_query_frames = []
    for query, group in positional.groupby('query'):
        essays = group.essays.tolist()

        records = []
        for essay in essays:
            features = extract_features(essay)
            features['spelling_errors'], features['grammar_errors'] = count_errors(essay)
            records.append(features)

        group_frame = pd.DataFrame(records, index=group.index)
        # Semantic similarity is computed per prompt, against that prompt's essays only.
        group_frame['Semantic_Score'] = sematic_similarity(query, essays)
        per_query_frames.append(group_frame)

    if not per_query_frames:
        return pd.DataFrame()

    # Concatenate once (not inside the loop) and restore the original row order.
    all_features = pd.concat(per_query_frames).sort_index()
    all_features.index = df.index[all_features.index.to_numpy()]
    return all_features

In [19]:
def train(df, n_neighbors=1, return_scaler=False):
    """Fit a k-nearest-neighbours classifier that predicts essay scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data with ``essays``, ``query`` and ``scores`` columns.
    n_neighbors : int, default 1
        Number of neighbours used by the classifier.
    return_scaler : bool, default False
        When True, also return the fitted ``StandardScaler``. New essays must be
        transformed with this same scaler before calling ``predict``.

    Returns
    -------
    KNeighborsClassifier
        The fitted classifier (when ``return_scaler`` is False).
    tuple of (KNeighborsClassifier, StandardScaler)
        The fitted classifier and scaler (when ``return_scaler`` is True).
    """
    X_train = extract_all_features(df)
    y = df.scores

    # Standardize the features (KNN works better with standardized data)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)

    # Create and train the KNN classifier
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y)

    if return_scaler:
        return knn, scaler
    return knn

Apply

In [20]:
# All five training essays answer the same prompt, so it is written once and repeated.
# The run of spaces before "things" is part of the original prompt string.
query = [
    'Write an essay commenting on the saying "If you cannot do great things, do small '
    '            things in a great way." You can cite examples to illustrate your point of view.'
] * 5

# Score of each essay, in essay order.
scores = [14, 11, 8, 5, 2]

# Five sample essays answering the prompt, listed in the same order as `scores`
# (the two lists are paired positionally when the training frame is built).
# Line breaks and indentation inside the literals are part of the essay text.
essays = ["""We all remember a time when we spoke proudly about becoming the next Bill Gates, or Stephen Hawking, and it's never wrong to dream big. As time passes by, we gradually realize that most of us are bound to stay ordinary. Yet there's still a way of achieving excellency. As the saying goes, "If you cannot do great things, do small things in a great way."
                A perfect life doesn't have to be so glorious or sparkling as grand plays performed in theatres. The true value lies in our attitude towards tiny daily issues, of which the repairman working in our community is an ideal example. Bicycle-repairing is certainly not the brightest or best-paid profession to many, but this man puts his heart and soul into it and views it as his way of serving for people in the community, which is why he earns high reputation.
                As a generation about to step into the fiercely competitive society, we are haunted by the feeling of worry and helplessness. If we strive for accomplishing remarkable success and becoming world-renowned, we'll probably feel frustrated and disappointed. Only by realizing "Excellency derives from small matters" can we survive and thrive.""", \
        """"If you cannot do great things, do small things in a great way" is the greatest saying I've ever heard. I admire many people. Some of them are heroes who do great things, but some of them are ordinary people. They do small things which seems simple and boring.
                To illustrate it, I show an example first. My Chinese teacher, Miss Chen, is an ordinary senior high school teacher. She teaches everyday. However, in my eyes, she's a great teacher. She really loves her students and her courses are wonderful. She will spend her spare time to prepare a good class. She's always willing to help students. As a result, she's a very popular teacher in our school.
                Miss Chen doesn't do great things, but she tries to make her job great. When you do things wholeheartedly, you are great person. Obviously, not everyone has the chance to be a hero, but when we do our small things in great way, we succeed.
                This is an amazing quality. So we shouldn't complain that we can't do great things. Let's do small things in a great way to be our own hero.""", \
        """A successful great man said: "a soldier who don't want to be a general is not a good soldier". We can find that most of men want to become great man, and do great things! But an army has only one general, most of soldiers can only be a normal soldier. So should the soldiers all go back home, just because they can't become a general? Of course not!
                Everyone has his value in his position. An army can't win battle if has only a general but no soldiers. If you are a soldier, just do your best, stay in your position and everyone will respect you include the general. As my father, although he's working in the government as a normal public servant not an officer, but he did his best in work. He solved lots of questions for his department, and served for hundreds of people a week to help them, he feels that he's successful because his leader and all the people he served said to him: "you are a great man"
                So we just do our best in our work, do more for the others! We are all the great man!""",
        """As show in this saying, Recentally many people, especially young person, think it is certainly do good to do great things, rather than small things. Moreover, they often think do small things has no value and boring
                The saying above tries to criticize the phenomenon that most people want do great things rather than small things. It's really a wrong views. After carefully consideration, I have state that the saying is more biased. It given following reasons. Firstly, There not only great things but also small things in every project. Besides, the small things can do very excellent too, if you can make your minds to do it. Further, If you can't do a small things well, neither great things.
                From what discussed above, we conclude that question quite depends on choice. In my own eyes, no matter what ability we have, how excellent we are. Because if you want walk thousands miles, you have to do every step in you foot. Only if do small things well, do great things in the same time.""", \
        """Nowadays, everybody want do great things, such as student we would rather to elect the hot of sauce rather than study the useful teknologe. But, as show in the eassy that it saying "If you cannot do great things, do small things in a great way".
                There are some reasons we should do some small things. To begin with, nothing is distinguish by great things and small things. To sum up the small thing. It is also a great thing. And the great thing is own to people's opinion. Then, if want to be success, the small things will help to you. Further more, the most of all famous people is grown in small things. Finally, the things must said to yourself. If you unable to do, you should give up quickly.
                In short, it is time to do some things no matter how great things or small things. When we growing up, we are get benefited from those small things. Every is important. Every things is benefit to your future. Only by this way we can successful and more happy."""]

# Assemble the training table: one row per essay, paired with its prompt and score.
train_data = pd.DataFrame(dict(essays=essays, query=query, scores=scores))
train_data

Unnamed: 0,essays,query,scores
0,We all remember a time when we spoke proudly a...,"Write an essay commenting on the saying ""If yo...",14
1,"""If you cannot do great things, do small thing...","Write an essay commenting on the saying ""If yo...",11
2,"A successful great man said: ""a soldier who do...","Write an essay commenting on the saying ""If yo...",8
3,"As show in this saying, Recentally many people...","Write an essay commenting on the saying ""If yo...",5
4,"Nowadays, everybody want do great things, such...","Write an essay commenting on the saying ""If yo...",2


features_df

In [21]:

# Keep a reference to the fitted classifier so later cells can use it;
# the bare `knn` expression still displays the model as the cell output.
# TODO: predict the labels for a test set once one exists:
# y_pred = knn.predict(X_test)
knn = train(train_data)
knn