In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import nltk
import pandas as pd
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import MinMaxScaler
import scipy
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
train_data = pd.read_csv('/content/drive/MyDrive/346 project/training_set_rel3.tsv', sep='\t', encoding='ISO-8859-1')
columns = ['essay_id', 'essay_set', 'essay', 'domain1_score']
asap = pd.DataFrame(train_data, columns=columns)
sets=asap['essay_set'].unique()
scores=pd.DataFrame(asap,columns=['essay_set','domain1_score'])
scores_grp=scores.groupby(['essay_set'],as_index=False)
essay=pd.DataFrame(sets,columns=['sets'])
essay['counts']=scores_grp.count()['domain1_score']
essay['min']=scores_grp.min()['domain1_score']
essay['max']=scores_grp.max()['domain1_score']
essay['med']=scores_grp.median()['domain1_score']
scores = {}
for es in sets:
    min_es = asap[asap['essay_set'] == es].domain1_score.min()
    max_es =  asap[asap['essay_set'] == es].domain1_score.max()
    scores[es] = (min_es, max_es)

In [5]:
def minmax_scaler(es, score):
    return (score - scores[es][0]) / (scores[es][1] - scores[es][0])

def inverse_scaler(es, score):
    return round(score * (scores[es][1] - scores[es][0]) + scores[es][0])
def scale_dataset(asap):
    for row in range(len(asap)):
        asap.loc[row, 'nscore'] = minmax_scaler(asap.loc[row, 'essay_set'], asap.loc[row, 'domain1_score'])
    return asap
asap = scale_dataset(asap)
def vectorizer(X_train, X_val, X_test):
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X_train)
    if len(X_val) > 0:
        X_val = vectorizer.transform(X_val)
    X_test = vectorizer.transform(X_test)
    return X_train, X_val, X_test

In [8]:
total_qwk = 0.0
for score in scores:
    data  = asap[asap['essay_set'] == score]
    X_train, X_test, y_train_scaled, y_test_scaled, y_train, y_test = train_test_split(data['essay'], data['nscore'], data['domain1_score'], test_size=0.25, random_state=92)

    X_val=pd.DataFrame()
    X_train_vec, X_val, X_test_vec = vectorizer(X_train, X_val, X_test)

    #svr
    svr = SVR(kernel='linear')
    svr.fit(X_train_vec, y_train_scaled)
    y_pred = svr.predict(X_test_vec)

    y_pred = [inverse_scaler(score, pred) for pred in y_pred]

    qwk = cohen_kappa_score(y_test, y_pred, weights='quadratic')
    total_qwk += qwk

    print('QWK for prompt {} is {:.3f}'.format(score, qwk))

qwk_results = total_qwk / 8
print("SVR: The average QWK {:.3f}".format(qwk_results))

QWK for prompt 1 is 0.706
QWK for prompt 2 is 0.494
QWK for prompt 3 is 0.501
QWK for prompt 4 is 0.731
QWK for prompt 5 is 0.692
QWK for prompt 6 is 0.744
QWK for prompt 7 is 0.682
QWK for prompt 8 is 0.477
SVR: The average QWK 0.628


In [9]:
total_qwk = 0.0
for score in scores:
    data  = asap[asap['essay_set'] == score]
    X_train, X_test, y_train_scaled, y_test_scaled, y_train, y_test = train_test_split(data['essay'], data['nscore'], data['domain1_score'], test_size=0.2, random_state=42)

    X_val=pd.DataFrame()
    X_train_vec, X_val, X_test_vec = vectorizer(X_train, X_val, X_test)

    #brr
    brr = linear_model.BayesianRidge()
    brr.fit(X_train_vec.toarray(), y_train_scaled)
    y_pred = brr.predict(X_test_vec)

    y_pred = [inverse_scaler(score, pred) for pred in y_pred]

    qwk = cohen_kappa_score(y_test, y_pred, weights='quadratic')
    total_qwk += qwk

    print('QWK for prompt {} is {:.3f}'.format(score, qwk))

qwk_results = total_qwk / 8
print("BRR: The average QWK {:.3f}".format(qwk_results))

QWK for prompt 1 is 0.699
QWK for prompt 2 is 0.556
QWK for prompt 3 is 0.526
QWK for prompt 4 is 0.672
QWK for prompt 5 is 0.700
QWK for prompt 6 is 0.751
QWK for prompt 7 is 0.720
QWK for prompt 8 is 0.564
BRR: The average QWK 0.649


In [10]:
total_qwk = 0.0
for score in scores:
    data  = asap[asap['essay_set'] == score]
    X_train, X_test, y_train_scaled, y_test_scaled, y_train, y_test = train_test_split(data['essay'], data['nscore'], data['domain1_score'], test_size=0.2, random_state=42)

    X_val=pd.DataFrame()
    X_train_vec, X_val, X_test_vec = vectorizer(X_train, X_val, X_test)

    #xgb
    xgb = XGBRegressor(n_estimators=800, seed=42, learning_rate = 0.015, max_depth=5, eval_metric='rmse')
    xgb.fit(X_train_vec, y_train_scaled)
    y_pred = xgb.predict(X_test_vec)

    y_pred = [inverse_scaler(score, pred) for pred in y_pred]

    qwk = cohen_kappa_score(y_test, y_pred, weights='quadratic')
    total_qwk += qwk

    print('QWK for prompt {} is {:.3f}'.format(score, qwk))

qwk_results = total_qwk / 8
print("XGB: The average QWK {:.3f}".format(qwk_results))

QWK for prompt 1 is 0.771
QWK for prompt 2 is 0.613
QWK for prompt 3 is 0.610
QWK for prompt 4 is 0.746
QWK for prompt 5 is 0.712
QWK for prompt 6 is 0.769
QWK for prompt 7 is 0.730
QWK for prompt 8 is 0.560
XGB: The average QWK 0.689


In [6]:
import nltk
import string
import re

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import words

def num_chars_essay(text):
    return len(list(text))

def num_puncts_essay(text):
    return len([char for char in list(text) if char in string.punctuation])

def num_words_essay(text):
    return len(word_tokenize(text))

def num_unique_words_essay(text):
    return len(set(word_tokenize(text)))

def num_sents_essay(text):
    return len(sent_tokenize(text))

def num_numbers_essay(text):
    return len(re.sub("[^0-9]", "", text))

def num_correct_words(text):
    correct_words = words.words()
    return len(list(set(correct_words) & set(word_tokenize(text)))) / num_words_essay(text)

def lexical_features(data):

    asap = pd.DataFrame(data)
    asap['num_chars_essay'] = asap['essay'].apply(num_chars_essay)
    asap['num_puncts_essay'] = asap['essay'].apply(num_puncts_essay)
    asap['num_words_essay'] = asap['essay'].apply(num_words_essay)
    asap['num_unique_words_essay'] = asap['essay'].apply(num_unique_words_essay)
    asap['num_numbers_essay'] = asap['essay'].apply(num_numbers_essay)
    asap['num_sents_essay'] = asap['essay'].apply(num_sents_essay)
    return asap


In [7]:
asap = lexical_features(asap)
columns = ['num_chars_essay', 'num_puncts_essay', 'num_words_essay', 'num_unique_words_essay', 'num_numbers_essay', 'num_sents_essay']
scaler = MinMaxScaler()
asap[columns] = scaler.fit_transform(asap[columns])


In [8]:
def svr_fn(asap_train, asap_val, asap_test):
    total_qwk = 0.0
    print(' Prompt   Val    Test ')

    for prompt in scores:
        X_train  = asap_train[asap_train['essay_set'] == prompt]
        X_val = asap_val[asap_val['essay_set'] == prompt]
        X_test = asap_test[asap_test['essay_set'] == prompt]

        # TF-IDF
        X_train_vec, X_val_vec, X_test_vec = vectorizer(X_train['essay'], X_val['essay'], X_test['essay'])

        # Lexical features
        columns = ['num_chars_essay', 'num_puncts_essay', 'num_words_essay', 'num_unique_words_essay', 'num_numbers_essay', 'num_sents_essay']

        # Scale
        scaler = MinMaxScaler()
        X_train_features = pd.DataFrame(lexical_features(X_train), columns=columns)
        X_train_features[columns] = scaler.fit_transform(X_train_features[columns])

        X_val_features = pd.DataFrame(lexical_features(X_val), columns=columns)
        X_val_features[columns] = scaler.fit_transform(X_val_features[columns])

        X_test_features = pd.DataFrame(lexical_features(X_test), columns=columns)
        X_test_features[columns] = scaler.fit_transform(X_test_features[columns])

        # Merge both features (TF-IDF and lexical)
        X_train_all = scipy.sparse.hstack([X_train_vec, X_train_features])
        X_val_all = scipy.sparse.hstack([X_val_vec, X_val_features])
        X_test_all = scipy.sparse.hstack([X_test_vec, X_test_features])

        #svr
        svr = SVR(kernel='linear')
        svr.fit(X_train_all, X_train['nscore'].tolist())
        y_pred_val = svr.predict(X_val_all) # Validation
        y_pred_test = svr.predict(X_test_all) # Test

        # invert y_pred into the essay set scoring range
        y_pred_val = [inverse_scaler(prompt, pred) for pred in y_pred_val]
        y_pred_test = [inverse_scaler(prompt, pred) for pred in y_pred_test]

        val_qwk = cohen_kappa_score(X_val['domain1_score'].tolist(), y_pred_val, weights='quadratic')
        qwk = cohen_kappa_score(X_test['domain1_score'].tolist(), y_pred_test, weights='quadratic')
        total_qwk += qwk
        print('  {}       {:.3f}  {:.3f} '.format(prompt, val_qwk, qwk))
    qwk_results = total_qwk / 8
    print("SVR: The average QWK {:.3f}".format(qwk_results))
train, test = train_test_split(asap,test_size=0.25, random_state=42)
svr_fn(train, test, test)

 Prompt   Val    Test 
  1       0.812  0.812 
  2       0.672  0.672 
  3       0.533  0.533 
  4       0.755  0.755 
  5       0.742  0.742 
  6       0.778  0.778 
  7       0.731  0.731 
  8       0.704  0.704 
SVR: The average QWK 0.716


In [24]:
def brr_fn(asap_train, asap_val, asap_test):
    total_qwk = 0.0
    print(' Prompt   Val    Test ')

    for prompt in scores:
        X_train  = asap_train[asap_train['essay_set'] == prompt]
        X_val = asap_val[asap_val['essay_set'] == prompt]
        X_test = asap_test[asap_test['essay_set'] == prompt]

        X_train_vec, X_val_vec, X_test_vec = vectorizer(X_train['essay'], X_val['essay'], X_test['essay'])

        columns = ['num_chars_essay', 'num_puncts_essay', 'num_words_essay', 'num_unique_words_essay', 'num_numbers_essay', 'num_sents_essay']


        X_train_features = pd.DataFrame(lexical_features(X_train), columns=columns)
        X_val_features = pd.DataFrame(lexical_features(X_val), columns=columns)
        X_test_features = pd.DataFrame(lexical_features(X_test), columns=columns)

        X_train_all = scipy.sparse.hstack([X_train_vec, X_train_features])
        X_val_all = scipy.sparse.hstack([X_val_vec, X_val_features])
        X_test_all = scipy.sparse.hstack([X_test_vec, X_test_features])

        #brr
        brr = linear_model.BayesianRidge()
        brr.fit(X_train_all.toarray(), X_train['nscore'].tolist())
        y_pred_val = brr.predict(X_val_all) # Validation
        y_pred_test = brr.predict(X_test_all) # Test

        y_pred_val = [inverse_scaler(prompt, pred) for pred in y_pred_val]
        y_pred_test = [inverse_scaler(prompt, pred) for pred in y_pred_test]

        val_qwk = cohen_kappa_score(X_val['domain1_score'].tolist(), y_pred_val, weights='quadratic')
        qwk = cohen_kappa_score(X_test['domain1_score'].tolist(), y_pred_test, weights='quadratic')
        total_qwk += qwk
        print('  {}       {:.3f}  {:.3f} '.format(prompt, val_qwk, qwk))
    qwk_results = total_qwk / 8
    print("BRR: The average QWK {:.3f}".format(qwk_results))
train, test = train_test_split(asap,test_size=0.25, random_state=42)
brr_fn(train, test, test)

 Prompt   Val    Test 
  1       0.817  0.817 
  2       0.686  0.686 
  3       0.645  0.645 
  4       0.770  0.770 
  5       0.784  0.784 
  6       0.814  0.814 
  7       0.777  0.777 
  8       0.741  0.741 
BRR: The average QWK 0.754


In [25]:
def xgboost_fn(asap_train, asap_val, asap_test):

    total_qwk = 0.0
    print(' Prompt   Val    Test ')


    for prompt in scores:
        X_train  = asap_train[asap_train['essay_set'] == prompt]
        X_val = asap_val[asap_val['essay_set'] == prompt]
        X_test = asap_test[asap_test['essay_set'] == prompt]

        X_train_vec, X_val_vec, X_test_vec = vectorizer(X_train['essay'], X_val['essay'], X_test['essay'])

        columns = ['num_chars_essay', 'num_puncts_essay', 'num_words_essay', 'num_unique_words_essay', 'num_numbers_essay', 'num_sents_essay']

        X_train_features = pd.DataFrame(lexical_features(X_train), columns=columns)
        X_val_features = pd.DataFrame(lexical_features(X_val), columns=columns)
        X_test_features = pd.DataFrame(lexical_features(X_test), columns=columns)

        X_train_all = scipy.sparse.hstack([X_train_vec, X_train_features])
        X_val_all = scipy.sparse.hstack([X_val_vec, X_val_features])
        X_test_all = scipy.sparse.hstack([X_test_vec, X_test_features])

        #xgb
        xgb = XGBRegressor(n_estimators=800, seed=42, learning_rate = 0.015, max_depth=5, eval_metric='rmse')
        xgb.fit(X_train_all, X_train['nscore'].tolist()) # Training
        y_pred_val = xgb.predict(X_val_all) # Validation
        y_pred_test = xgb.predict(X_test_all) # Test

        y_pred_val = [inverse_scaler(prompt, pred) for pred in y_pred_val]
        y_pred_test = [inverse_scaler(prompt, pred) for pred in y_pred_test]

        val_qwk = cohen_kappa_score(X_val['domain1_score'].tolist(), y_pred_val, weights='quadratic')
        qwk = cohen_kappa_score(X_test['domain1_score'].tolist(), y_pred_test, weights='quadratic')
        total_qwk += qwk

        print('  {}       {:.3f}  {:.3f} '.format(prompt, val_qwk, qwk))
    qwk_results = total_qwk / 8
    print("XGB: The average QWK {:.3f}".format(qwk_results))
train, test = train_test_split(asap,test_size=0.25, random_state=42)
xgboost_fn(train, test, test)

 Prompt   Val    Test 
  1       0.815  0.815 
  2       0.687  0.687 
  3       0.673  0.673 
  4       0.748  0.748 
  5       0.802  0.802 
  6       0.808  0.808 
  7       0.772  0.772 
  8       0.700  0.700 
XGB: The average QWK 0.751
