# Automated Essay Scoring
Machine Learning Challenge by [Kaggle](https://www.kaggle.com/competitions/learning-agency-lab-automated-essay-scoring-2/overview)

In [175]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

In [176]:
# Load the training set (one row per essay: essay_id, full_text, score).
df = pd.read_csv('data/train.csv')

In [177]:
# Preview the first rows to see what the raw columns look like.
df.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [178]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17307 entries, 0 to 17306
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   essay_id   17307 non-null  object
 1   full_text  17307 non-null  object
 2   score      17307 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 405.8+ KB


In [179]:
# Summary statistics of the numeric columns (here only `score`).
df.describe()

Unnamed: 0,score
count,17307.0
mean,2.948402
std,1.044899
min,1.0
25%,2.0
50%,3.0
75%,4.0
max,6.0


In [180]:
# Number of missing values per column.
df.isna().sum()

essay_id     0
full_text    0
score        0
dtype: int64

In [181]:
# Class balance: how many essays received each score (most frequent first).
df['score'].value_counts()

score
3    6280
2    4723
4    3926
1    1252
5     970
6     156
Name: count, dtype: int64

## Evaluation using the Quadratic Weighted Kappa

In [182]:
def _as_rating_array(values, name):
    """Return ``values`` as a flat int64 array, rejecting non-integer ratings."""
    arr = np.asarray(values).ravel()
    if arr.dtype.kind in "iu":
        return arr.astype(np.int64)
    as_float = arr.astype(float)
    if not np.all(np.isfinite(as_float)) or np.any(as_float != np.round(as_float)):
        raise ValueError(f"{name} must contain only integer ratings")
    return as_float.astype(np.int64)


def quadratic_weighted_kappa(y_true, y_pred, min_rating=None, max_rating=None):
    """
    Computes the quadratic weighted kappa between two sets of integer ratings.

    kappa = 1 - sum(W * O) / sum(W * E), where O is the observed confusion
    matrix, E is the matrix expected from the two marginal rating
    distributions, and W[i, j] = (i - j) ** 2.

    Parameters
    ----------
    y_true, y_pred : array-like of int
        Ratings from the two raters, aligned item by item. Whole-valued
        floats are accepted; any other non-integer value is rejected.
    min_rating, max_rating : int, optional
        Inclusive bounds of the rating scale. Each defaults to the smallest /
        largest value found in ``y_true`` and ``y_pred``. Items whose true or
        predicted rating falls outside the bounds are ignored.

    Returns
    -------
    float
        1.0 for perfect agreement, about 0.0 for chance-level agreement,
        negative for worse than chance. When every in-range item received the
        same single rating from both raters, kappa is formally 0/0; 1.0 is
        returned, since the two raters agree on every item.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in length, contain non-integer
        ratings, if ``max_rating < min_rating``, or if no item lies within
        the rating bounds.
    """
    true = _as_rating_array(y_true, "y_true")
    pred = _as_rating_array(y_pred, "y_pred")
    if true.shape != pred.shape:
        raise ValueError("y_true and y_pred must have the same length")
    if true.size == 0:
        raise ValueError("y_true and y_pred must not be empty")

    if min_rating is None:
        min_rating = min(true.min(), pred.min())
    if max_rating is None:
        max_rating = max(true.max(), pred.max())
    min_rating, max_rating = int(min_rating), int(max_rating)
    if max_rating < min_rating:
        raise ValueError("max_rating must be >= min_rating")
    num_ratings = max_rating - min_rating + 1

    # Keep only the items where both ratings are on the scale, so that the
    # confusion matrix, its marginals and the item count all describe the
    # same set of items.
    in_range = (
        (true >= min_rating) & (true <= max_rating)
        & (pred >= min_rating) & (pred <= max_rating)
    )
    if not in_range.any():
        raise ValueError(
            "no (y_true, y_pred) pair lies within [min_rating, max_rating]"
        )
    rows = true[in_range] - min_rating
    cols = pred[in_range] - min_rating

    # Confusion matrix in one pass: encode each (row, col) pair as a flat index.
    conf_mat = np.bincount(
        rows * num_ratings + cols, minlength=num_ratings * num_ratings
    ).reshape(num_ratings, num_ratings)

    # Marginals are taken from the confusion matrix itself rather than being
    # recomputed independently, so they can never disagree with it.
    num_scored_items = conf_mat.sum()
    expected_mat = (
        np.outer(conf_mat.sum(axis=1), conf_mat.sum(axis=0)) / num_scored_items
    )

    # Squared rating distance. The usual 1 / (num_ratings - 1) ** 2 factor is
    # omitted: it cancels in the ratio below and would divide by zero when
    # the scale has a single rating.
    ratings = np.arange(num_ratings)
    weight_mat = (ratings[:, None] - ratings[None, :]) ** 2

    observed_disagreement = np.sum(weight_mat * conf_mat)
    expected_disagreement = np.sum(weight_mat * expected_mat)
    if expected_disagreement == 0:
        # Both raters used one and the same rating for every item, so the
        # observed disagreement is 0 as well.
        return 1.0
    return float(1.0 - observed_disagreement / expected_disagreement)