In [1]:
import pathlib
import zipfile

import pandas as pd

import utils

In [2]:
# The ASAP-AES archive is looked up under ./data relative to the working directory.
essay_zip = pathlib.Path.cwd().joinpath('data', 'asap-aes.zip')

with zipfile.ZipFile(essay_zip) as zf:
    # The integers handed to utils.stream_extract select archive members
    # (presumably by position in the archive listing -- confirm in utils).
    # Members 7, 5 and 11 are parsed as tab-separated, latin-1 encoded tables
    # and bound, in that order, to train / valid / test.
    tsvs = utils.stream_extract(zf, 7, 5, 11)
    train, valid, test = [
        pd.read_csv(tsv, delimiter='\t', encoding='latin-1')
        for tsv in tsvs
    ]

    # Member 4 is a CSV with 'prediction_id' and 'predicted_score' columns;
    # flatten it into a plain {prediction_id: predicted_score} dict.
    csv = next(utils.stream_extract(zf, 4))
    valid_map = pd.read_csv(csv).set_index('prediction_id')['predicted_score'].to_dict()

    # Member 1 is itself a zip archive; it is opened separately below.
    descr_zip = next(utils.stream_extract(zf, 1))

with zipfile.ZipFile(descr_zip) as zf:
    # Member 8 of the nested archive is the spreadsheet describing the essay sets.
    descr_xlsx = next(utils.stream_extract(zf, 8))
    descr = pd.read_excel(descr_xlsx)

    # Members 0-7 are passed through utils.get_prompt, one prompt per member.
    readmes = utils.stream_extract(zf, *range(8))
    prompts = list(map(utils.get_prompt, readmes))

# Descriptions

In [3]:
# Drop every column whose name contains 'size' or 'has'.
cols = [col for col in descr.columns if any(string in col for string in ['size', 'has'])]
descr = descr.drop(columns=cols)

# Cast the domain-2 score bounds to the nullable integer dtype so that rows
# without a second domain stay missing while the rest display as integers.
# The cast goes through DataFrame.astype rather than `descr[cols] = ...`:
# on pandas versions where list-key assignment writes into the existing
# float64 columns, that form silently keeps the float dtype.
domain2_min_max = ['min_domain2_score', 'max_domain2_score']
descr = descr.astype({col: "Int64" for col in domain2_min_max})

# Attach the prompt texts loaded earlier; `prompts` must have one entry per row.
descr['prompt'] = prompts

descr

Unnamed: 0,essay_set,type_of_essay,grade_level,min_domain1_score,max_domain1_score,min_domain2_score,max_domain2_score,prompt
0,1,persuasive / narrative / expository,8,2,12,,,"More and more people use computers, but not ev..."
1,2,persuasive / narrative / expository,10,1,6,1.0,4.0,"Censorship in the Libraries\n""All of us can th..."
2,3,source dependent responses,10,0,3,,,Write a response that explains how the feature...
3,4,source dependent responses,10,0,3,,,"Read the last paragraph of the story.\n\n""When..."
4,5,source dependent responses,8,0,4,,,Describe the mood created by the author in the...
5,6,source dependent responses,10,0,4,,,"Based on the excerpt, describe the obstacles t..."
6,7,persuasive / narrative / expository,7,0,30,,,Write about patience. Being patient means that...
7,8,persuasive / narrative / expository,10,0,60,,,We all understand the benefits of laughter. Fo...


# Train

In [4]:
# Remove every column whose name contains 'rater', then store the
# domain-2 score as a nullable integer so missing scores stay missing.
train = (
    train
    .drop(columns=[name for name in train.columns if 'rater' in name])
    .astype({'domain2_score': "Int64"})
)
train.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,domain2_score
0,1,1,"Dear local newspaper, I think effects computer...",8,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,


In [5]:
# Print column dtypes, non-null counts and memory usage of the cleaned training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12976 entries, 0 to 12975
Data columns (total 5 columns):
essay_id         12976 non-null int64
essay_set        12976 non-null int64
essay            12976 non-null object
domain1_score    12976 non-null int64
domain2_score    1800 non-null Int64
dtypes: Int64(1), int64(3), object(1)
memory usage: 519.6+ KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric training columns.
train.describe()

Unnamed: 0,essay_id,essay_set,domain1_score,domain2_score
count,12976.0,12976.0,12976.0,1800.0
mean,10295.395808,4.179485,6.800247,3.333889
std,6309.074105,2.136913,8.970705,0.729103
min,1.0,1.0,0.0,1.0
25%,4438.75,2.0,2.0,3.0
50%,10044.5,4.0,3.0,3.0
75%,15681.25,6.0,8.0,4.0
max,21633.0,8.0,60.0,4.0


# Validation

In [7]:
# Translate each prediction id into its score via valid_map, then discard
# the id columns. The domain-2 score is stored as a nullable integer, so
# rows without a domain-2 prediction id stay missing.
valid = (
    valid
    .assign(
        domain1_score=valid['domain1_predictionid'].map(valid_map),
        domain2_score=valid['domain2_predictionid'].map(valid_map).astype("Int64"),
    )
    .drop(columns=[name for name in valid.columns if 'predictionid' in name])
)
valid.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,domain2_score
0,1788,1,"Dear @ORGANIZATION1, @CAPS1 more and more peop...",7,
1,1789,1,Dear @LOCATION1 Time @CAPS1 me tell you what I...,8,
2,1790,1,"Dear Local newspaper, Have you been spending a...",9,
3,1791,1,"Dear Readers, @CAPS1 you imagine how life woul...",9,
4,1792,1,"Dear newspaper, I strongly believe that comput...",9,


In [8]:
# Print column dtypes, non-null counts and memory usage of the validation frame after scores were mapped in.
valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4218 entries, 0 to 4217
Data columns (total 5 columns):
essay_id         4218 non-null int64
essay_set        4218 non-null int64
essay            4218 non-null object
domain1_score    4218 non-null int64
domain2_score    600 non-null Int64
dtypes: Int64(1), int64(3), object(1)
memory usage: 169.0+ KB


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric validation columns.
valid.describe()

Unnamed: 0,essay_id,essay_set,domain1_score,domain2_score
count,4218.0,4218.0,4218.0,600.0
mean,11282.44642,4.123518,6.654576,3.331667
std,6173.633131,2.117188,8.799343,0.639541
min,1788.0,1.0,0.0,1.0
25%,5243.25,2.0,2.0,3.0
50%,10995.5,4.0,3.0,3.0
75%,16852.75,6.0,8.0,4.0
max,21938.0,8.0,50.0,4.0


# Test

In [10]:
# Preview the test split as loaded: it still carries prediction-id columns rather than scores.
test.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_predictionid,domain2_predictionid
0,2383,1,I believe that computers have a positive effec...,2383,
1,2384,1,"Dear @CAPS1, I know some problems have came up...",2384,
2,2385,1,"Dear to whom it @MONTH1 concern, Computers are...",2385,
3,2386,1,"Dear @CAPS1 @CAPS2, @CAPS3 has come to my atte...",2386,
4,2387,1,"Dear Local newspaper, I think that people have...",2387,


In [11]:
# Print column dtypes, non-null counts and memory usage of the unmodified test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4254 entries, 0 to 4253
Data columns (total 5 columns):
essay_id                4254 non-null int64
essay_set               4254 non-null int64
essay                   4254 non-null object
domain1_predictionid    4254 non-null int64
domain2_predictionid    600 non-null float64
dtypes: float64(1), int64(3), object(1)
memory usage: 166.2+ KB


In [12]:
# Summary statistics for the numeric test columns (these are all identifiers, not scores).
test.describe()

Unnamed: 0,essay_id,essay_set,domain1_predictionid,domain2_predictionid
count,4254.0,4254.0,4254.0,600.0
mean,11922.108134,4.146685,14460.824636,8378.0
std,6182.143429,2.132264,6896.586714,346.698716
min,2383.0,1.0,2383.0,7779.0
25%,5847.25,2.0,8716.5,8078.5
50%,11605.5,4.0,14605.5,8378.0
75%,17475.75,6.0,20475.75,8677.5
max,22242.0,8.0,25242.0,8977.0
