In [1]:
import pandas as pd
import csv
from pathlib import Path

# Missing rows when loading data with pandas

In [2]:
raw_pd = pd.read_csv("raw.tsv", sep="\t", index_col=0)

In [3]:
print(raw_pd.shape)
raw_pd.head()

(9512, 1)


Unnamed: 0_level_0,sentence
id,Unnamed: 1_level_1
Acephalous-Cant-believe_4_47,I can't believe I wrote all that last year.
Acephalous-Cant-believe_83_354,Because I've been grading all damn day and am ...
Acephalous-Cant-believe_355_499,"However, when I started looking through my arc..."
Acephalous-Cant-believe_500_515,What do I mean?
Acephalous-Cant-believe_517_626,The posts I consider foundational to my curren...


### Compare against loading data with loop

In [4]:
raw_loop = pd.DataFrame(columns=["text"])
with open("raw.tsv", "r") as f:
    for line in f:
        i, text  = line.split("\t")
        raw_loop.loc[i] = text.strip()
raw_loop.index.rename("id", inplace=True)

In [5]:
print(raw_loop.shape)
raw_loop.head()

(10549, 1)


Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
id,sentence
Acephalous-Cant-believe_4_47,I can't believe I wrote all that last year.
Acephalous-Cant-believe_83_354,Because I've been grading all damn day and am ...
Acephalous-Cant-believe_355_499,"However, when I started looking through my arc..."
Acephalous-Cant-believe_500_515,What do I mean?


# Identify missing rows

In [6]:
missing  = list(set(raw_loop.index) - set(raw_pd.index))
print(len(missing))
missing[:10]

1037


['guidedogs1_2405_2449',
 'IFAW1_4515_4571',
 'defenders5_31_47',
 'wsj_0187_377_532',
 'NWF1_4379_4549',
 'AMC2_1323_1483',
 'marine1_36_48',
 'captured_moments_5506_5538',
 'NWF1_2524_2601',
 'wildelifewatch1_1608_1773']

Manual inspection showed that there is a problem with missing quotes for escaping.

# Reformat raw data

In [7]:
raw_loop.index.rename("id", inplace=True)
raw_loop.drop("id", axis=0, inplace=True)

In [8]:
raw_loop.to_csv("raw_refined.csv", quoting=csv.QUOTE_ALL)

# Try if new data format is easy to load

In [9]:
raw = pd.read_csv("raw_refined.csv", index_col=0)
print(raw.shape)
raw.head()

(10548, 1)


Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
Acephalous-Cant-believe_4_47,I can't believe I wrote all that last year.
Acephalous-Cant-believe_83_354,Because I've been grading all damn day and am ...
Acephalous-Cant-believe_355_499,"However, when I started looking through my arc..."
Acephalous-Cant-believe_500_515,What do I mean?
Acephalous-Cant-believe_517_626,The posts I consider foundational to my curren...


# Refining reader ratings

In [10]:
reader = pd.read_csv("reader.tsv", sep="\t", index_col=0)
reader.head()

Unnamed: 0_level_0,Arousal,Dominance,Valence,sd.Arousal,sd.Dominance,sd.Valence,freq
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
110CYL068_1036_1079,3.2,3.0,3.0,0.4,0.0,0.0,5
110CYL068_1079_1110,3.0,2.6,2.6,0.632456,0.489898,0.489898,5
110CYL068_1110_1127,2.333333,2.333333,2.0,0.471405,0.471405,1.414214,3
110CYL068_1127_1130,3.0,3.0,3.0,0.0,0.0,0.0,2
110CYL068_1137_1188,3.0,3.4,3.6,0.632456,0.489898,0.8,5


In [11]:
reader = reader.rename(columns={"Valence": "V", "Arousal": "A", "Dominance": "D",
                       "sd.Arousal": "stdA", "sd.Valence": "stdV", "sd.Dominance": "stdD",
                       "freq": "N"})
reader = reader[["V", "A", "D", "stdV", "stdA", "stdD", "N"]]
reader = reader.round(2)
reader.to_csv("reader_refined.csv", quoting=csv.QUOTE_ALL )


In [12]:
# Test if we can retrieve it easily
reader = pd.read_csv("reader_refined.csv", index_col=0)

In [13]:
print(reader.shape)
reader.head()

(10325, 7)


Unnamed: 0_level_0,V,A,D,stdV,stdA,stdD,N
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
110CYL068_1036_1079,3.0,3.2,3.0,0.0,0.4,0.0,5
110CYL068_1079_1110,2.6,3.0,2.6,0.49,0.63,0.49,5
110CYL068_1110_1127,2.0,2.33,2.33,1.41,0.47,0.47,3
110CYL068_1127_1130,3.0,3.0,3.0,0.0,0.0,0.0,2
110CYL068_1137_1188,3.6,3.0,3.4,0.8,0.63,0.49,5


# Refining writer ratings

In [14]:
writer = pd.read_csv("writer.tsv", sep="\t", index_col=0)

In [15]:
print(writer.shape)
writer.head()

(10279, 7)


Unnamed: 0_level_0,Arousal,Dominance,Valence,sd.Arousal,sd.Dominance,sd.Valence,freq
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
110CYL068_1036_1079,2.8,3.4,3.0,0.979796,0.489898,0.0,5
110CYL068_1079_1110,3.2,3.0,3.0,0.4,0.0,0.0,5
110CYL068_1127_1130,3.0,3.0,3.0,0.0,0.0,0.0,5
110CYL068_1137_1188,3.0,3.0,3.25,0.707107,0.0,0.433013,4
110CYL068_1189_1328,3.4,3.2,3.4,0.489898,0.4,0.489898,5


In [16]:
writer = writer.rename(columns={"Valence": "V", "Arousal": "A", "Dominance": "D",
                       "sd.Arousal": "stdA", "sd.Valence": "stdV", "sd.Dominance": "stdD",
                       "freq": "N"})
writer = writer[["V", "A", "D", "stdV", "stdA", "stdD", "N"]]
writer = writer.round(2)
writer.to_csv("writer_refined.csv", quoting=csv.QUOTE_ALL )

In [17]:
# Test if we can retrieve it easily
writer = pd.read_csv("writer_refined.csv", index_col=0)

In [18]:
print(writer.shape)
writer.head()

(10279, 7)


Unnamed: 0_level_0,V,A,D,stdV,stdA,stdD,N
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
110CYL068_1036_1079,3.0,2.8,3.4,0.0,0.98,0.49,5
110CYL068_1079_1110,3.0,3.2,3.0,0.0,0.4,0.0,5
110CYL068_1127_1130,3.0,3.0,3.0,0.0,0.0,0.0,5
110CYL068_1137_1188,3.25,3.0,3.0,0.43,0.71,0.0,4
110CYL068_1189_1328,3.4,3.4,3.2,0.49,0.49,0.4,5


# Creating combined table (average of reader and writer)

In [19]:
raw = pd.read_csv("raw_refined.csv", index_col=0)
writer = pd.read_csv("writer_refined.csv", index_col=0)
reader = pd.read_csv("reader_refined.csv", index_col=0)

In [20]:
common = sorted(list(set(writer.index).intersection(set(reader.index))))
print(len(common))

10062


In [21]:
combined = writer.loc[common] + reader.loc[common]
combined[["V", "A","D"]] = combined[["V", "A","D"]]/2
combined = combined.drop(["stdV", "stdA", "stdD", "N"], axis=1)

In [22]:
combined["text"] = raw.loc[common]

In [23]:
print(combined.shape)
combined.head()

(10062, 4)


Unnamed: 0_level_0,V,A,D,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
110CYL068_1036_1079,3.0,3.0,3.2,"Remember what she said in my last letter? """
110CYL068_1079_1110,2.8,3.1,2.8,If I wasn't working here.
110CYL068_1127_1130,3.0,3.0,3.0,".."""
110CYL068_1137_1188,3.425,3.0,3.2,Goodwill helps people get off of public assist...
110CYL068_1189_1328,3.535,3.285,3.435,Sherry learned through our Future Works class ...


### Quick sanity check:

In [24]:
for d in ["V", "A", "D"]:
    print("Min {}: {}".format(d, combined.loc[combined[d].argmin(), "text"]))
    print("Max {}: {}".format(d, combined.loc[combined[d].argmax(), "text"]))

Min V: "Fuck you"
Max V: lol Wonderful Simply Superb!
Min A: I was feeling calm and private that night.
Max A: "My God, yes, yes, yes!"
Min D: I shivered as I walked past the pale man’s blank eyes, wondering what they were staring at.
Max D: “NO”
