## Exploratory Analysis

Exploratory analysis of SemEval 2017 dataset. Don't download files as I've already got them 

In [22]:
import os
from glob import glob
import pandas as pd

files = glob("../data/SemEval2017/GOLD/Subtask_A/*.txt", recursive=True)
files

['../data/SemEval2017/GOLD/Subtask_A/twitter-2016test-A.txt',
 '../data/SemEval2017/GOLD/Subtask_A/twitter-2013dev-A.txt',
 '../data/SemEval2017/GOLD/Subtask_A/twitter-2015test-A.txt',
 '../data/SemEval2017/GOLD/Subtask_A/twitter-2014test-A.txt',
 '../data/SemEval2017/GOLD/Subtask_A/twitter-2013train-A.txt',
 '../data/SemEval2017/GOLD/Subtask_A/twitter-2016dev-A.txt',
 '../data/SemEval2017/GOLD/Subtask_A/twitter-2013test-A.txt',
 '../data/SemEval2017/GOLD/Subtask_A/twitter-2016devtest-A.txt',
 '../data/SemEval2017/GOLD/Subtask_A/twitter-2016train-A.txt',
 '../data/SemEval2017/GOLD/Subtask_A/twitter-2014sarcasm-A.txt',
 '../data/SemEval2017/GOLD/Subtask_A/README.txt',
 '../data/SemEval2017/GOLD/Subtask_A/twitter-2015train-A.txt']

In [23]:
import pandas as pd

def read_table(path):
    """
    Read SemEval table and return dataframe
    """
    df = pd.read_table(path, header=None)
    # Get rid of last column

    if len(df.columns) > 3:
        del df[3]
    df.columns = ["id", "label", "text"]
    #df.set_index("id", inplace=True)
    return df

pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 100

read_table(files[0])



Unnamed: 0,id,label,text
0,619950566786113536,neutral,"Picturehouse's, Pink Floyd's, 'Roger Waters: The Walll - opening 29 Sept is now making waves. Watch the trailer on Rolling Stone - look..."
1,619969366986235905,neutral,Order Go Set a Watchman in store or through our website before Tuesday and get it half price! #GSAW @GSAWatchmanBook https://t.co/KET6EGD1an
2,619971047195045888,negative,"If these runway renovations at the airport prevent me from seeing Taylor Swift on Monday, Bad Blood will have a new meaning."
3,619974445185302528,neutral,"If you could ask an onstage interview question at Miss USA tomorrow, what would it be?"
4,619987808317407232,positive,A portion of book sales from our Harper Lee/Go Set a Watchman release party on Mon. 7/13 will support @CAP_Tulsa and the great work they do.
...,...,...,...
20627,681877834982232064,neutral,"@ShaquilleHoNeal from what I think you're asking, in no order. Future, Drake, Thug, Cole, Kendrick and Tiller a close 6th"
20628,681879579129200640,positive,"Iran ranks 1st in liver surgeries, Allah bless the country."
20629,681883903259357184,neutral,"Hours before he arrived in Saudi Arabia on Tuesday, Turkish President Recep Tayyip Erdogan accused Syria's president of ""mercilessly""..."
20630,681904976860327936,negative,@VanityFair Alex Kim Kardashian worth how to love Kim Kardashian she's so bad Sun Conure to


In [35]:
train_files = [f for f in files if any(t in f for t in {"train"})]
dev_files = [f for f in files if any(t in f for t in {"devtest", "dev"})]
test_files = [f for f in files if any(t in f and "devtest" not in f for t in {"test"} )]

print("Train files : ", train_files)

print("Dev files   : ", dev_files)

print("Test files  : ", test_files)


Train files :  ['../data/SemEval2017/GOLD/Subtask_A/twitter-2013train-A.txt', '../data/SemEval2017/GOLD/Subtask_A/twitter-2016train-A.txt', '../data/SemEval2017/GOLD/Subtask_A/twitter-2015train-A.txt']
Dev files   :  ['../data/SemEval2017/GOLD/Subtask_A/twitter-2013dev-A.txt', '../data/SemEval2017/GOLD/Subtask_A/twitter-2016dev-A.txt', '../data/SemEval2017/GOLD/Subtask_A/twitter-2016devtest-A.txt']
Test files  :  ['../data/SemEval2017/GOLD/Subtask_A/twitter-2016test-A.txt', '../data/SemEval2017/GOLD/Subtask_A/twitter-2015test-A.txt', '../data/SemEval2017/GOLD/Subtask_A/twitter-2014test-A.txt', '../data/SemEval2017/GOLD/Subtask_A/twitter-2013test-A.txt']


In [36]:

train_df = pd.concat([read_table(f) for f in train_files])
train_df["slice"] = "train"
dev_df = pd.concat([read_table(f) for f in dev_files])
dev_df["slice"] = "dev"
test_df = pd.concat([read_table(f) for f in test_files])
test_df["slice"] = "test"

df = pd.concat([train_df, dev_df, test_df])
print("Len train : ", train_df.shape)
print("Len dev   : ", dev_df.shape)
print("Len test  : ", test_df.shape)


Len train :  (16041, 4)
Len dev   :  (5620, 4)
Len test  :  (28422, 4)


There are repeated rows => remove duplicates

In [37]:
df.drop_duplicates("id", inplace=True)

train_df = df[df["slice"] == "train"]
dev_df = df[df["slice"] == "dev"]
test_df = df[df["slice"] == "test"]
print("Len train : ", train_df.shape)
print("Len dev   : ", dev_df.shape)
print("Len test  : ", test_df.shape)

Len train :  (15493, 4)
Len dev   :  (5582, 4)
Len test  :  (28244, 4)
