# Introduction
#### Author: Kaj Meijer
This notebook contains code that was used to analyse the neg-raising data and retrieve the standard deviation and mean for **The relation of responsivity and neg-raising**

In [1]:
# import block
import pandas as pd
import numpy as np

In [2]:
# load the negraising dataset
df_negraising = pd.read_csv('data/mega-negraising-v1.tsv', sep="\t")

In [3]:
class NegRaising:
    """Aggregate neg-raising judgements into per-verb mean scores.

    The dataframe handed to the constructor must expose the columns
    ``verb``, ``frame``, ``tense``, ``subject`` and ``negraising``.
    For every verb the class computes the mean ``negraising`` value per
    frame/tense/subject combination, per tense, per subject and overall.
    """

    def __init__(self, dataframe):
        """Store the judgements and the distinct values of each factor.

        @param pd.DataFrame  The neg-raising dataframe
        """
        self._dataframe = dataframe

        # distinct factor values, in order of first appearance
        self._verbs = dataframe.verb.unique()
        self._frames = dataframe.frame.unique()
        self._tenses = dataframe.tense.unique()
        self._subjects = dataframe.subject.unique()

        # lazily computed caches (None means "not computed yet")
        self._columns = None
        self._averages = None

    def _mean_negraising(self, mask):
        """Return the mean ``negraising`` value of the rows selected by mask.

        @param  pd.Series  Boolean row mask over the stored dataframe
        @return float      The mean of the non-NaN judgements, or 0 when
                           no judgement is left after dropping NaN values
        """
        cleaned = self._dataframe.loc[mask].negraising.dropna().values

        # an empty selection is reported as 0 rather than NaN
        if not len(cleaned):
            return 0

        return np.mean(cleaned)

    def create_columns(self):
        """Create (and cache) the column names of the resulting dataframe.

        The order is: 'verb', one column per frame/tense/subject
        combination, one column per tense, one column per subject and
        finally 'total'. It matches the order used by verb_averages.

        @return np.array  List with the column names of the analysed data
        """
        # compare against None: the cache is a numpy array, whose truth
        # value is ambiguous and raises a ValueError
        if self._columns is not None:
            return self._columns

        columns = ['verb']

        # column names based on the frame, subject and tense
        for frame in self._frames:
            for tense in self._tenses:
                for subject in self._subjects:
                    columns.append(frame+'{'+subject+", "+tense+'}')

        # column names for the tenses and the subjects
        columns.extend("tense: "+tense for tense in self._tenses)
        columns.extend("subject: "+subject for subject in self._subjects)

        # the total column goes at the end
        columns.append('total')

        self._columns = np.array(columns)
        return self._columns

    def frame_average(self, verb, frame, subject = 'third', tense = 'past'):
        """Get the average score of a verb in a frame, subject and tense.

        @param  string    The verb
        @param  string    The frame
        @param  string    The subject
        @param  string    The tense
        @return float     The mean of the frame (0 if there is no data)
        """
        data = self._dataframe
        return self._mean_negraising(
            (data.verb == verb) & (data.frame == frame) &
            (data.tense == tense) & (data.subject == subject)
        )

    def tense_averages(self, verb, tense):
        """Get the average score of a verb in a tense.

        @param  string    The verb
        @param  string    The tense
        @return float     The mean of the tense (0 if there is no data)
        """
        data = self._dataframe
        return self._mean_negraising((data.verb == verb) & (data.tense == tense))

    def subject_averages(self, verb, subject):
        """Get the average score of a verb with a subject.

        @param  string    The verb
        @param  string    The subject
        @return float     The mean of the subject (0 if there is no data)
        """
        data = self._dataframe
        return self._mean_negraising((data.verb == verb) & (data.subject == subject))

    def total_average(self, verb):
        """Get the average score of a verb over all its judgements.

        @param  string    The verb
        @return float     The mean of the verb (0 if there is no data)
        """
        return self._mean_negraising(self._dataframe.verb == verb)

    def verb_averages(self, verb):
        """Get all the means of a single verb.

        The result holds the verb itself, followed by the means of all
        frame-tense-subject combinations, the tense means, the subject
        means and the total mean, in the order of create_columns.

        @param  string    The verb
        @return list      The verb followed by its means
        """
        averages = [verb]

        # the subject and tense have to be passed on explicitly, otherwise
        # every combination gets the default third/past mean
        for frame in self._frames:
            for tense in self._tenses:
                for subject in self._subjects:
                    averages.append(
                        self.frame_average(verb, frame, subject=subject, tense=tense)
                    )

        # the means for the tenses
        for tense in self._tenses:
            averages.append(self.tense_averages(verb, tense))

        # the means for the subjects
        for subject in self._subjects:
            averages.append(self.subject_averages(verb, subject))

        # the total mean
        averages.append(self.total_average(verb))

        return averages

    def averages(self):
        """Get (and cache) the averages of all verbs.

        Every row mixes the verb name with its means, so numpy stores all
        cells of the array as strings.

        @return np.array  One row per verb, laid out as in verb_averages
        """
        # compare against None: the cache is a numpy array, whose truth
        # value is ambiguous and raises a ValueError
        if self._averages is not None:
            return self._averages

        self._averages = np.array([self.verb_averages(verb) for verb in self._verbs])
        return self._averages

    def reset_averages(self):
        """Reset the averages so they will be calculated again."""
        self._averages = None

    def to_dataframe(self):
        """Convert the averages to a pandas dataframe.

        @return pd.DataFrame  One row per verb, with the columns of
                              create_columns
        """
        return pd.DataFrame(self.averages(), columns=self.create_columns())

In [4]:
# initialize the object
nr = NegRaising(df_negraising)

In [5]:
# create the neg-raising dataframe
df = nr.to_dataframe()

In [6]:
# write the dataframe to a file
df.to_csv('data/negraising-averages.csv')

In [7]:
# get the mean of the total values
np.mean(list(map(float, df.total.values)))

0.356346397000897

In [8]:
# get the standard deviation of the total values
np.std(list(map(float, df.total.values)))

0.0835944835726565

In [9]:
# get the mean of the first subject values
np.mean(list(map(float, df['subject: first'].values)))

0.3466339809289809

In [10]:
# get the mean of the third subject values
np.mean(list(map(float, df['subject: third'].values)))

0.36605881307281307

In [11]:
# get the mean of the present tense values
np.mean(list(map(float, df['tense: present'].values)))

0.3092116216216216

In [12]:
# get the mean of the past tense values
np.mean(list(map(float, df['tense: past'].values)))

0.3509744324324324

In [13]:
# get the mean of the NP V that S frame in the third subject and past tense
np.mean(list(map(float, df['NP V that S{third, past}'].values)))

0.2170064864864865

In [14]:
# get the standard deviation of the NP V that S frame in the third subject and past tense
np.std(list(map(float, df['NP V that S{third, past}'].values)))

0.1958441077809595

In [15]:
# get the max of the NP V that S frame in the third subject and past tense
np.max(list(map(float, df['NP V that S{third, past}'].values)))

0.754

In [16]:
# get the values of our frame as floats
nr_values = list(map(float, df['NP V that S{third, past}'].values))

# sort the values from the highest to the lowest
sorted_values = sorted(nr_values, reverse=True)

# get the total number of predicates
total = len(df_negraising.verb.unique())

# get the threshold for the top predicates: the value at position
# round(total / 20) of the descending list, i.e. about the top 5%
# NOTE(review): this comment used to say "top 4%", but total / 20 is 5% -
# confirm which percentage was intended
print(sorted_values[round(total/ 20)])

0.5109999999999999
