In [1]:
import pandas as pd
import nltk

In [2]:
sentence = """At eight o'clock on Thursday morning... Arthur didn't feel very good."""
tokens = nltk.word_tokenize(sentence)
tokens

['At',
 'eight',
 "o'clock",
 'on',
 'Thursday',
 'morning',
 '...',
 'Arthur',
 'did',
 "n't",
 'feel',
 'very',
 'good',
 '.']

In [3]:
tagged = nltk.pos_tag(tokens)
tagged[0:6]

[('At', 'IN'),
 ('eight', 'CD'),
 ("o'clock", 'NN'),
 ('on', 'IN'),
 ('Thursday', 'NNP'),
 ('morning', 'NN')]

In [4]:
# %load trainer.py
from pickle import dump
from pickle import load
import nltk
import sys

# Trains and saves a tagger model to a file
# @filename name of file to save to
# @train_set the tagged set to train on
def train_and_save(filename, train_set):
    outfile = open(filename, 'wb')
    t = nltk.UnigramTagger(train_set)
    dump(t, outfile, -1)
    outfile.close()

# Loads a tagger from file
# @filename
# @return tagger object
def load_tagger(filename):
    infile = open(filename, 'rb')
    t = load(infile)
    infile.close()
    return t

# Uncomment to train - Trains over ntlk brown corpus
#brown_train = nltk.corpus.brown.tagged_sents()
#train_and_save('models/brown_all.pkl',brown_train)

# Loads tagger, loop will take in sentence and return list of tagged tokens

def test_tagger():
    t = load_tagger('models/brown_all.pkl')
    while 1:
        print("Enter command:")
        cmd = input()
        tokens = nltk.word_tokenize(cmd)
        t.tag(tokens)
        tagged = t.tag(tokens)
        print(tagged)


In [14]:
# %load '../Statistics/statsop.py'
import pandas as pd
import os
import os.path
import fnmatch

class StatsOp:
   
    #constructor
    def __init__(self):
        self.isInitialized = False
        self.columns = []
        self.rows = []
        
#<--------------- Table Operations --------------->
    
    #update value
    def updateCell(self, col, row, value):
        if self.isInitialized:
            df = self.data
            df.set_value(row, col, value)
            self.data = df
        else:
            return None
    
    #append value (row) into given column
    def insertRow(self, col, value):
        if self.isInitialized:
            df = self.data
            dfTemp = pd.DataFrame(df.iloc[[0]])
            for i, row in dfTemp.iterrows():
                for colName in list(df):
                    dfTemp.loc[i, colName] = np.nan
            dfTemp.set_value(0, col, value)
            df = df.append(dfTemp)
            df = df.reset_index(drop=True)
            self.data = df
        else:
            return None
    
#<--------------- Getter & Setter Operations --------------->
    #getter and setter for operation
    def setOperation(self, op):
        self.operation = op
        
    def getOperation(self):
        return self.operation
 
    
    #getter & setter for the filename
    def setFilename(self, fName):
        self.fileName = fName
    
    def getFilename(self):
        return self.fileName
    
    #reads in csv file into a dataframe and stores that df as data
    #TODO: handle cases where the file is not in the format of csv...
    def setData(self, fName):
        #name.csv
        hasExtension = fName.find(".")
        status = False
        if hasExtension < 0:
            filename = fName + ".csv"
            for file in os.listdir('.'):
                if fnmatch.fnmatch(file, '*.csv'):
                    if filename == file:
                        fName = filename
                        status = True
        else:
            status = os.path.isfile(fName)

        if status:
            self.data = pd.read_csv(fName)
            self.isInitialized = True
        else:
            print("404 File:" + fName +" Not Found!!!!!!!!!")
        self.isInitialized = status
        return status
    
    
    def getData(self):
        if self.isInitialized:
            return self.data
        else:
            return None
        
        
#<--------------- Querying Operations --------------->
       
    def checkInitialized(self):
        return self.isInitialized
    
    #returns the mean for the given column
    def calculateColumnMean(self,col):
        df = self.data
        mean = df[col].mean()
        return mean
    
    #returns the mean for the given row
    def calculateRowMean(self, row):
        df = self.data
        mean = df.iloc[row].mean()
        return mean
    
    #returns an array of the mean for each column
    def calculateColumnsMean(self):
        if self.isInitialized:
            dataframe = self.data
            columns = self.columns
            result = []
            for col in columns:
                mean = dataframe[col].mean()
                result.append(mean)
                print(col + " " + str(mean))
            return result
        else:
            return None
       
    def describeColumn(self):
        if self.isInitialized:
            dataframe = self.data
            columns = self.columns
            result = []
            for col in columns:
                #description is a dataframe
                description = dataframe[col].describe()
                result.append(description)
            return result
        else:
            return None
    
     #returns an array of the mean for each row
    def calculateRowsMean(self):
        if self.isInitialized:
            dataframe = self.data
            rows = self.rows
            result = []
            for row in rows:
                mean = dataframe.iloc[row].mean()
                result.append(mean)
            return result
        else:
            return None

        
#<--------------- Print Operations --------------->

    #print column
    def printColumn(self,col):
        if self.isInitialized:
            df = self.data
            print(df[col])
        else:
            print("No Data Available")
    
    
    #print row
    def printRow(self,row):
        if self.isInitialized:
            df = self.data
            print(df.iloc[[row]])
        else:
            print("No Data Available")
        
    #List of all column and row names
    def getColumnNames(self):
        if self.isInitialized:
            df = self.data
            return list(df)
        else:
            return None
    
    def getRowNames(self):
        if self.isInitialized:
            df = self.data
            return list(df.index)
        else:
            return None
    
    
#<--------------- Array Operations --------------->

    #Unused for now...
    
    #getter & setter for columns
    #set column array = array
    def setColumns(self, cols):
        self.columns = cols
        
    #add columns to the array
    def addColumn(self, col):
        self.columns.append(col)
    
    def getColumns(self):
        return self.columns
    
    #getter & setter for row
    #set array = array
    def setRows(self, rows):
        self.rows = rows
    #add individual rows    
    def addRow(self, row):
        self.rows.append(row)
        
    def getRows(self):
        return self.rows
    
#<--------------- Misc Operations --------------->
   
    #example function    
    def testFunc(self):
        return 'hello world'
    

In [47]:
# %load synthesis.py
from nltk import word_tokenize
from nltk import tag
import sys
import string

import trainer
# sys.path.insert(0, '../Statistics')
# from statsop import StatsOp


class Synthesizer:

    def __init__(self):
        self.stats = StatsOp()
        self.tagger = trainer.load_tagger('models/brown_all.pkl')

        self.labels = {}
        self.labels['verb'] = 'VB'
        self.labels['noun'] = 'NN'

        self.specialNouns = []
        self.specialNouns.append('data')
        self.specialNouns.append('row')
        self.specialNouns.append('col')
        self.specialNouns.append('column')
        
        self.commandStack = []

    def tokenize(self, cmd):
        return word_tokenize(cmd)

    def tag(self, tokens):
        return self.tagger.tag(tokens)

    def synonym_look_up(self,word):
        #check if word is in synonynm
        #return the appropriatly mapped word
        return word
    
    def print_requested (self, objs):
        print("Printing Requested")
    
        
    # attempt to initialize stats
    def read_data_cmd(self, tokens): 
        if tokens[0][0].lower() == 'read': #handles case of reading in data
            # next argument must be filename, attempt to set up the data
            if len(tokens) == 2:
                return self.stats.setData(tokens[1][0])
    
    def print_data_cmd(self, tokens):
        print ("attempting to print..")
        command = self.build_command(tokens)
        print(self.commandStack)
        if tokens[0][0].lower() == 'show': # handles cases of printing or showing data
            [print_requested(n) for n in self.commandStack]
            
    def run_data_cmd(self, tokens):
        if tokens[0][0].lower() == 'command': # handles cases statistic commands on data
            print("printing out requested data")
            

    # parse a noun or none tag and see what it is
    def parse_noun_or_none(self, nn):
        val = None

        # test if int
        try:
            val = int(nn)
        except ValueError:
            pass

        # test if int
        try:
            val = float(nn)
        except ValueError:
            pass

        # we now know val is a variable
        if val is None: # check if nn is even a column or row name
            pass

    # checks to see if arg is a name inside list, row or col
    # returns True if it is
    def check_name_in_list(self, arg, li):
        if arg in li:
            return True
        else:
            return False
        
    def build_command(self, tokens):
        command = []
        previousNoun = False
        for pair in tokens:
            if(pair[1] == "VB"): #action to perform
                command.append(pair[0])
                self.commandStack.append(command)
                command = []
                
            if(pair[1] == "CD" and previousNoun): #cardinal number (or location) after a given noun
                command.append(pair[0])

            if(pair[1][0] == "N"): #noun
                command.append(pair[0])
                previousNoun = True
            else:
                previousNoun = False
            
            if(pair[1] == "CC"):
                self.commandStack.append(command)
                command = []
                
        self.commandStack.append(command)
    
    def synthesize(self, tagged, cmd):
        stats = self.stats
        labels = self.labels

        # check if we've already initialized
        '''
        if stats.checkInitialized():

            # Check if cmd is a row or column
            cols = stats.getColumnNames()
            rows = stats.getRowNames()

            if check_name_in_list(cmd, cols):
                print('name found in cols')
            elif check_name_in_list(cmd, rows):
                print('name found in rows')
        '''

        # check if command 
        if tagged[0][1] == labels['verb']:
            print("Processing Verb")
            # Try to interpret this as a read initialization command
            if not stats.checkInitialized():
                self.read_data_cmd(tagged)
            else:
                print("Trying a non read command")
                command = self.synonym_look_up(tagged[0][0].lower())
                if command == "show":
                    self.print_data_cmd(tagged)
                pass # test printing column/row commands here?

        if tagged[0][1] == labels['noun']:

            print("Processing Noun")
            if not stats.checkInitialized():
                pass
            else:
                pass


In [48]:
# initialize synthesizer
s = Synthesizer()

In [None]:
# Read commands
while(1):
    print('Enter command:')
    cmd = input()
    tokens = s.tokenize(cmd)
    tagged = s.tag(tokens)

    print(tagged)
    s.synthesize(tagged, cmd)
    if(cmd == "quit"):
        break

Enter command:
read dummydata
[('read', 'VB'), ('dummydata', None)]
Processing Verb
Enter command:
show me column 1 and column 2
[('show', 'VB'), ('me', 'PPO'), ('column', 'NN'), ('1', 'CD'), ('and', 'CC'), ('column', 'NN'), ('2', 'CD')]
Processing Verb
Trying a non read command
attempting to print..
[['show'], ['column', '1'], ['column', '2']]
['show']
['column', '1']
['column', '2']
Enter command:
