In [1]:
import configparser
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Parse the settings file. read() is the last expression of the cell, so its
# return value (the list of files that were loaded) is what the cell displays.
config = configparser.ConfigParser()
config.read('clustertab.config')

['clustertab.config']

In [3]:
# Pull the PostgreSQL connection settings out of the [POSTGRES] section.
pgSection = config['POSTGRES']
db = pgSection['PG_DB']
user = pgSection['PG_UNAME']
passwd = pgSection['PG_PASS']
port = pgSection['PG_PORT']
host = pgSection['PG_HOST']

In [4]:
db

'tabmcq'

In [5]:
# Connection URL handed to pandas (pd.read_sql) by the helper functions below.
credentials = "postgresql://{}:{}@{}:{}/{}".format(user,passwd,host,port,db)

#using psycopg2 to test connection since there are no tables
import psycopg2

try:
    conn = psycopg2.connect(host=host,dbname=db,user=user,password=passwd,port=port)
except Exception as e:
    # Report the failure and stop here: without `conn` the next statement
    # could only fail with an unrelated NameError.
    print(e)
    raise

# Every statement executed on this connection is committed immediately.
conn.set_session(autocommit=True)

try:
    cur = conn.cursor()
except Exception as e:
    # The exception has to be bound in this clause; a name bound by an earlier
    # `except ... as e` no longer exists once that clause has finished.
    print(e)
    raise

In [6]:
# NOTE(review): displaying this value writes the full connection URL, password
# included, into the saved notebook output - clear the output before sharing.
credentials

'postgresql://postgres:1234@172.17.0.2:5432/tabmcq'

In [7]:
#Helper functions to work with the database
def schemaGen(dataframe, schemaName):
    """Return a one-line CREATE TABLE statement derived from ``dataframe``.

    The DDL generated by pandas is rewritten with plain substring
    substitutions, applied in this order: TEXT -> VARCHAR(255),
    INTEGER -> NUMERIC, newlines removed, double quotes removed.
    Because the substitutions are textual they also affect column names
    that contain those substrings.
    """
    ddl = pd.io.sql.get_schema(dataframe, schemaName)
    substitutions = (
        ('TEXT', 'VARCHAR(255)'),
        ('INTEGER', 'NUMERIC'),
        ('\n', ''),
        ('"', ""),
    )
    for old, new in substitutions:
        ddl = ddl.replace(old, new)
    return ddl

#Using pandas read_sql for getting schema
def getSchema(tableName, credentials):
    """Return the information_schema.columns rows for ``tableName``.

    ``credentials`` is passed to pandas as the ``con`` argument. The table
    name is interpolated directly into the SQL text, so it must come from a
    trusted source.
    """
    columnQuery = """SELECT * FROM information_schema.columns where table_name='{}'""".format(tableName)
    return pd.read_sql(columnQuery, con=credentials)

#pd.read_sql ran into issues when used to write data to the database, so psycopg2 is used for those statements
def queryTable(query):
    """Execute ``query`` on the module-level cursor ``cur`` for its side effects.

    Always returns None. An exception raised while executing is printed and
    not propagated to the caller.
    """
    try:
        cur.execute(query)
    except Exception as e:
        print(e)
    return None
        
#This doesn't return anything

#Using the pd.read_sql for getting data from db
def queryBase(query):
    """Run ``query`` through pandas and return the result set as a DataFrame.

    The connection is taken from the module-level ``credentials`` value.
    """
    return pd.read_sql(query, con=credentials)

#This returns the dataframe

In [6]:
# Load the multiple-choice questions from the tab-separated file.
tabmcq = pd.read_table('MCQs.tsv', sep='\t')

In [7]:
tabmcq.head()

Unnamed: 0,QUESTION,QUESTION-ALIGNMENT,CHOICE 1,CHOICE 2,CHOICE 3,CHOICE 4,CORRECT CHOICE,RELEVANT TABLE,RELEVANT ROW,RELEVANT COL
0,Which orbital event is the day with the longes...,123,Winter solstice,Fall equinox,Spring equinox,Summer solstice,4,regents-01,1,1
1,The _______ is the day with the longest period...,13,summer solstice,winter solstice,spring equinox,fall equinox,1,regents-01,1,1
2,What is the orbital event with the longest day...,12345,Summer solstice,Winter solstice,Spring equinox,Fall equinox,1,regents-01,1,1
3,The summer solstice has the _______ period of ...,123,longest,shortest,midrange,smallest,1,regents-01,1,3
4,The summer solstice has what period of dayligh...,13,Shortest,Longest,Midrange,,2,regents-01,1,3


In [8]:
tabmcq.shape

(9092, 10)

In [10]:
tabmcq["RELEVANT TABLE"].unique()

array(['regents-01', 'regents-02', 'regents-03', 'regents-04',
       'regents-05&09', 'regents-06', 'regents-07', 'regents-08',
       'regents-10', 'regents-11&12', 'regents-13', 'regents-14',
       'regents-15', 'regents-16', 'regents-17', 'regents-19',
       'regents-20', 'regents-21', 'regents-22', 'regents-23',
       'regents-24', 'regents-25&26', 'regents-27', 'regents-28',
       'regents-29', 'regents-30', 'regents-31', 'regents-32',
       'regents-33', 'regents-34', 'regents-35', 'regents-36',
       'regents-37', 'regents-38', 'regents-39', 'regents-40',
       'regents-41', 'regents-42', 'regents-43', 'monarch-44',
       'monarch-45', 'monarch-46', 'monarch-47', 'monarch-48',
       'monarch-49', 'monarch-50', 'monarch-51', 'monarch-52',
       'monarch-53', 'monarch-54', 'monarch-55', 'monarch-56',
       'monarch-57', 'monarch-58', 'monarch-60', 'monarch-61',
       'monarch-62', 'monarch-63', 'monarch-64', 'monarch-65',
       'monarch-66', 'monarch-67'], dtype=obje

In [15]:
#check one of the supporting files
# boto3 is imported here rather than in the import cell at the top.
import boto3

# Separate parser for the AWS settings; the cell displays the list of files
# that read() loaded.
awsConf = configparser.ConfigParser()
awsConf.read('calter.config')

['calter.config']

In [20]:
# AWS access key, secret and region, all taken from the [AWS] section.
awsSection = awsConf['AWS']
awskey = awsSection['KEY']
awssecret = awsSection['SECRET']
awsregion = awsSection['REGION']

In [22]:
# Athena client authenticated with the key, secret and region read from the AWS config.
awsClientSettings = {
    "aws_access_key_id": awskey,
    "aws_secret_access_key": awssecret,
    "region_name": awsregion,
}
athenaClient = boto3.client("athena", **awsClientSettings)


In [38]:
Dict = {}
def download_and_load_query_results(client, query_response, poll_seconds=2):
    """Wait for an Athena query to finish, download its CSV result and load it.

    Parameters
    ----------
    client
        Athena client; only ``get_query_results`` is called on it.
    query_response
        Mapping that contains the ``"QueryExecutionId"`` of the query.
    poll_seconds
        Seconds to sleep between polls while the query is still running.

    Returns
    -------
    pandas.DataFrame
        Contents of the result file, read from the local copy
        ``athena_query_results.csv``.

    Raises
    ------
    Exception
        Any error from ``get_query_results`` whose message does not contain
        "not yet finished" is re-raised unchanged.

    Notes
    -----
    The S3 client is built from the module-level ``awskey``, ``awssecret`` and
    ``awsregion`` values, and the result is fetched from the
    ``tab-mcq-de-output`` bucket.
    """
    # Imported locally: the notebook's import cell does not bring in `time`,
    # which the retry path below needs.
    import time

    execution_id = query_response["QueryExecutionId"]

    # Poll until fetching the results stops failing with "not yet finished".
    while True:
        try:
            client.get_query_results(QueryExecutionId=execution_id)
            break
        except Exception as err:
            if "not yet finished" not in str(err):
                raise
            time.sleep(poll_seconds)

    temp_file_location = "athena_query_results.csv"
    s3_client = boto3.client("s3",
                            aws_access_key_id=awskey,
                            aws_secret_access_key=awssecret,
                            region_name=awsregion)
    # The result object is named after the query execution id.
    s3_client.download_file(
        "tab-mcq-de-output",
        f"{execution_id}.csv",
        temp_file_location)
    return pd.read_csv(temp_file_location)


In [53]:
#The database will hold the table, and the data will be queried from the Table
# Start the Athena query; results are written to the tab-mcq-de-output bucket
# with SSE_S3 encryption.
# NOTE(review): the saved output of this cell is a ClientError (invalid security
# token) and its execution count (53) is later than the cells that consume
# `response` - the value used downstream presumably came from an earlier,
# successful run.
response = athenaClient.start_query_execution(
    QueryString = "SELECT * FROM tables;",
    QueryExecutionContext={"Database":"tabmcqhoc"},
    ResultConfiguration={
        "OutputLocation":"s3://tab-mcq-de-output/",
        "EncryptionConfiguration":{"EncryptionOption":"SSE_S3"}    
    })

ClientError: An error occurred (UnrecognizedClientException) when calling the StartQueryExecution operation: The security token included in the request is invalid.

In [44]:
getPartitionData = download_and_load_query_results(athenaClient,response)

In [45]:
getPartitionData.shape

(8302, 10)

In [47]:
getPartitionData.head(15)

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,partition_0
0,,type of population or resource,"(increases, decreases)",,type of population or resource,"(increases, decreases)",,,,monarch
1,When,predators,increase,", the result is that",prey,decrease,,,,monarch
2,When,prey,increase,", the result is that",prey,increase,,,,monarch
3,When,the amount of food,increases,", the result is that",the population,increases,,,,monarch
4,When,the amount of resources,decreases,", the result is that",competition,increases,,,,monarch
5,Organism,Steps in life cycle,,,,,,,,regents
6,frog,"egg, tadpole, adult",,,,,,,,regents
7,butterfly,"egg, larva, chrysalis, adult",,,,,,,,regents
8,,CHARACTERISTIC Physical characteristic of an a...,,"INHERITED? Is the characterstic inherited, lea...",,,,,,regents
9,An,facial scar,is,acquired,,,,,,regents


In [50]:
getPartitionData.loc[150:180]

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,partition_0
150,a scientist,measure,the volume of a liquid,,,,,,,auto
151,living things,need,food & water & air,,,,,,,auto
152,animals,need,air & water & food,,,,,,,auto
153,plants,need,air & water & nutrients & light,,,,,,,auto
154,plants,require,air & water & nutrients & light in order,,,,,,,auto
155,tough outer coatings,protect,the plant from the environment,,,,,,,auto
156,all animals,take in,food & water & nutrients,,,,,,,auto
157,the body,growing,new bone cells,,,,,,,auto
158,new bone cells,repair,the break,,,,,,,auto
159,new clean forms of energy,conserve,fossil fuels,,,,,,,auto


In [13]:
# Reload the table from the local CSV that download_and_load_query_results
# wrote - presumably so the Athena query does not have to be repeated.
getPartitionData = pd.read_csv("athena_query_results.csv")

The crawler has catalogued the files and the catalog has been created; the function above pulls the data out of the Athena database into a CSV file. The top few rows seem to have a lot of null values, but further exploration shows that the files have been crawled successfully.

In [14]:
getPartitionData[getPartitionData.partition_0 == "monarch"]

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,partition_0
0,,type of population or resource,"(increases, decreases)",,type of population or resource,"(increases, decreases)",,,,monarch
1,When,predators,increase,", the result is that",prey,decrease,,,,monarch
2,When,prey,increase,", the result is that",prey,increase,,,,monarch
3,When,the amount of food,increases,", the result is that",the population,increases,,,,monarch
4,When,the amount of resources,decreases,", the result is that",competition,increases,,,,monarch
...,...,...,...,...,...,...,...,...,...,...
5545,A(n),Narrow-nosed planigale,"weighs, on average,",less than 1,kilograms,,,,,monarch
5546,A(n),Long-tailed planigale,"weighs, on average,",less than 1,kilograms,,,,,monarch
5547,A(n),Cinereus shrew,"weighs, on average,",less than 1,kilograms,,,,,monarch
5548,A(n),Black myotis,"weighs, on average,",less than 1,kilograms,,,,,monarch


In [15]:
# Replace every missing value with the literal string 'NA', then preview the first rows.
getPartitionData = getPartitionData.fillna('NA')
getPartitionData.head(5)

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,partition_0
0,,type of population or resource,"(increases, decreases)",,type of population or resource,"(increases, decreases)",,,,monarch
1,When,predators,increase,", the result is that",prey,decrease,,,,monarch
2,When,prey,increase,", the result is that",prey,increase,,,,monarch
3,When,the amount of food,increases,", the result is that",the population,increases,,,,monarch
4,When,the amount of resources,decreases,", the result is that",competition,increases,,,,monarch


In [16]:
# Save the NA-filled table next to the notebook.
# NOTE(review): to_csv also writes the DataFrame index as an extra unnamed
# first column unless index=False is passed; the frame itself has 10 columns,
# like the tableData definition - confirm which layout consumers expect.
getPartitionData.to_csv('cleaned_tables_data.csv')

In [55]:
schemaGen(tabmcq,'mcqtable')

'CREATE TABLE mcqtable (QUESTION VARCHAR(255),  QUESTION-ALIGNMENT VARCHAR(255),  CHOICE 1 VARCHAR(255),  CHOICE 2 VARCHAR(255),  CHOICE 3 VARCHAR(255),  CHOICE 4 VARCHAR(255),  CORRECT CHOICE NUMERIC,  RELEVANT TABLE VARCHAR(255),  RELEVANT ROW NUMERIC,  RELEVANT COL NUMERIC)'

In [60]:
schemaGen(getPartitionData,'tableData')

'CREATE TABLE tableData (col0 VARCHAR(255),  col1 VARCHAR(255),  col2 VARCHAR(255),  col3 VARCHAR(255),  col4 VARCHAR(255),  col5 VARCHAR(255),  col6 VARCHAR(255),  col7 VARCHAR(255),  col8 VARCHAR(255),  partition_0 VARCHAR(255))'

In [22]:
# Hand-written DDL for the question table: the schema generated by schemaGen
# keeps spaces and a hyphen in the column names, so they are spelled with
# underscores here.
# IF NOT EXISTS keeps the cell re-runnable: without it a second run fails
# because the table already exists.
createmcq = """CREATE TABLE IF NOT EXISTS mcqtable (QUESTION VARCHAR(255),  QUESTION_ALIGNMENT VARCHAR(255),
                CHOICE_1 VARCHAR(255),  CHOICE_2 VARCHAR(255),  CHOICE_3 VARCHAR(255),  
                CHOICE_4 VARCHAR(255),  CORRECT_CHOICE NUMERIC,  RELEVANT_TABLE VARCHAR(255),  
                RELEVANT_ROW NUMERIC,  RELEVANT_COL NUMERIC)"""
queryTable(createmcq)

In [23]:
# DDL for the table that receives the Athena export: ten unbounded VARCHAR columns.
# IF NOT EXISTS keeps the cell re-runnable: without it a second run fails
# because the table already exists.
createTabmcq = """CREATE TABLE IF NOT EXISTS tableData (col0 VARCHAR,  col1 VARCHAR,  col2 VARCHAR,  col3 VARCHAR,  col4 VARCHAR,  col5 VARCHAR,  col6 VARCHAR,  col7 VARCHAR,  col8 VARCHAR,  partition_0 VARCHAR)"""
queryTable(createTabmcq)

In [24]:
# Bulk-load the questions file into mcqtable, skipping its header row.
# The string is not raw, so Python turns \t into a literal tab character before
# the statement is sent.
# NOTE(review): the path in COPY ... FROM is presumably resolved on the
# PostgreSQL server's filesystem - confirm MCQs.tsv has been placed there.
insertmcq = """COPY mcqtable FROM '/var/lib/postgresql/data/MCQs.tsv' DELIMITER E'\t' CSV HEADER"""
queryTable(insertmcq)

In [25]:
# Bulk-load the Athena export into tableData, skipping its header row.
# NOTE(review): this loads athena_query_results.csv, not the NA-filled
# cleaned_tables_data.csv written earlier - confirm that is intended.
insertTable = """COPY tableData FROM '/var/lib/postgresql/data/athena_query_results.csv' DELIMITER ',' CSV HEADER"""
queryTable(insertTable)

In [26]:
queryBase("""SELECT * FROM mcqtable LIMIT 5""")

Unnamed: 0,question,question_alignment,choice_1,choice_2,choice_3,choice_4,correct_choice,relevant_table,relevant_row,relevant_col
0,Which orbital event is the day with the longes...,123,Winter solstice,Fall equinox,Spring equinox,Summer solstice,4.0,regents-01,1.0,1.0
1,The _______ is the day with the longest period...,13,summer solstice,winter solstice,spring equinox,fall equinox,1.0,regents-01,1.0,1.0
2,What is the orbital event with the longest day...,12345,Summer solstice,Winter solstice,Spring equinox,Fall equinox,1.0,regents-01,1.0,1.0
3,The summer solstice has the _______ period of ...,123,longest,shortest,midrange,smallest,1.0,regents-01,1.0,3.0
4,The summer solstice has what period of dayligh...,13,Shortest,Longest,Midrange,,2.0,regents-01,1.0,3.0


In [27]:
queryBase("""SELECT * FROM tableData LIMIT 5""")

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,partition_0
0,,type of population or resource,"(increases, decreases)",,type of population or resource,"(increases, decreases)",,,,monarch
1,When,predators,increase,", the result is that",prey,decrease,,,,monarch
2,When,prey,increase,", the result is that",prey,increase,,,,monarch
3,When,the amount of food,increases,", the result is that",the population,increases,,,,monarch
4,When,the amount of resources,decreases,", the result is that",competition,increases,,,,monarch
