In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Print the NVIDIA driver/GPU status for this runtime.
!nvidia-smi

In [None]:
!pip install pysqlite3
!pip install pandas
!pip install tensorflow

In [None]:

import sqlite3
import json
from datetime import datetime

# Year-month being processed; used to build both the SQLite file name and the
# path of the input dump file.
timeframe = '2015-05'
# Pending SQL statements; transaction_bldr appends to this list and executes
# the whole batch once it grows past 1000 entries.
sql_transaction = []
# The periodic cleanup only runs for row numbers greater than this value.
start_row = 0
# Every `cleanup` rows read, rows whose parent is NULL are deleted and the
# database is vacuumed.
cleanup = 1000000

# The database file is named after the timeframe and opened relative to the
# current working directory. `connection` and `c` are shared by every helper
# defined in this cell.
connection = sqlite3.connect('{}.db'.format(timeframe))
c = connection.cursor()

def create_table():
  """Create the ``parent_reply`` table through the shared cursor if it is missing.

  Columns: parent_id (TEXT, primary key), comment_id (TEXT, unique), parent,
  comment and subreddit (TEXT), unix and score (INT).
  """
  ddl = """CREATE TABLE IF NOT EXISTS parent_reply
  (parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT,
  comment TEXT, subreddit TEXT, unix INT, score INT)"""
  c.execute(ddl)

def format_data(data):
  """Normalise comment text so it fits on one line and holds no double quotes.

  Each newline and each carriage return becomes the token ``" newlinechar "``
  and every double quote becomes a single quote.

  Args:
    data: Raw comment text.

  Returns:
    The rewritten text.
  """
  substitutions = (
    ("\n", " newlinechar "),
    ("\r", " newlinechar "),
    ('"', "'"),
  )
  for old, new in substitutions:
    data = data.replace(old, new)
  return data

def find_existing_score(pid):
  """Return the score of the reply currently stored for parent ``pid``.

  Args:
    pid: Value to match against ``parent_reply.parent_id``.

  Returns:
    The stored ``score`` of the matching row, or ``False`` when no row
    matches or the lookup fails with a database error.
  """
  try:
    # Bind pid as a parameter instead of formatting it into the SQL text, so
    # an identifier containing a quote can neither break nor alter the query.
    c.execute("SELECT score FROM parent_reply WHERE parent_id = ? LIMIT 1", (pid,))
    result = c.fetchone()
    if result is not None:
      return result[0]
    return False
  except sqlite3.Error:
    # Best-effort lookup: callers treat False as "nothing stored yet".
    return False

def acceptable(data):
  """Decide whether a comment body is worth storing.

  A body is rejected when it splits into more than 50 space-separated pieces,
  is empty, is longer than 1000 characters, or is exactly ``'[deleted]'`` or
  ``'[removed]'``.

  Args:
    data: Comment text.

  Returns:
    ``True`` if the body passes every check, otherwise ``False``.
  """
  piece_count = len(data.split(' '))
  char_count = len(data)
  wrong_size = piece_count > 50 or char_count < 1 or char_count > 1000
  is_placeholder = data in ('[deleted]', '[removed]')
  return not (wrong_size or is_placeholder)

def find_parent(pid):
  """Return the stored text of the comment whose ``comment_id`` equals ``pid``.

  Args:
    pid: Value to match against ``parent_reply.comment_id``.

  Returns:
    The ``comment`` column of the matching row, or ``False`` when no row
    matches or the lookup fails with a database error.
  """
  try:
    # Bind pid as a parameter instead of formatting it into the SQL text, so
    # an identifier containing a quote can neither break nor alter the query.
    c.execute("SELECT comment FROM parent_reply WHERE comment_id = ? LIMIT 1", (pid,))
    result = c.fetchone()
    if result is not None:
      return result[0]
    return False
  except sqlite3.Error:
    # Best-effort lookup: callers treat False as "parent not available".
    return False

def transaction_bldr(sql):
  """Queue one SQL statement and flush the queue as a single transaction.

  Statements accumulate in the module-level ``sql_transaction`` list. Once the
  list holds more than 1000 entries, all of them are executed inside one
  transaction on the shared cursor, the transaction is committed, and the
  list is reset to empty.

  Args:
    sql: Complete SQL statement text to execute later.
  """
  global sql_transaction
  sql_transaction.append(sql)
  if len(sql_transaction) > 1000:
    c.execute('BEGIN TRANSACTION')
    for s in sql_transaction:
      try:
        c.execute(s)
      except (sqlite3.Error, sqlite3.Warning):
        # Best-effort batch: a statement SQLite rejects (for example a
        # PRIMARY KEY / UNIQUE violation) is skipped so the rest of the batch
        # still lands. Only database errors are ignored; KeyboardInterrupt
        # and programming mistakes now propagate instead of being swallowed.
        pass
    connection.commit()
    sql_transaction = []

def _sql_literal(value):
  """Render ``value`` as SQLite literal text.

  ``None`` becomes ``NULL``, integers are written bare, and anything else is
  written as a single-quoted string with embedded single quotes doubled.
  """
  if value is None:
    return 'NULL'
  if isinstance(value, int):
    return str(int(value))
  return "'{}'".format(str(value).replace("'", "''"))

def sql_insert_replace_comment(commentid, parentid, parent, comment, subreddit, time, score):
  """Queue an UPDATE that overwrites the row stored for ``parentid``.

  Args:
    commentid: Identifier of the new reply.
    parentid: Identifier of the parent; selects the row to overwrite.
    parent: Parent text; a falsy value (e.g. the ``False`` returned by a
      failed lookup) is stored as NULL.
    comment: Text of the new reply.
    subreddit: Subreddit name.
    time: Creation time; converted with ``int()``.
    score: Score of the new reply.

  Any error while building or queueing the statement is printed, not raised.
  """
  try:
    # The statement is queued as plain text, so the values have to be
    # rendered into it here; "?" placeholders would never be bound.
    sql = ("UPDATE parent_reply SET parent_id = {}, comment_id = {}, parent = {}, "
           "comment = {}, subreddit = {}, unix = {}, score = {} WHERE parent_id = {};").format(
      _sql_literal(parentid), _sql_literal(commentid), _sql_literal(parent or None),
      _sql_literal(comment), _sql_literal(subreddit), int(time), _sql_literal(score),
      _sql_literal(parentid))
    transaction_bldr(sql)
  except Exception as e:
    print('s-UPDATE insertion', str(e))

def sql_insert_has_parent(commentid, parentid, parent, comment, subreddit, time, score):
  """Queue an INSERT for a reply whose parent text is known.

  Args:
    commentid: Identifier of the reply.
    parentid: Identifier of the parent.
    parent: Parent text.
    comment: Reply text.
    subreddit: Subreddit name.
    time: Creation time; converted with ``int()``.
    score: Score of the reply.

  Any error while building or queueing the statement is printed, not raised.
  """
  try:
    # Single-quoted, escaped literals: double-quoted text is an identifier in
    # SQL and is only tolerated as a string by SQLite's compatibility fallback.
    sql = ("INSERT INTO parent_reply (parent_id, comment_id, parent, comment, subreddit, unix, score) "
           "VALUES ({},{},{},{},{},{},{});").format(
      _sql_literal(parentid), _sql_literal(commentid), _sql_literal(parent),
      _sql_literal(comment), _sql_literal(subreddit), int(time), _sql_literal(score))
    transaction_bldr(sql)
  except Exception as e:
    print('s-PARENT insertion', str(e))

def sql_insert_no_parent(commentid, parentid, comment, subreddit, time, score):
  """Queue an INSERT for a reply whose parent text is not available.

  The ``parent`` column is left out of the statement, so it is stored as NULL.

  Args:
    commentid: Identifier of the reply.
    parentid: Identifier of the parent.
    comment: Reply text.
    subreddit: Subreddit name.
    time: Creation time; converted with ``int()``.
    score: Score of the reply.

  Any error while building or queueing the statement is printed, not raised.
  """
  try:
    sql = ("INSERT INTO parent_reply (parent_id, comment_id, comment, subreddit, unix, score) "
           "VALUES ({},{},{},{},{},{});").format(
      _sql_literal(parentid), _sql_literal(commentid), _sql_literal(comment),
      _sql_literal(subreddit), int(time), _sql_literal(score))
    transaction_bldr(sql)
  except Exception as e:
    print('s-NO_PARENT insertion', str(e))

if __name__ == "__main__":
  # Read the dump line by line (one JSON object per line), keep replies with a
  # score of at least 2 and an acceptable body, and store at most one reply
  # per parent — the highest-scoring one seen so far.
  create_table()
  row_counter = 0
  paired_rows = 0

  # Input path: <Chatdata>/<year>/RC_<year-month> on the mounted Drive.
  with open("/content/drive/MyDrive/Colab Notebooks/Chatbot/Chatdata/{}/RC_{}".format(timeframe.split('-')[0], timeframe), buffering=1000, encoding='utf8') as f:
    for row in f:
      row_counter += 1
      row = json.loads(row)
      comment_id = row['name']
      parent_id = row['parent_id']
      body = format_data(row['body'])
      created_utc = row['created_utc']
      score = row['score']
      subreddit = row['subreddit']

      if score >= 2 and acceptable(body):
        # The parent lookup is a database query, so it is only issued for
        # rows that can actually be stored.
        parent_data = find_parent(parent_id)
        existing_comment_score = find_existing_score(parent_id)
        if existing_comment_score:
          # A reply is already stored for this parent: keep the better one.
          if score > existing_comment_score:
            sql_insert_replace_comment(comment_id, parent_id, parent_data, body, subreddit, created_utc, score)
        elif parent_data:
          sql_insert_has_parent(comment_id, parent_id, parent_data, body, subreddit, created_utc, score)
          paired_rows += 1
        else:
          # Parent text unknown: store the comment anyway so that later
          # replies to it can find it as their parent.
          sql_insert_no_parent(comment_id, parent_id, body, subreddit, created_utc, score)

      if row_counter % 100000 == 0:
        print("Total rows read: {}, Paired rows: {}, Time: {}".format(row_counter, paired_rows, str(datetime.now())))

      if row_counter > start_row and row_counter % cleanup == 0:
        # Periodically drop rows that never got a parent and reclaim the space.
        print("Cleanin up!")
        sql = "DELETE FROM parent_reply WHERE parent IS NULL"
        c.execute(sql)
        connection.commit()
        c.execute("VACUUM")
        connection.commit()

  # transaction_bldr only writes once its buffer exceeds 1000 statements, so
  # whatever is still queued when the file ends must be flushed here or the
  # last rows are silently lost.
  if sql_transaction:
    for pending in sql_transaction:
      try:
        c.execute(pending)
      except (sqlite3.Error, sqlite3.Warning):
        # Same best-effort policy as the batched writes: skip rejected rows.
        pass
    connection.commit()
    sql_transaction = []

In [None]:
import sqlite3
import pandas as pd

timeframes = ['2015-05']


def _append_lines(path, values):
    """Append every item of ``values`` to ``path`` as one UTF-8 encoded line."""
    with open(path, 'a', encoding='utf8') as f:
        for content in values:
            f.write(str(content) + '\n')


# Export parent/reply pairs page by page. For each timeframe the first page
# goes to test.from / test.to and every later page to train.from / train.to.
# Files are opened in append mode, so re-running this cell adds the same rows
# again — delete the output files first when regenerating.
for timeframe in timeframes:
    connection = sqlite3.connect('{}.db'.format(timeframe))
    c = connection.cursor()
    limit = 5000
    last_unix = 0
    last_rowid = 0
    cur_length = limit
    counter = 0
    test_done = False

    while cur_length == limit:
        # Page on (unix, rowid) rather than unix alone: many rows can share a
        # timestamp, and "unix > last_unix" would skip the ones that fall
        # just after a page boundary.
        df = pd.read_sql(
            "SELECT rowid AS row_id, * FROM parent_reply "
            "WHERE (unix > ? OR (unix = ? AND rowid > ?)) "
            "AND parent NOT NULL AND score > 0 "
            "ORDER BY unix ASC, rowid ASC LIMIT ?",
            connection,
            params=(last_unix, last_unix, last_rowid, limit),
        )
        cur_length = len(df)
        if cur_length == 0:
            # Nothing (left) to export; indexing the last row would fail.
            break
        # Cast to plain int: sqlite3 cannot bind numpy integer types.
        last_unix = int(df['unix'].iloc[-1])
        last_rowid = int(df['row_id'].iloc[-1])

        if not test_done:
            _append_lines('test.from', df['parent'].values)
            _append_lines('test.to', df['comment'].values)
            test_done = True
        else:
            _append_lines('train.from', df['parent'].values)
            _append_lines('train.to', df['comment'].values)

        counter += 1
        if counter % 20 == 0:
            print(counter*limit,'rows completed so far')

In [None]:
cd /content/drive/MyDrive/Colab Notebooks/Chatbot 

In [None]:
# Clone the nmt-chatbot repository, including its submodules, into the current directory.
!git clone --recursive https://github.com/daniel-kukiela/nmt-chatbot 

In [None]:
cd nmt-chatbot 

In [None]:
# Install the packages listed in the requirements.txt of the current directory.
!pip install -r requirements.txt 

In [None]:
cd setup 

In [None]:
# Run the project's data preparation script from the setup directory.
# NOTE(review): presumably this consumes the train/test .from/.to files — confirm where it expects them.
!python prepare_data.py 

In [None]:
cd .. 

In [None]:
# Run the project's training script.
!python train.py 

In [None]:
# Run the project's inference script.
!python inference.py 