In [1]:
import numpy as np
from __future__ import division
import sqlite3

In [2]:
# Path of the SQLite database file; the cells below create one table per
# candidate in it (datetime, sentiment, tweetID) and later read those tables back.
sqlitefile = 'data.sqlite'

# Create database

In [None]:
# Create the raw-tweet database: one table per candidate, every table with the
# same three columns (datetime REAL, sentiment REAL, tweetID TEXT).
candidates = ['sanders', 'trump', 'clinton', 'cruz']

conn = sqlite3.connect(sqlitefile)
try:
    c = conn.cursor()
    for candidate in candidates:
        # Table names come from the fixed list above. SQLite cannot bind an
        # identifier as a `?` parameter, so the name is concatenated in.
        #
        # Plain CREATE TABLE (no IF NOT EXISTS) is deliberate: the insert cell
        # appends rows, so silently reusing an existing table on a re-run
        # would duplicate the data. Failing here stops that.
        c.execute('CREATE TABLE ' + candidate + '''
               (datetime       REAL,
               sentiment      REAL,
               tweetID        TEXT);''')
    conn.commit()
finally:
    # Close the connection even when a CREATE raises (for example because the
    # table already exists), so the database file is not left open.
    conn.close()

# Insert data

In [None]:
import pandas as pd

In [None]:
# Load each candidate's CSV export and append its rows to the table of the
# same name in the raw database.
candidates = ['sanders', 'trump', 'clinton', 'cruz']
for candidate in candidates:
    # The CSV has no header row; columns are (date, sentiment, tweetID).
    # tweetID is read as a string so the IDs are stored verbatim in the TEXT column.
    data = pd.read_csv('data_'+candidate+'_00000.csv', header=None,
                       names=['date', 'sentiment', 'tweetID'],
                       dtype={'date': np.float64, 'sentiment': np.float64, 'tweetID': str})
    conn = sqlite3.connect(sqlitefile)
    try:
        c = conn.cursor()
        # Hand sqlite3 plain Python lists (one per row) rather than numpy row arrays.
        # The table name comes from the fixed list above; values are bound with `?`.
        c.executemany('''INSERT INTO '''+candidate+'''(datetime, sentiment, tweetID) VALUES (?, ?, ?)''',
                      data.values.tolist())
        conn.commit()
    finally:
        # Always release the connection, even if the insert raises; without the
        # commit the pending rows are discarded.
        conn.close()

# Create downsampled database

In [5]:
# Path of a second SQLite database file; the cells below create the
# <candidate>_downsampled tables in it and fill them with binned entries.
sqlitefile_downsampled = 'data_downsampled.sqlite'

In [None]:
# Create the downsampled database: one <candidate>_downsampled table per
# candidate, each with (datetime REAL, sentiment REAL, tweet_count INTEGER).
candidates = ['sanders', 'trump', 'clinton', 'cruz']

conn = sqlite3.connect(sqlitefile_downsampled)
try:
    c = conn.cursor()
    for candidate in candidates:
        # Table names come from the fixed list above. SQLite cannot bind an
        # identifier as a `?` parameter, so the name is concatenated in.
        #
        # IF NOT EXISTS makes this cell safe to re-run. That is harmless here
        # because the downsample cell empties each table before refilling it,
        # so reusing an existing table cannot duplicate rows.
        c.execute('CREATE TABLE IF NOT EXISTS ' + candidate + '''_downsampled
               (datetime       REAL,
               sentiment      REAL,
               tweet_count        INTEGER);''')
    conn.commit()
finally:
    # Close the connection even when a statement raises.
    conn.close()

<hr>
# Downsample

In [12]:
bin_size = 30 * 60 # 30 minutes, in seconds
min_tweets = 100 # minimum number of tweets for a valid entry

candidates = ['sanders', 'trump', 'clinton', 'cruz']


def _downsample_rows(rows, bin_size, min_tweets):
    """Yield (datetime, sentiment, tweet_count) entries binned from `rows`.

    `rows` is an iterable of sequences whose first two items are a timestamp
    and a sentiment value, in ascending timestamp order. The first row's
    timestamp starts the first bin. Every row is added to the current bin
    before the bin is tested, so the row that closes a bin is counted in it;
    that row's timestamp then becomes the start of the next bin.

    A bin is tested once a row arrives more than `bin_size` after the bin start:

    * more than ``2 * bin_size`` later: data is missing, so an entry with
      sentiment ``None`` is emitted (with the accumulated count);
    * otherwise, with at least `min_tweets` rows: an entry with the mean
      sentiment is emitted;
    * otherwise the bin stays open and keeps accumulating.

    The emitted datetime is the midpoint between the bin start and the closing
    row. Rows left in an unfinished bin at the end of `rows` are not emitted.
    """
    bin_start = None
    sentiments = []
    tweet_count = 0
    for row in rows:
        sentiments.append(row[1])
        tweet_count += 1
        # Compare against None explicitly: a legitimate timestamp of 0.0 is
        # falsy and would otherwise be mistaken for "no bin started yet".
        if bin_start is None:
            bin_start = row[0]
            continue
        curr_time = row[0]
        elapsed = curr_time - bin_start
        if elapsed <= bin_size:
            continue
        if elapsed > bin_size * 2:
            # more than 2 bin_sizes have passed, we're missing data: empty entry
            mean_sentiment = None
        elif tweet_count >= min_tweets:
            # enough tweets to get a meaningful mean sentiment
            mean_sentiment = float(np.mean(sentiments))
        else:
            # too few tweets so far; leave the bin open
            continue
        yield (curr_time - elapsed / 2.0, mean_sentiment, tweet_count)
        bin_start = curr_time
        sentiments = []
        tweet_count = 0


for candidate in candidates:

    # Read the raw rows in chronological order. The binning above compares
    # each timestamp with the previous bin start, and SQL gives no ordering
    # guarantee without ORDER BY.
    conn = sqlite3.connect(sqlitefile)
    try:
        c = conn.cursor()
        c.execute('SELECT datetime, sentiment FROM ' + candidate + ' ORDER BY datetime;')
        all_rows = c.fetchall()
    finally:
        conn.close()

    entries = list(_downsample_rows(all_rows, bin_size, min_tweets))

    # Replace the table contents in one transaction on one connection, rather
    # than opening a connection and committing per row. If anything fails
    # before the commit, closing the connection discards the pending DELETE
    # and INSERTs, so the previous contents are kept.
    conn2 = sqlite3.connect(sqlitefile_downsampled)
    try:
        c2 = conn2.cursor()
        c2.execute('DELETE FROM ' + candidate + '_downsampled')
        c2.executemany('''INSERT INTO '''+candidate+'''_downsampled(datetime, sentiment, tweet_count) VALUES (?,?,?);''',
                       entries)
        conn2.commit()
    finally:
        conn2.close()

In [None]:
# pseudocode
# NOTE(review): this cell is a sketch, not runnable code. It appears to outline
# an incremental version of the downsampling step (add one entry once enough
# time has passed since the last stored entry) -- confirm intent with the author.
# Names such as current_time, last_data_entry, now, time_diff and number_of_rows
# are not defined in the cells shown above.
if not row:
    # table is empty
    # start from the top
    pass  # TODO: not written yet; `pass` only keeps the cell syntactically valid
else:
    if current_time - last_data_entry.datetime > bin_size:
        if current_time - last_data_entry.datetime > bin_size*2:
            # we're missing data, add an empty entry
            tweet_count = number_of_rows
            c.execute('''INSERT INTO sanders_downsampled(datetime, sentiment, tweet_count) VALUES (?,?,?)''',
                     (now-time_diff*0.5, None, tweet_count))
        else:
            # add entry
            mean_sentiment = np.mean(sentiments)
            tweet_count = number_of_rows
            c.execute('''INSERT INTO sanders_downsampled(datetime, sentiment, tweet_count) VALUES (?,?,?)''',
                     (now-time_diff*0.5, mean_sentiment, tweet_count))