In [220]:
import json
import numpy as np
import pandas as pd
import time
import datetime

In [221]:
import os
# Switch the working directory so the relative file names used by the
# open(...) and to_csv(...) calls in this notebook resolve against it.
# NOTE(review): this is a hard-coded absolute path on one machine; the
# notebook will fail here anywhere else - consider a configurable data dir.
os.chdir("/home/composersyf/Documents/Political Data Science Project/OtherData")

In [222]:
def _load_first_line_json(path):
    """Parse the first line of the text file at ``path`` as a JSON document.

    Only the first line is read; any further lines in the file are ignored.
    """
    with open(path) as json_file:
        return json.loads(json_file.readlines()[0])


# Raw poll payloads, one per race, read from the current working directory.
republican_primary = _load_first_line_json("2016-national-gop-primary.json")
democratic_primary = _load_first_line_json("2016-national-democratic-primary.json")
trump_vs_clinton = _load_first_line_json("2016-general-election-trump-vs-clinton.json")
general_election = _load_first_line_json("2016-general-election-trump-vs-clinton-vs-johnson.json")

In [223]:
def json_to_df(json_data):
    """Flatten a poll payload into a wide table with one row per date.

    ``json_data["estimates_by_date"]`` is read as a list of entries, each with
    a ``'date'`` key and an ``'estimates'`` list whose items carry ``'choice'``
    and ``'value'``.  The result has a ``date`` column followed by one column
    per choice, with rows sorted by ``date``; a choice with no estimate on a
    given date is left as NaN.
    """
    # date -> {choice: value}; a repeated date keeps only its last entry.
    by_date = {
        entry['date']: {est['choice']: est['value'] for est in entry['estimates']}
        for entry in json_data["estimates_by_date"]
    }
    wide = pd.DataFrame.from_dict(by_date, orient="index")
    # Move the date labels out of the index into a regular column.
    wide = wide.reset_index([0])
    wide = wide.rename(columns={"index": "date"})
    return wide.sort_values(["date"])

In [224]:
# Turn each raw payload into a date-sorted table (one column per choice).
(republican_primary_polls,
 democratic_primary_polls,
 trump_vs_clinton_polls,
 general_election_polls) = [
    json_to_df(payload)
    for payload in (republican_primary, democratic_primary, trump_vs_clinton, general_election)
]

In [225]:
# Number of seconds in one day (24 h x 60 min x 60 s).
day_to_secs = 24 * 60 * 60

In [226]:
def convert_to_timestamp(date):
    """Convert a ``YYYY-MM-DD`` string to seconds since the Unix epoch.

    The date is interpreted as midnight UTC, so the result does not depend on
    the machine's local time zone and round-trips through ``convert_to_date``
    (which decodes timestamps as UTC).

    Parameters
    ----------
    date : str
        Calendar date formatted as ``%Y-%m-%d``.

    Returns
    -------
    float
        Seconds between 1970-01-01 00:00 UTC and midnight of ``date``.

    Raises
    ------
    ValueError
        If ``date`` does not match ``%Y-%m-%d``.
    """
    parsed = datetime.datetime.strptime(date, "%Y-%m-%d")
    # Measure from the epoch directly instead of going through time.mktime():
    # mktime() treats the fields as *local* time, which shifts the timestamp by
    # the UTC offset and makes the UTC-decoded date fall on the previous day
    # for time zones east of UTC.
    return (parsed - datetime.datetime(1970, 1, 1)).total_seconds()

In [227]:
def convert_to_date(timestamp):
    """Render a Unix timestamp (seconds) as a ``YYYY-MM-DD`` string in UTC.

    Month and day are zero-padded to two digits; the year is written as-is.
    """
    moment = datetime.datetime.utcfromtimestamp(timestamp)
    return "{}-{:02d}-{:02d}".format(moment.year, moment.month, moment.day)

In [228]:
# Add a numeric ``date_unix`` column (derived from ``date``) to every poll table.
for polls in (republican_primary_polls,
              democratic_primary_polls,
              trump_vs_clinton_polls,
              general_election_polls):
    polls['date_unix'] = polls['date'].apply(convert_to_timestamp)

In [229]:
def determine_start_and_end_time(candidate,primary_polls):
    """Return the timestamps of a candidate's first and last non-null value.

    Rows are scanned in their current (positional) order, so the frame's
    index labels do not matter.

    Parameters
    ----------
    candidate : column label
        Column of ``primary_polls`` holding the candidate's poll values.
    primary_polls : pandas.DataFrame
        Must contain ``candidate`` and a ``date_unix`` column.

    Returns
    -------
    tuple
        ``(start, end)``: the ``date_unix`` values of the first and last rows
        in which ``candidate`` is not null.

    Raises
    ------
    ValueError
        If the candidate column contains no non-null value.
    """
    observed_positions=[
        position
        for position,value in enumerate(primary_polls[candidate])
        if not pd.isnull(value)
    ]
    if not observed_positions:
        raise ValueError("no non-null values found for column %r" % (candidate,))
    # Positions were computed by enumeration, so look them up positionally
    # (.iloc); a label lookup picks the wrong row whenever the index is not
    # 0..n-1 in row order, e.g. after sort_values().
    timestamps=primary_polls["date_unix"]
    return (timestamps.iloc[observed_positions[0]],timestamps.iloc[observed_positions[-1]])

def calc_day_index(df):
    """Add a ``day_index`` column counting whole days since the first row.

    The first column (by position) is read as a timestamp in seconds.  For
    each row the offset from the first row's timestamp is divided by
    ``day_to_secs`` and truncated to an integer.

    Only the first column is read, so a frame with a single column is
    accepted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds timestamps in seconds; must have at
        least one row.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with the new column.
    """
    timestamps=df.iloc[:,0]
    df['day_index']=((timestamps-timestamps.iloc[0])/day_to_secs).astype(int)
    return df

def single_interpolation(df):
    """Linearly interpolate one value series to one row per day.

    ``df`` is read positionally: column 0 is a timestamp in seconds and
    column 1 the value.  Between each pair of consecutive rows, one row per
    day is generated with timestamp and value advancing in equal steps; the
    final input row is appended unchanged.

    Returns a single-column DataFrame (named after ``df``'s second column)
    indexed by the integer-cast timestamp, with index name ``date_unix``.
    """
    df = calc_day_index(df)
    stamps = df.iloc[:, 0].values
    readings = df.iloc[:, 1].values
    offsets = list(df.day_index)

    rows = []
    for pos in range(len(offsets) - 1):
        gap = offsets[pos + 1] - offsets[pos]
        # Per-day increments across this gap.
        stamp_step = (stamps[pos + 1] - stamps[pos]) / gap
        reading_step = (readings[pos + 1] - readings[pos]) / gap
        rows.extend(
            (stamps[pos] + k * stamp_step, readings[pos] + k * reading_step)
            for k in range(gap)
        )
    # The loop stops one short of the end, so add the last observation.
    rows.append((stamps[-1], readings[-1]))

    daily = pd.DataFrame(rows, columns=["date_unix", df.columns.values[1]])
    daily['date_unix'] = daily['date_unix'].astype(int)
    return daily.set_index(['date_unix'])

def combine_results(df_new):
    """Join the given frames side by side on the union of their indexes.

    ``df_new`` is a collection of DataFrames; index labels missing from a
    frame yield NaN in that frame's columns.  Index labels are kept.
    """
    merged = pd.concat(df_new, axis=1, join="outer", ignore_index=False)
    return merged

In [230]:
def interpolation(primary_polls):
    """Build a daily, linearly interpolated table for every candidate column.

    Each candidate's non-null observations are interpolated to one value per
    day with ``single_interpolation``; the per-candidate series are then joined
    on the union of their timestamps, ordered chronologically, and the
    timestamp is rendered back to a ``YYYY-MM-DD`` string.

    Parameters
    ----------
    primary_polls : pandas.DataFrame
        Assumed layout: first column is the date, last column is
        ``date_unix``, and every column in between is a candidate.

    Returns
    -------
    pandas.DataFrame
        A ``date`` column followed by the candidate columns in sorted order,
        one row per day; days outside a candidate's observed span are NaN.
    """
    # Candidates are taken positionally: everything between the first and
    # the last column.
    all_candidates=primary_polls.columns.values[1:-1]
    new_polls_values=[]
    for c in all_candidates:
        start_time,end_time=determine_start_and_end_time(c,primary_polls)
        in_range=(primary_polls.date_unix>=start_time) & (primary_polls.date_unix<=end_time)
        observed=primary_polls[c].notnull()
        # Copy the slice: single_interpolation adds a helper column to the
        # frame it receives, which must not touch (or warn about) the source.
        df=primary_polls.loc[in_range & observed,["date_unix",c]].copy()
        new_polls_values.append(single_interpolation(df))
    results=combine_results(new_polls_values)
    # The outer join is not guaranteed to return timestamps in ascending
    # order when candidates cover different date ranges, so sort explicitly.
    results=results.sort_index()
    results=results.loc[:,sorted(results.columns.values)]
    results=results.reset_index([0])
    results['date_unix']=results['date_unix'].apply(convert_to_date)
    results=results.rename(columns={"date_unix":"date"})
    return results

In [231]:
# Daily interpolated poll tables, one per race.
(republican_results,
 democratic_results,
 trump_vs_clinton_results,
 general_election_results) = map(
    interpolation,
    (republican_primary_polls,
     democratic_primary_polls,
     trump_vs_clinton_polls,
     general_election_polls),
)

In [233]:
# Write each interpolated table to a CSV file in the working directory,
# leaving out the row index.
for table, csv_name in (
    (republican_results, "Republican_Primary_Polls.csv"),
    (democratic_results, "Democratic_Primary_Polls.csv"),
    (trump_vs_clinton_results, "Trump_vs_Clinton_Polls.csv"),
    (general_election_results, "2016_General_Election_Polls.csv"),
):
    table.to_csv(csv_name, index=False)