In [12]:
import pandas as pd
import requests
import re
import os

URL = "https://www.realclearpolling.com/latest-polls/president/general-election"

#Download the page only when no `response` is left over from an earlier run of this cell;
#delete `response` to force a fresh download
if 'response' not in locals():
    #Without a timeout requests can wait indefinitely on a stalled connection
    fetched = requests.get(URL, timeout=30)
    #Stop here on an HTTP error status so an error page is never cached in `response`
    fetched.raise_for_status()
    response = fetched
    print('Importing')
#Raw bytes of the page, then the same content decoded as UTF-8 text
html = response.content
html_str = html.decode('utf-8')

#Restrict to poll table onwards
#Everything ahead of the first feed\": marker is ignored; an IndexError on poll_html[1] means the marker is not in the page
feed_marker = 'feed\\":'
poll_html = html_str.split(feed_marker, maxsplit=1)
feed_onwards = poll_html[1]
#Remove tail-end script
#Keep only the text in front of the first ],\" sequence (all of it when the sequence never occurs)
combined_polls = feed_onwards.partition('],\\"')[0]

#Split out by day
#The zero-width lookahead cuts just before every {\"date\" so each piece keeps its own opening marker;
#piece 0 (the text ahead of the first marker) is discarded
record_boundary = re.compile(r'(?={\\"date\\")')
polls = record_boundary.split(combined_polls)[1:]
#Split by individual poll
#Break every piece into fragments at each ",\" separator, giving one list of fragments per piece
poll = []
for piece in polls:
    poll.append(piece.split('",\\"'))

#Convert to DataFrame
#Get lists of candidates
#Each row in `poll` is trimmed in place to nine entries: the first eight fragments are kept as they are,
#and slot nine is overwritten with every fragment from the tenth onwards, re-joined with commas.
#NOTE: the fragment originally held in slot nine is discarded by this overwrite.
POLL_COLUMNS = ["Date","Pollster","Pollster_URL","Race","Race_URL","Poll_ID","Spread","Spread_Affiliation","Candidates"]
for fields in poll:
    #Build the joined text first, because the fragments it reads are deleted on the next line
    candidate_text = ','.join(map(str, fields[9:]))
    del fields[9:]
    #Raises IndexError for a row that has fewer than nine fragments
    fields[8] = candidate_text

#Rename Columns
#Built once from the normalised rows (the unnamed intermediate frame was never read, so it is no longer created)
polls_df = pd.DataFrame(poll, columns=POLL_COLUMNS)

#Clean up Columns
#For every raw column: (number of leading characters to drop, stop index applied to what is left).
#The leading counts are fixed-width and presumably match the feed's key text in front of each value - verify if the feed layout changes.
#A stop of -1 drops the final character; Date instead keeps its first 16 characters.
RAW_FIELD_SLICES = {
    'Date': (12, 16),
    'Pollster': (13, -1),
    'Pollster_URL': (17, -1),
    'Race': (9, -1),
    'Race_URL': (9, -1),
    'Poll_ID': (12, -1),
    'Spread': (11, -1),
    'Spread_Affiliation': (15, -1),
    'Candidates': (13, -5),
}
cleaned_polls_df = polls_df.copy()
for column, (lead, stop) in RAW_FIELD_SLICES.items():
    cleaned_polls_df[column] = polls_df[column].str[lead:].str[:stop]

#One integer-labelled column (0, 1, 2, ...) per candidate entry; rows with fewer entries are padded with missing values
split_candidates = cleaned_polls_df['Candidates'].str.split(r'\\"},{\\"name',expand=True)
cleaned_polls_df = pd.concat([cleaned_polls_df,split_candidates],axis=1)
cleaned_polls_df = cleaned_polls_df.drop(columns=['Candidates'])
#Trim whatever columns the split actually produced rather than assuming exactly six:
#fewer than six used to raise KeyError and any beyond the sixth were left untrimmed.
#Column 0 loses 12 leading characters, every later column loses 5.
for position in split_candidates.columns:
    lead = 12 if position == 0 else 5
    cleaned_polls_df[position] = cleaned_polls_df[position].str[lead:]

#Work on a copy so cleaned_polls_df itself is left as it was
meltme_df = cleaned_polls_df.copy()

#Transpose candidate positions and values to new rows
#Columns that describe the poll itself and are repeated on every melted row
poll_id_vars = ['Date', 'Pollster', 'Pollster_URL', 'Race', 'Race_URL', 'Poll_ID', 'Spread', 'Spread_Affiliation']
melted_df = (
    meltme_df
    #Every other column becomes its own row: the column label goes to Number, the cell content to Value
    .melt(id_vars=poll_id_vars, var_name='Number', value_name='Value')
    #Rows whose Value is missing are removed
    .dropna(subset=['Value'])
    #Parsed copy of Date (text such as 'Tue, 02 Apr 2024') used for ordering
    .assign(Poll_Date=lambda frame: pd.to_datetime(frame['Date'], format='%a, %d %b %Y'))
    #Newest dates first, then ascending Poll_ID within a date
    #NOTE(review): Poll_ID is compared with whatever dtype it holds; if it is text, '10' sorts before '9'
    .sort_values(by=['Poll_Date', 'Poll_ID'], ascending=[False, True])
    #Shift the label by one and expose it under the name Position
    .assign(Number=lambda frame: frame['Number'] + 1)
    .rename(columns={'Number': 'Position'})
)

#String editing
#Each Value is cut at every backslash; piece 0 becomes Candidate and piece 3 becomes the new Value
value_parts = melted_df['Value'].str.split('\\',expand=True)
#expand=True makes the frame as wide as the longest row, so it has a fifth column only when at least one row
#splits into five pieces. Assigning to five fixed names failed with a ValueError for a four-wide frame even though
#pieces 0 and 3 were present. Any other width is still rejected loudly because it means the text layout changed.
if value_parts.shape[1] not in (4, 5):
    raise ValueError(
        'Expected 4 or 5 backslash-separated pieces per candidate entry, '
        f'got {value_parts.shape[1]}'
    )
melted_df['Candidate'] = value_parts[0]
#Drop the double-quote characters left in the kept piece
melted_df['Value'] = value_parts[3].str.replace('"','')

#Extract State from Race
#State is the text in front of the first ':' of Race, with every ' CD' rewritten as '-'
#(so a prefix such as 'Maine CD2' would become 'Maine-2')
state_extract = melted_df['Race'].str.split(':',expand=True)
melted_df['State'] = state_extract[0].str.replace(' CD',"-")

#Filter to only new data compared to currently used file (if it exists)
RESULTS_CSV = 'Election2024PollingResults.csv'
DATE_FORMAT = '%a, %d %b %Y'

previous_data_exists = os.path.isfile(RESULTS_CSV)
if previous_data_exists:
    old_df = pd.read_csv(RESULTS_CSV)
    #Helper column with the parsed Date on both frames; it is removed from the combined result below
    #(it stays on old_df and melted_df themselves)
    old_df['Date_New'] = pd.to_datetime(old_df['Date'], format=DATE_FORMAT)
    melted_df['Date_New'] = pd.to_datetime(melted_df['Date'], format=DATE_FORMAT)
    latest_saved = old_df['Date_New'].max()
    if pd.isna(latest_saved):
        #The saved file has no dated rows; comparing against NaT is False for every row,
        #which would discard all scraped data, so keep everything instead
        new_df = melted_df
    else:
        #Only rows strictly newer than the latest saved date are added
        #NOTE(review): rows dated the same day as the latest saved row are therefore never appended
        new_df = melted_df[melted_df['Date_New'] > latest_saved]
    #Reset Indices and concatenate new data to old
    new_df = new_df.reset_index(drop=True)
    old_df = old_df.reset_index(drop=True)
    combined_all_df = pd.concat([new_df,old_df],axis=0)
    #Remove unnecessary columns
    #errors='ignore' because 'Unnamed: 0' only exists when the file was written with its index column
    combined_all_df = combined_all_df.drop(['Date_New','Unnamed: 0'],axis=1,errors='ignore')
else:
    #Only use new data if running for the first time
    combined_all_df = melted_df
#Last expression of the cell, so a notebook displays the resulting frame
combined_all_df
#combined_all_df.to_csv('Election2024PollingResults.csv',index=True)

Unnamed: 0,Date,Pollster,Pollster_URL,Race,Race_URL,Poll_ID,Spread,Spread_Affiliation,Position,Value,Poll_Date,Candidate,State
0,"Tue, 02 Apr 2024",Morning Consult,https://pro.morningconsult.com/trackers/2024-p...,General Election: Trump vs. Biden,https://www.realclearpolitics.com/epolls/2024/...,7383,Biden +2,Democrat,1,42,02/04/2024 00:00,Trump,General Election
1,"Tue, 02 Apr 2024",Morning Consult,https://pro.morningconsult.com/trackers/2024-p...,General Election: Trump vs. Biden,https://www.realclearpolitics.com/epolls/2024/...,7383,Biden +2,Democrat,2,44,02/04/2024 00:00,Biden,General Election
2,"Tue, 02 Apr 2024",Trafalgar Group (R),https://www.thetrafalgargroup.org/wp-content/u...,General Election: Trump vs. Biden vs. Kennedy ...,https://www.realclearpolitics.com/epolls/2024/...,8329,Trump +3,Republican,1,43,02/04/2024 00:00,Trump,General Election
3,"Tue, 02 Apr 2024",Trafalgar Group (R),https://www.thetrafalgargroup.org/wp-content/u...,General Election: Trump vs. Biden vs. Kennedy ...,https://www.realclearpolitics.com/epolls/2024/...,8329,Trump +3,Republican,2,40,02/04/2024 00:00,Biden,General Election
4,"Tue, 02 Apr 2024",Trafalgar Group (R),https://www.thetrafalgargroup.org/wp-content/u...,General Election: Trump vs. Biden vs. Kennedy ...,https://www.realclearpolitics.com/epolls/2024/...,8329,Trump +3,Republican,3,11,02/04/2024 00:00,Kennedy,General Election
...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,"Wed, 21 Feb 2024",The Hill/Emerson,https://emersoncollegepolling.com/georgia-2024...,Georgia: Trump vs. Newsom,https://www.realclearpolitics.com/epolls/2024/...,8392,Trump +19,Republican,2,32,21/02/2024,Newsom,Georgia
501,"Wed, 21 Feb 2024",The Hill/Emerson,https://emersoncollegepolling.com/north-caroli...,North Carolina: Trump vs. Newsom,https://www.realclearpolitics.com/epolls/2024/...,8393,Trump +15,Republican,1,49,21/02/2024,Trump,North Carolina
502,"Wed, 21 Feb 2024",The Hill/Emerson,https://emersoncollegepolling.com/north-caroli...,North Carolina: Trump vs. Newsom,https://www.realclearpolitics.com/epolls/2024/...,8393,Trump +15,Republican,2,34,21/02/2024,Newsom,North Carolina
503,"Wed, 21 Feb 2024",The Hill/Emerson,https://emersoncollegepolling.com/north-caroli...,North Carolina: Trump vs. Harris,https://www.realclearpolitics.com/epolls/2024/...,8394,Trump +9,Republican,1,50,21/02/2024,Trump,North Carolina
