In [ ]:
#### Preamble ####
# Purpose: This script is used as a sandbox to review the data before clean the raw data and preparing it for analysis.
# Author: Aman Rana, Shanjie Jiao, Kevin Shen
# Date: 25 October 2024
# Contact: aman.rana@mail.utoronto.ca
# License: MIT
# Pre-requisites: `pandas` must be installed (pip install pandas)
# `Jupyter Notebook` must be installed (pip install jupyter)

In [313]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [314]:
df = pd.read_csv('../data/01-raw_data/raw_data.csv')
df = df[df['numeric_grade'] >= 3.0]
df['pct'] = df['pct'] / 100

In [315]:
df.head()

Unnamed: 0,poll_id,pollster_id,pollster,sponsor_ids,sponsors,display_name,pollster_rating_id,pollster_rating_name,numeric_grade,pollscore,...,stage,nationwide_batch,ranked_choice_reallocated,ranked_choice_round,hypothetical,party,answer,candidate_id,candidate_name,pct
128,88762,568,YouGov,133.0,CBS News,YouGov,391,YouGov,3.0,-1.1,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,0.48
129,88762,568,YouGov,133.0,CBS News,YouGov,391,YouGov,3.0,-1.1,...,general,False,False,,False,REP,Trump,16651,Donald Trump,0.51
146,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,general,False,False,,False,DEM,Harris,16661,Kamala Harris,0.48
147,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,general,False,False,,False,REP,Trump,16651,Donald Trump,0.45
148,88710,568,YouGov,352.0,Economist,YouGov,391,YouGov,3.0,-1.1,...,general,False,False,,False,GRE,Stein,31116,Jill Stein,0.01


In [316]:
df['pollster'].value_counts()

pollster
YouGov                                      843
Siena/NYT                                   674
Marquette Law School                        278
YouGov Blue                                  35
ABC/Washington Post                          18
The Washington Post                          14
YouGov/Center for Working Class Politics      2
McCourtney Institute/YouGov                   2
Name: count, dtype: int64

In [317]:
df.columns

Index(['poll_id', 'pollster_id', 'pollster', 'sponsor_ids', 'sponsors',
       'display_name', 'pollster_rating_id', 'pollster_rating_name',
       'numeric_grade', 'pollscore', 'methodology', 'transparency_score',
       'state', 'start_date', 'end_date', 'sponsor_candidate_id',
       'sponsor_candidate', 'sponsor_candidate_party', 'endorsed_candidate_id',
       'endorsed_candidate_name', 'endorsed_candidate_party', 'question_id',
       'sample_size', 'population', 'subpopulation', 'population_full',
       'tracking', 'created_at', 'notes', 'url', 'url_article', 'url_topline',
       'url_crosstab', 'source', 'internal', 'partisan', 'race_id', 'cycle',
       'office_type', 'seat_number', 'seat_name', 'election_date', 'stage',
       'nationwide_batch', 'ranked_choice_reallocated', 'ranked_choice_round',
       'hypothetical', 'party', 'answer', 'candidate_id', 'candidate_name',
       'pct'],
      dtype='object')

In [318]:
#relevant columns: 'pct', 'state', 'created_at', 'poll_score', 'partisan', 'pollster_id', 'numeric_grade', 
# 'party'

df_lean = df[['pct', 'state', 'start_date', 'end_date' , 'pollscore', 'partisan', 'pollster_id', 'pollster' ,'party', 'sample_size']]
df_lean['partisan'].fillna('Non-Partisan', inplace=True)
df_lean['start_date'] = pd.to_datetime(df_lean['start_date'])
df_lean['end_date'] = pd.to_datetime(df_lean['end_date'])
df_lean = df_lean[df_lean['start_date'] >= '2024-07-21'].reset_index(drop=True)
df_lean.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_lean['partisan'].fillna('Non-Partisan', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_lean['partisan'].fillna('Non-Partisan', inplace=True)
  df_lean['start_date'] = pd.to_datetime(df_lean['start_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.htm

Unnamed: 0,pct,state,start_date,end_date,pollscore,partisan,pollster_id,pollster,party,sample_size
0,0.48,Arizona,2024-10-11,2024-10-16,-1.1,Non-Partisan,568,YouGov,DEM,1435.0
1,0.51,Arizona,2024-10-11,2024-10-16,-1.1,Non-Partisan,568,YouGov,REP,1435.0
2,0.48,,2024-10-12,2024-10-15,-1.1,Non-Partisan,568,YouGov,DEM,1457.0
3,0.45,,2024-10-12,2024-10-15,-1.1,Non-Partisan,568,YouGov,REP,1457.0
4,0.01,,2024-10-12,2024-10-15,-1.1,Non-Partisan,568,YouGov,GRE,1457.0


In [319]:
df_lean['state'].value_counts()
#We notice Maine CD-1 and Maine CD-2 are separate states.
#We notice Nebraska CD-1, Nebraska CD-2. 
#We retain these as separate states as they are characteristicaly different.

state
Wisconsin         102
Pennsylvania       66
Michigan           46
Arizona            42
North Carolina     38
Georgia            30
Texas              22
Ohio               16
Nevada             16
Montana            12
Florida            12
Nebraska CD-2      12
Missouri            2
Name: count, dtype: int64

In [320]:
print(df_lean.isna().sum() / len(df_lean))

pct            0.000000
state          0.353033
start_date     0.000000
end_date       0.000000
pollscore      0.000000
partisan       0.000000
pollster_id    0.000000
pollster       0.000000
party          0.000000
sample_size    0.037325
dtype: float64


In [321]:
df_lean['partisan'].value_counts()
#drop partisan polls.

partisan
Non-Partisan    637
DEM               6
Name: count, dtype: int64

In [322]:
df_lean = df_lean[df_lean['partisan'] == 'Non-Partisan'].reset_index(drop=True)

In [323]:
df_lean['pollscore'].value_counts()

pollscore
-1.5    314
-1.1    309
-1.2     14
Name: count, dtype: int64

In [324]:
df_lean['pollster_id'].value_counts()

pollster_id
1424    314
568     217
1075     88
486      14
1901      2
1700      2
Name: count, dtype: int64

In [325]:
#drop id 1492, 1704, 486, and 1700 for having very few polls.
df_lean = df_lean[~df_lean['pollster_id'].isin([1492, 1704, 486, 1700, 1901])].reset_index(drop=True)

In [326]:
df_lean.describe()

Unnamed: 0,pct,start_date,end_date,pollscore,pollster_id,sample_size
count,619.0,619,619,619.0,619.0,595.0
mean,0.264782,2024-08-31 05:07:04.555735040,2024-09-04 21:21:48.562197248,-1.302908,1074.300485,1031.136134
min,0.0,2024-07-21 00:00:00,2024-07-23 00:00:00,-1.5,568.0,500.0
25%,0.01,2024-08-09 00:00:00,2024-08-14 00:00:00,-1.5,568.0,677.0
50%,0.42,2024-09-01 00:00:00,2024-09-05 00:00:00,-1.5,1424.0,798.0
75%,0.47,2024-09-21 00:00:00,2024-09-26 00:00:00,-1.1,1424.0,1195.5
max,0.57,2024-10-12 00:00:00,2024-10-16 00:00:00,-1.1,1424.0,3385.0
std,0.225898,,,0.200141,390.056099,595.084212


In [327]:
df_lean['end_date'].value_counts()

end_date
2024-09-26    68
2024-08-23    48
2024-10-10    44
2024-09-21    36
2024-08-01    34
2024-08-08    32
2024-08-14    32
2024-08-15    32
2024-10-04    26
2024-09-16    24
2024-08-26    19
2024-09-06    18
2024-09-05    18
2024-10-06    18
2024-08-09    16
2024-07-24    16
2024-09-13    14
2024-10-08    12
2024-08-16    10
2024-07-23    10
2024-09-24    10
2024-10-15     8
2024-08-02     8
2024-10-01     8
2024-10-07     8
2024-08-31     6
2024-08-06     5
2024-07-30     5
2024-08-20     5
2024-08-13     5
2024-09-03     4
2024-08-27     4
2024-09-10     4
2024-09-17     4
2024-08-22     2
2024-09-20     2
2024-10-11     2
2024-10-16     2
Name: count, dtype: int64

In [329]:
list(df_lean['state'].unique())

['Arizona',
 nan,
 'Pennsylvania',
 'Texas',
 'Montana',
 'Florida',
 'Nebraska CD-2',
 'Michigan',
 'Ohio',
 'Wisconsin',
 'Georgia',
 'North Carolina',
 'Missouri',
 'Nevada']