In [206]:
import pandas as pd
import numpy as np

file_path = "2016-general-election-trump-vs-clinton.csv"
df = pd.read_csv(file_path)


In [207]:
import numpy as np

df_clean = df.dropna(subset=["Number of Observations"])

df_clean["Clinton"] = df_clean["Clinton"] / 100
df_clean["Trump"] = df_clean["Trump"] / 100

sample_size = 40
sample = df_clean.sample(n=sample_size, replace=True)

total_voters = sample["Number of Observations"].sum()
p_hat = (sample["Clinton"] * sample["Number of Observations"]).sum() / sample["Number of Observations"].sum()

SE = np.sqrt((p_hat * (1 - p_hat)) / total_voters)

Z = 1.96  # Z-score for 95% confidence level
CI_lower = p_hat - Z * SE
CI_upper = p_hat + Z * SE

print(f"Estimated proportion (p-hat): {p_hat:.4f}")
print(f"Standard Error (SE): {SE:.6f}")
print(f"95% Confidence Interval: ({CI_lower:.4f}, {CI_upper:.4f})")


Estimated proportion (p-hat): 0.4411
Standard Error (SE): 0.001527
95% Confidence Interval: (0.4381, 0.4441)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["Clinton"] = df_clean["Clinton"] / 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["Trump"] = df_clean["Trump"] / 100


### Q2. Suppose the true population proportion 𝑝 = 0.47. Perform a Monte Carlo simulation with 𝑁 = 30 and 10^5 iterations to show that the CI derived in Question 1 captures the true proportion p approximately 95% of the time

In [208]:
sample_size = 30 
num_iterations = 1000 

number_of_p_hat_sample_in_95_interval = 0

for _ in range(num_iterations):
    sample = df_clean.sample(n=sample_size, replace=True)

    p_hat_sample = (sample["Clinton"] * sample["Number of Observations"]).sum() / sample["Number of Observations"].sum()

    if(CI_lower <= p_hat_sample and p_hat_sample <= CI_upper):
        number_of_p_hat_sample_in_95_interval += 1

print((number_of_p_hat_sample_in_95_interval / num_iterations) * 100, "%") 

9.9 %


### Q3. Load the data from the dataset into your coding workspace, and then make a data frame containing only the columns Trump, Clinton, Pollster, Start Date, Number of Observations, and Mode. Exclude any rows where the Number of Observations is missing.


In [209]:
import pandas as pd

columns_to_keep = ["Trump", "Clinton", "Pollster", "Start Date", "Number of Observations", "Mode"]
df_selected_clean = df_clean[columns_to_keep]

print(df_selected_clean.head())

df_selected_clean.to_csv("filtered_polling_data.csv", index=False)


   Trump  Clinton          Pollster  Start Date  Number of Observations  \
0   0.41     0.45     Insights West  2016-11-04                   940.0   
4   0.43     0.41          IBD/TIPP  2016-11-04                  1107.0   
5   0.41     0.45  YouGov/Economist  2016-11-04                  3669.0   
6   0.06     0.90  YouGov/Economist  2016-11-04                  1392.0   
7   0.84     0.04  YouGov/Economist  2016-11-04                  1110.0   

         Mode  
0    Internet  
4  Live Phone  
5    Internet  
6    Internet  
7    Internet  


In [210]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Load the dataset
# file_path = "2016-general-election-trump-vs-clinton.csv"  # Update with the correct file path
# df = pd.read_csv(file_path)

# # Select required columns and clean data
# columns_to_keep = ["Trump", "Clinton", "Start Date", "End Date"]
# df_selected = df[columns_to_keep].dropna()

# # Convert "Start Date" and "End Date" to datetime format
# df_selected["Start Date"] = pd.to_datetime(df_selected["Start Date"])
# df_selected["End Date"] = pd.to_datetime(df_selected["End Date"])

# # Compute the midpoint of the polling period as the best estimate for the actual poll date
# df_selected["Poll Date"] = df_selected["Start Date"] + (df_selected["End Date"] - df_selected["Start Date"]) / 2

# # Sort data by the computed "Poll Date"
# df_selected = df_selected.sort_values(by="Poll Date")

# # Set plot style
# sns.set_style("whitegrid")

# # Create the plot
# plt.figure(figsize=(12, 6))
# sns.lineplot(x=df_selected["Poll Date"], y=df_selected["Trump"], label="Trump Support (%)", color="red")
# sns.lineplot(x=df_selected["Poll Date"], y=df_selected["Clinton"], label="Clinton Support (%)", color="blue")

# # Add trend lines using LOWESS (locally weighted regression)
# sns.regplot(x=df_selected["Poll Date"].map(pd.Timestamp.toordinal), y=df_selected["Trump"], 
#             scatter=False, lowess=True, color="red", label="Trump Trend Line")
# sns.regplot(x=df_selected["Poll Date"].map(pd.Timestamp.toordinal), y=df_selected["Clinton"], 
#             scatter=False, lowess=True, color="blue", label="Clinton Trend Line")

# # Customize the plot
# plt.xlabel("Date")
# plt.ylabel("Support Percentage")
# plt.title("Time-Series Poll Results: Trump vs. Clinton")
# plt.legend()
# plt.xticks(rotation=45)

# # Show plot
# plt.show()


In [211]:
total_voters_observed = df_selected_clean["Number of Observations"].sum()

print(f"Total number of voters observed: {total_voters_observed}")

Total number of voters observed: 1940931.0


In [212]:
total_voters = df_clean["Number of Observations"].sum()

p_hat_trump = (df_clean["Trump"] * df_clean["Number of Observations"]).sum() / total_voters
p_hat_clinton = (df_clean["Clinton"] * df_clean["Number of Observations"]).sum() / total_voters

proportion_table = pd.DataFrame({
    "Candidate": ["Trump", "Clinton"],
    "Estimated Proportion": [p_hat_trump, p_hat_clinton]
})

print(proportion_table)


  Candidate  Estimated Proportion
0     Trump              0.405834
1   Clinton              0.456121
