# Partisan domain analysis

In [57]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

## Prepare data

In [58]:
# Load the query/domain counts table; the cells below rely on its 'qry',
# 'domain' and 'counts' columns.
# NOTE: 'intermedidate_files' is spelled exactly as in the path literal —
# presumably it matches the directory on disk; verify before "fixing" it.
qry_domain_path = '/net/lazer/lab-lazer/shared_projects/google_audit_reproduce/intermedidate_files/merged_summary/qry_domain.csv'
qry_domain = pd.read_csv(qry_domain_path)

In [59]:
# Lookup table for the queries; in the cells shown here only its 'qry' and
# 'party' columns are used.
# NOTE: despite the `_dir` suffix, this variable holds a CSV file path.
politician_info_dir = "../data/qry_info_house_filter.csv"
politician_info = pd.read_csv(filepath_or_buffer=politician_info_dir)

In [60]:
# Attach a party label to every query/domain row.
# The lookup is reduced to unique (qry, party) pairs so repeated rows in
# politician_info do not multiply the count rows. The left join keeps every
# row of qry_domain; 'party' is NaN for queries missing from the lookup.
# NOTE(review): a qry listed under two different parties would still duplicate
# rows here — consider validate="many_to_one" once the data is confirmed clean.
qry_party_lookup = politician_info[['qry', 'party']].drop_duplicates()
qry_party_df = qry_domain.merge(qry_party_lookup, how="left", on='qry')

In [61]:
# Sanity check: total of 'counts' right after the party join.
qry_party_df['counts'].sum()

299573378

In [62]:
# Remove incomplete rows. With these (default) settings a row is dropped when
# ANY of its columns is NaN — not only when 'party' is missing after the left
# join.
# NOTE(review): if the intent is only to discard unmatched queries,
# dropna(subset=['party']) would be narrower — confirm before changing.
# In the saved outputs the 'counts' total is the same before and after this
# cell (299573378).
qry_party_df = qry_party_df.dropna(axis=0, how='any')

In [63]:
# Sanity check: total of 'counts' after dropping rows that contain NaN.
qry_party_df['counts'].sum()

299573378

In [64]:
# Collapse to one row per (domain, party) pair with the summed counts.
# dropna=True leaves out groups whose key contains NaN; as_index=False keeps
# 'domain' and 'party' as ordinary columns instead of a MultiIndex.
group_keys = ['domain', 'party']
party_domain = (
    qry_party_df
    .groupby(group_keys, as_index=False, dropna=True)['counts']
    .sum()
)

In [65]:
# Sanity check: total of 'counts' after aggregating by (domain, party).
party_domain['counts'].sum()

299573378

In [66]:
# Show the aggregated table: one row per (domain, party) pair with its summed counts.
party_domain

Unnamed: 0,domain,party,counts
0,1011now.com,Democrat,5392
1,1011now.com,Republican,79449
2,10news.com,Democrat,10770
3,10tv.com,Democrat,22513
4,10tv.com,Republican,19984
...,...,...,...
5558,yubanet.com,Democrat,2333
5559,zanesvilletimesrecorder.com,Republican,4882
5560,zeldinforcongress.com,Republican,47752
5561,zerotothree.org,Democrat,31


In [67]:
# Persist the per-(domain, party) totals; index=False keeps the row index out
# of the written file.
party_domain_csv = '../data/house_analysis/house_party_domain_no_duplicate.csv'
party_domain.to_csv(party_domain_csv, index=False)

## Basic stats analysis

In [68]:
# Grand total of 'counts' over every row of party_domain (all party labels);
# used below as the denominator for each party's overall share.
sum_all_domain = party_domain['counts'].sum()

In [69]:
# Display the grand total of counts.
sum_all_domain

299573378

In [70]:
# Keep only the rows whose party label is exactly 'Democrat'.
is_democrat = party_domain['party'].eq('Democrat')
dem_domain = party_domain.loc[is_democrat]

In [71]:
# Order the Democrat rows from the largest to the smallest count.
dem_domain = dem_domain.sort_values(by='counts', ascending=False)

In [72]:
# Total counts over the Democrat rows, then each domain's share of that total.
# The share is normalised by the Democrat total only (not the grand total),
# so the 'proportion' column sums to 1 within this party.
sum_dem = dem_domain['counts'].sum()
dem_domain = dem_domain.assign(proportion=dem_domain['counts'] / sum_dem)

In [73]:
# Display the total counts over rows whose party is 'Democrat'.
sum_dem

167929247

In [74]:
# Share of the grand total that comes from rows labelled 'Democrat'.
# In the saved outputs this share and the Republican one (0.5606 + 0.4370) add
# up to slightly less than 1, so party_domain presumably also holds other
# party labels — verify against the data.
sum_dem/sum_all_domain

0.5605613159657998

In [75]:
# Republican counterpart of the Democrat cells: filter, sort by count
# (largest first), total the counts, and add each domain's share of that
# Republican total as 'proportion'.
is_republican = party_domain['party'].eq('Republican')
rep_domain = party_domain.loc[is_republican].sort_values(by='counts', ascending=False)
sum_rep = rep_domain['counts'].sum()
rep_domain = rep_domain.assign(proportion=rep_domain['counts'] / sum_rep)

In [76]:
# Display the total counts over rows whose party is 'Republican'.
sum_rep

130919252

In [77]:
# Share of the grand total that comes from rows labelled 'Republican'.
sum_rep/sum_all_domain

0.43701897970386405

In [78]:
# Put the two within-party proportions side by side, one row per domain.
# The outer join keeps domains that appear for only one party (the other side
# becomes NaN). Both inputs share the column name 'proportion', so pandas'
# default suffixes apply: 'proportion_x' is the Democrat (left) value and
# 'proportion_y' the Republican (right) value.
dem_props = dem_domain[['domain', 'proportion']]
rep_props = rep_domain[['domain', 'proportion']]
dem_rep_prop = dem_props.merge(rep_props, how="outer", on='domain')

In [79]:
# A domain seen for only one party has no value on the other side of the outer
# merge; treat that missing proportion as 0.
for prop_col in ('proportion_x', 'proportion_y'):
    dem_rep_prop[prop_col] = dem_rep_prop[prop_col].fillna(0)

In [80]:
# Replace the merge suffixes with explicit party names:
# left side (_x) came from dem_domain, right side (_y) from rep_domain.
prop_labels = {"proportion_x": "dem_prop", "proportion_y": "rep_prop"}
dem_rep_prop = dem_rep_prop.rename(columns=prop_labels)

In [81]:
# Show the per-domain Democrat/Republican proportions (0 where the domain had
# no row for that party).
dem_rep_prop

Unnamed: 0,domain,dem_prop,rep_prop
0,1011now.com,3.210876e-05,0.000607
1,10news.com,6.413415e-05,0.000000
2,10tv.com,1.340624e-04,0.000153
3,11alive.com,3.173479e-04,0.000356
4,12news.com,9.466487e-05,0.000073
...,...,...,...
4214,yubanet.com,1.389276e-05,0.000000
4215,zanesvilletimesrecorder.com,0.000000e+00,0.000037
4216,zeldinforcongress.com,0.000000e+00,0.000365
4217,zerotothree.org,1.846016e-07,0.000000


In [82]:
# Persist the per-domain proportion table; index=False keeps the row index out
# of the written file.
dem_rep_prop_csv = '../data/house_analysis/domain_dem_rep_prop.csv'
dem_rep_prop.to_csv(dem_rep_prop_csv, index=False)