# Partisan domain analysis

In [28]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

## Prepare data

In [29]:
# Load the query/domain table; the cells below use its 'qry', 'domain' and 'counts' columns.
# NOTE(review): absolute, machine-specific path, and the 'intermedidate_files' segment
# looks misspelled — confirm it matches the directory name on disk before changing it.
qry_domain = pd.read_csv('/net/lazer/lab-lazer/shared_projects/google_audit_reproduce/intermedidate_files/merged_summary/cleaned_house_qry_domain.csv')

In [30]:
# Location of the per-query metadata CSV (despite the `_dir` suffix this is a file path).
politician_info_dir = "../data/qry_info.csv"
# Only the 'qry' and 'party' columns of this table are used in the cells shown here.
politician_info = pd.read_csv(politician_info_dir)

In [31]:
# Attach a party label to every query/domain row.
# drop_duplicates() only removes exact (qry, party) repeats, so a query listed
# under two different parties would make the left join repeat that query's rows
# and silently inflate every `counts` total computed below.
# validate="many_to_one" makes pandas raise a MergeError in that case instead.
qry_party_lookup = politician_info[['qry', 'party']].drop_duplicates()
qry_party_df = pd.merge(
    qry_domain,
    qry_party_lookup,
    how="left",
    on='qry',
    validate="many_to_one",
)

In [32]:
# Total of `counts` after the merge, before any rows with missing values are removed.
qry_party_df['counts'].sum()

np.int64(660900513)

In [33]:
# Drop rows that cannot take part in the (domain, party) aggregation, e.g. rows
# whose query received no party from the left merge.
# The check is limited to the columns used downstream: a bare dropna() would also
# discard rows just because some unrelated column of the input table is empty.
qry_party_df = qry_party_df.dropna(subset=['party', 'domain', 'counts'])

In [34]:
# Total of `counts` that remains after the rows with missing values were dropped.
qry_party_df['counts'].sum()

np.int64(633984769)

In [35]:
# Collapse to one row per (domain, party) pair, summing `counts` over all queries.
party_domain = (
    qry_party_df
    .groupby(['domain', 'party'], as_index=False, dropna=True)['counts']
    .sum()
)

In [36]:
# Total after aggregation; compare with the pre-aggregation total displayed earlier.
party_domain['counts'].sum()

np.int64(633984769)

In [37]:
# Write the per-(domain, party) totals to disk without the index column.
party_domain.to_csv(
    '../data/house_analysis/house_party_domain_no_duplicate.csv',
    index=False,
)

## Basic stats analysis

In [38]:
# Grand total of `counts` over every party label present in party_domain, not only
# 'Democrat' and 'Republican' (in the recorded run those two shares sum to about
# 0.996, so the per-party shares computed below need not add up to 1).
sum_all_domain = party_domain['counts'].sum()

In [39]:
# Display the grand total.
sum_all_domain

np.int64(633984769)

In [40]:
# Keep only the rows whose party label is exactly 'Democrat'.
dem_domain = party_domain.loc[party_domain['party'].eq('Democrat')]

In [41]:
# Order domains from the largest to the smallest count.
dem_domain = dem_domain.sort_values(by='counts', ascending=False)

In [42]:
# Express each domain's count as a share of the Democrat total.
sum_dem = dem_domain['counts'].sum()
dem_domain = dem_domain.assign(proportion=dem_domain['counts'].div(sum_dem))

In [43]:
# Display the total of `counts` over the 'Democrat' rows.
sum_dem

np.int64(351406517)

In [44]:
# Share of the grand total (all party labels) contributed by the 'Democrat' rows.
sum_dem/sum_all_domain

np.float64(0.5542822701470924)

In [45]:
# Republican counterpart of the Democrat cells: filter, sort by count, then
# express each domain's count as a share of the Republican total.
rep_domain = (
    party_domain
    .loc[party_domain['party'].eq('Republican')]
    .sort_values(by='counts', ascending=False)
)
sum_rep = rep_domain['counts'].sum()
rep_domain = rep_domain.assign(proportion=rep_domain['counts'].div(sum_rep))

In [46]:
# Display the total of `counts` over the 'Republican' rows.
sum_rep

np.int64(280077280)

In [47]:
# Share of the grand total (all party labels) contributed by the 'Republican' rows.
sum_rep/sum_all_domain

np.float64(0.4417728842946383)

In [48]:
# Outer-join on domain so that domains seen for only one party are retained.
# Both inputs carry a 'proportion' column, so pandas' default suffixes yield
# proportion_x (Democrat, left side) and proportion_y (Republican, right side).
dem_props = dem_domain[['domain', 'proportion']]
rep_props = rep_domain[['domain', 'proportion']]
dem_rep_prop = dem_props.merge(rep_props, how="outer", on='domain')

In [49]:
# A domain that appears for only one party has NaN on the other side of the
# outer merge; record that as a share of 0.
prop_cols = ['proportion_x', 'proportion_y']
dem_rep_prop[prop_cols] = dem_rep_prop[prop_cols].fillna(0)

In [50]:
# Replace the merge-suffixed column names with party-specific ones.
dem_rep_prop = dem_rep_prop.rename(
    columns={"proportion_x": "dem_prop", "proportion_y": "rep_prop"}
)

In [51]:
# Preview the per-domain party shares (the recorded output elides the middle rows).
dem_rep_prop

Unnamed: 0,domain,dem_prop,rep_prop
0,Katko,0.000000e+00,3.570443e-09
1,100e53.com,2.561136e-08,0.000000e+00
2,1011now.com,2.148793e-05,4.551387e-04
3,101espn.com,0.000000e+00,1.553143e-06
4,1057news.com,2.845707e-09,0.000000e+00
...,...,...,...
8846,zimmerlaw.com,1.138283e-08,0.000000e+00
8847,zocdoc.com,8.821692e-08,0.000000e+00
8848,zoelofgren.com,2.409289e-04,0.000000e+00
8849,zoominfo.com,1.337482e-07,0.000000e+00


In [52]:
# Write the per-domain Democrat/Republican shares to disk without the index column.
dem_rep_prop.to_csv(
    '../data/house_analysis/domain_dem_rep_prop.csv',
    index=False,
)