## Import package

In [None]:
!pip install factor_analyzer pingouin stargazer

In [183]:
# for data ETL
import pandas as pd
import numpy as np
import ETL_function
# for data EDA
import plotly.express as px
import seaborn as sns               
import matplotlib.pyplot as plt   
# for establish index
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo
import pingouin as pg # index reliability testing
# for stats ml
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.miscmodels.ordinal_model import OrderedModel
from stargazer.stargazer import Stargazer

## Load data and filter

In [157]:
# Load the raw survey export; the path is relative to the working directory.
raw_data = pd.read_csv('raw_data.csv')

In [159]:
# Keep only respondents whose answer to the party-leaning question is
# '沒有特定支持' (no particular party) or '都不支持' (support none), and remove
# the consent checkbox and the e-mail column used for the prize draw.
# The drop is done out of place and followed by an explicit copy so that
# filter_data is an independent frame: the later column assignments then
# neither raise SettingWithCopyWarning nor depend on raw_data.
filter_data = (
    raw_data.loc[raw_data['目前國內政黨當中，請問您是否偏向哪一個政黨？'].isin(['沒有特定支持', '都不支持'])]
    .drop(columns=['您有絶對的權力決定是否要參與本研究。若您願意參與，請務必勾選下列選項：', '請填寫您的電子信箱，以利後續抽獎聯繫使用'])
    .copy()
)

## Data Cleaning

#### Recode variable for establish index

In [160]:
# Recode the survey items that feed the political-participation indexes.
# Each row is (new column, source question, ETL_function recoder); rows are
# applied top to bottom, so the new columns are appended in this order.
# NOTE(review): the recoders live in ETL_function, which is not shown here;
# their output coding is presumably numeric -- confirm against that module.
index_item_recodes = [
    # Average daily time spent following election news on TV.
    ('TV_news_time', '請問您平均每天花多少時間注意電視上的選舉新聞？', ETL_function.news_time),
    # Average daily time spent following election news in newspapers.
    ('news_paper_time', '請問您平均每天花多少時間注意報紙上的選舉新聞？', ETL_function.news_time),
    # Average daily time spent following election news online.
    ('int_news_time', '請問您平均每天花多少時間注意網路上的選舉新聞？', ETL_function.news_time),
    # Watched the televised candidate debates in this election.
    ('TV_debate', '今年九合一選舉中，我有觀看候選人電視辯論會。', ETL_function.TV_debate),
    # How often the respondent browses candidates' social media.
    ('read_media', '請問您有多常瀏覽候選人的社群媒體？', ETL_function.read_media),
    # How often the respondent reacts (any emoji) to candidates' posts.
    ('like_media', '請問您有多常在候選人的社群媒體貼文下按讚（任何表情符號）？', ETL_function.like_media),
    # How often the respondent shares candidates' posts.
    ('share_media', '請問您有多常分享候選人的社群媒體貼文？', ETL_function.share_media),
    # How often the respondent comments under candidates' posts.
    ('comment_media', '請問您有多常在候選人的社群媒體貼文底下留言？', ETL_function.comment_media),
    # Whether the respondent discusses politics or elections online.
    ('int_discuss', '請問您平時會不會在網路上與他人討論有關政治或選舉方面的議題？', ETL_function.int_discuss),
    # Read the official election bulletin.
    ('read_election_news', '今年九合一選舉中，我有閱讀選舉公報。', ETL_function.Likert),
    # Read candidate leaflets, bulletins or newspaper advertisements.
    ('read_election_leaflet', '今年九合一選舉中，我有閱讀候選人傳單、快報或報刊廣告。', ETL_function.Likert),
    # How often the respondent persuades relatives or friends to vote for a candidate.
    ('convince', '請問您有多常遊說或勸說親友投票給某位候選人？', ETL_function.convince),
    # Actively attended campaign rallies.
    ('campaign', '今年九合一選舉中，我有主動參加造勢活動。', ETL_function.Likert),
    # Served as campaign staff or volunteer for a candidate or party.
    ('volunteer', '今年九合一選舉中，我有擔任候選人或政黨的助選工作人員或義工。', ETL_function.Likert),
    # Likelihood of voting in the mayoral / county magistrate election.
    ('election_mayor', '請問今年縣市長選舉中，您是否可能去投票？', ETL_function.election),
    # Likelihood of voting in the age-18 citizenship constitutional referendum.
    ('election_18', '請問今年18歲公民權修憲公投，您是否可能去投票？', ETL_function.election),
]

for new_column, question, recoder in index_item_recodes:
    filter_data[new_column] = filter_data[question].apply(recoder)

#### Recode IV variables

##### Demographic variables 

In [187]:
# sex: recode the respondent's gender.
filter_data['sex'] = filter_data['受訪者性別'].apply(ETL_function.sex)

# ethnic: respondents who picked '其他' (other) typed their answer into the
# free-text column '其他.6'; substitute that text into the main question
# before recoding. Series.mask does this in one vectorised step instead of a
# Python-level lambda per row, and also works when the frame has no rows.
ethnic_question = '台灣社會有多重的身份認同，有人認為自己是本省客家人、本省閩南人、大陸各省市人或原住民，請問您認為您的認同較接近哪一個?'
filter_data[ethnic_question] = filter_data[ethnic_question].mask(
    filter_data[ethnic_question] == '其他',
    filter_data['其他.6'],
)
filter_data['ethnic'] = filter_data[ethnic_question].apply(ETL_function.ethnic)

# edu: recode education level.
filter_data['edu'] = filter_data['請問您的教育程度是什麼？'].apply(ETL_function.edu)

# income: recode average monthly income.
filter_data['income'] = filter_data['請問您平均月收入大約是多少?'].apply(ETL_function.income)

##### Political Knowledge

In [None]:
# Recode the three political-knowledge quiz questions (current US president,
# current premier, which body interprets the constitution), each with its own
# ETL_function recoder.
pk_items = {
    'pk_1': ('請問您現任的美國總統是誰？', ETL_function.pk_1),
    'pk_2': ('請問您現任的行政院長是誰？', ETL_function.pk_2),
    'pk_3': ('請問您我國哪一個政府機關有權解釋憲法？', ETL_function.pk_3),
}
for pk_column, (pk_question, pk_recoder) in pk_items.items():
    filter_data[pk_column] = filter_data[pk_question].apply(pk_recoder)

# Additive index over the three items. Element-wise addition is used on
# purpose (rather than a NaN-skipping row sum) so the result matches `a + b + c`.
filter_data['political_knowledge'] = filter_data['pk_1'].add(filter_data['pk_2']).add(filter_data['pk_3'])

##### Anti Party Sentiment

In [None]:
# Survey questions measuring anti-party sentiment, in questionnaire order.
anti_party_columns = [
    '您是否同意台灣民眾討厭國民黨及民進黨的比例，正在逐年上升中？',
    '有人說：「為了勝選，無論那一個政黨都會利用機會影響選舉的公平性」，請問您同不同意這種說法？',
    '有人說：「現在無論那一個政黨執政，都不可能把國家治理好」，請問您同不同意這種說法？',
    '有人說：「政黨總是相互批評，但實際上它們並無差別」，請問您同不同意這種說法？',
    '有人說：「政黨只會讓社會分裂？」，請問您同不同意這種說法？',
]

# Recode each question into anti_1 ... anti_5 (numbering starts at 1).
for item_no, question in enumerate(anti_party_columns, start=1):
    filter_data[f'anti_{item_no}'] = filter_data[question].apply(ETL_function.five_agree)

##### TW China Issue

In [None]:
# TC_issue: agreement with the statement that "resisting China and protecting
# Taiwan" matters in this election, recoded via ETL_function.five_agree.
filter_data['TC_issue'] = filter_data['有人說：「這次選舉中，抗中保台很重要」，請問您同不同意這種說法？'].apply(ETL_function.five_agree)

##### Negative media

In [None]:
# Copy the three negative-coverage questions under short column names.
# The answers are carried over unchanged (no recoder is applied here); they
# are later used as categorical predictors in the regression formulas.
negative_items = {
    # Coverage of the Lin Chih-chien thesis controversy.
    'Negative_1': '在這次縣市長選舉過程中，有大量關於林智堅的論文爭議的報導，請問這些報導對您的投票選擇可能有什麼影響？',
    # Coverage of Chen Shih-chung's pandemic-response record.
    'Negative_2': '在這次縣市長選舉過程中，有大量關於陳時中的防疫表現的報導，這些報導對您的投票選擇可能有什麼影響？',
    # Coverage of the Kao Hung-an assistant-pay controversy.
    'Negative_3': '在這次縣市長選舉過程中，有大量關於高虹安的助理門事件爭議的報導，這些報導對您的投票選擇有什麼影響？',
}
for short_name, source_question in negative_items.items():
    filter_data[short_name] = filter_data[source_question]

#### reshape data frame for modeling

In [161]:
# Build the modelling frame: keep only the recoded columns and drop every
# respondent with a missing value in any of them (listwise deletion).
# The trailing .copy() makes ml_df an independent frame, because later cells
# add score/index columns to it; without the copy those assignments can raise
# SettingWithCopyWarning on the dropna() result.
ml_df = filter_data.loc[:, ['TV_news_time', 'news_paper_time',
       'int_news_time', 'TV_debate', 'read_media', 'like_media', 'share_media',
       'comment_media', 'int_discuss', 'read_election_news',
       'read_election_leaflet', 'convince', 'campaign', 'volunteer',
       'election_mayor', 'election_18', 'sex', 'ethnic', 'edu', 'income',
       'pk_1', 'pk_2', 'pk_3', 'political_knowledge', 'anti_1', 'anti_2',
       'anti_3', 'anti_4', 'anti_5', 'TC_issue', 'Negative_1', 'Negative_2',
       'Negative_3']]
ml_df = ml_df.dropna().copy()

In [None]:
# Show how many rows/columns survived the listwise deletion and which columns are present.
display(ml_df.shape, ml_df.columns)

### Bartlett’s test and Kaiser-Meyer-Olkin 

In [None]:
# The sixteen participation items that go into the factor analysis
# (demographics and attitude items are excluded).
selected_variables = [
    'TV_news_time', 'news_paper_time', 'int_news_time', 'TV_debate',
    'read_media', 'like_media', 'share_media', 'comment_media',
    'int_discuss', 'read_election_news', 'read_election_leaflet',
    'convince', 'campaign', 'volunteer',
    'election_mayor', 'election_18',
]
ml_df_selected = ml_df.loc[:, selected_variables]

In [None]:
# Bartlett's test of sphericity: tests the observed correlation matrix against
# the identity matrix, i.e. whether the selected items intercorrelate at all.
# The last line is a bare expression so the notebook prints both values.
chi_square_value,p_value=calculate_bartlett_sphericity(ml_df_selected)
chi_square_value, p_value

In [None]:
# Kaiser-Meyer-Olkin (KMO) test: measures how suitable the data are for factor
# analysis. calculate_kmo returns per-item values (kmo_all) and the overall
# value (kmo_model); only the overall value is displayed.
kmo_all,kmo_model=calculate_kmo(ml_df_selected)
kmo_model

### 依變數的檢測結果
#### Bartlett的球形檢測結果：

* 卡方值 (Chi-square): 2616.18
* p值: 0.0

這個結果表示我們可以拒絕虛無假設（即，觀察到的變數之間沒有相關性）。換句話說，我們的數據集中的變數之間存在一定的相關性，這使得我們可以進行因素分析

#### Kaiser-Meyer-Olkin (KMO) 檢測的結果: 0.77

KMO 測試的值範圍為 0 到 1。值越接近 1，表示數據更適合進行因素分析。一般來說，KMO 值大於 0.6 被認為是適合進行因素分析的。

在這個案例中，KMO 值為 0.77，這表示我們的數據是適合進行因素分析的。


In [None]:
# Fit a 4-factor model with varimax rotation on the selected items.
fa = FactorAnalyzer(n_factors=4, rotation='varimax')
fa.fit(ml_df_selected)

# Loadings matrix: one row per item, one column per factor (columns 0..3).
loadings = fa.loadings_
loadings_df = pd.DataFrame(loadings, index=ml_df_selected.columns)

# For every item, record the factor with the largest loading and that loading.
# NOTE(review): both use the signed loading, so an item whose strongest
# loading is negative is assigned by its largest positive one -- confirm this
# is intended rather than the largest absolute loading.
factor_assignment_df = loadings_df.idxmax(axis=1).to_frame(name='group_name')
max_loading_df = loadings_df.max(axis=1).to_frame(name='max_loading')

# Put assignment and loading side by side, then group items by factor.
factor_analysis_results_df = pd.concat([factor_assignment_df, max_loading_df], axis=1)
factor_analysis_results_df_sorted = factor_analysis_results_df.sort_values(by='group_name')

factor_analysis_results_df_sorted

留下 loading > 0.5 的

In [None]:
# Keep only the items whose strongest loading exceeds the 0.5 cut-off.
has_strong_loading = factor_analysis_results_df_sorted['max_loading'] > 0.5
factor_analysis_results_df_filtered = factor_analysis_results_df_sorted.loc[has_strong_loading]

factor_analysis_results_df_filtered

#### 由上述結果可以發現依變數可以分為4個構面：

* 線上媒體政治參與(online_media_pp) : read_media / like_media / share_media / comment_media

-> reference : 劉嘉薇，2019
* 投票參與(voting) : election_mayor / election_18

-> reference : Barnes and  Kaase (1979) 

* 線下媒體政治參與(offline_media_pp) : read_election_news / read_election_leaflet

-> reference : 徐火炎，2001
* 競選工作式政治參與(campaign_worker_pp) : campaign / volunteer

-> reference : Milbrath and Goel，1977

## Establish Index and adjust with Z-score

### Establish index

#### Method 1：變數值*加載值

In [None]:
# Label the four loading columns Factor1 ... Factor4 (replacing the positional
# 0..3 labels) so later cells can refer to factors by name.
loadings_df.columns = [f'Factor{position}' for position in range(1, 5)]

loadings_df.columns

In [None]:
# Item lists for each index; the names are reused by the scoring,
# reliability and modelling cells.

# Online-media political participation.
online_media_pp_vars = [
    'read_media',
    'like_media',
    'share_media',
    'comment_media',
]
# Voting participation.
voting_vars = [
    'election_mayor',
    'election_18',
]
# Offline-media political participation.
offline_media_pp_vars = [
    'read_election_news',
    'read_election_leaflet',
]
# Campaign-work political participation.
campaign_worker_pp_vars = [
    'campaign',
    'volunteer',
]
# Anti-party sentiment items.
anti_party_vars = [
    'anti_1',
    'anti_2',
    'anti_3',
    'anti_4',
    'anti_5',
]

In [None]:
# Loading-weighted index scores: each score is the sum of its items multiplied
# by their loadings on the factor paired with that index below.
# NOTE(review): the index -> factor pairing is hard-coded; the column order
# of the fitted loadings is not checked here, so confirm that Factor1..Factor4
# really correspond to these constructs for the current fit.
score_columns = ['online_media_pp_score', 'voting_score', 'offline_media_pp_score', 'campaign_worker_pp_score']
weighted_score_spec = {
    'online_media_pp_score': (online_media_pp_vars, 'Factor1'),
    'voting_score': (voting_vars, 'Factor2'),
    'offline_media_pp_score': (offline_media_pp_vars, 'Factor3'),
    'campaign_worker_pp_score': (campaign_worker_pp_vars, 'Factor4'),
}
for score_name, (item_vars, factor_name) in weighted_score_spec.items():
    # The loading Series is indexed by item name, so it aligns with the item columns.
    item_weights = loadings_df.loc[item_vars, factor_name]
    ml_df[score_name] = ml_df[item_vars].mul(item_weights, axis='columns').sum(axis=1)

# Standardise the four scores in place to mean 0 and standard deviation 1.
scaler = StandardScaler()
ml_df[score_columns] = scaler.fit_transform(ml_df[score_columns])

ml_df[score_columns]

#### Method 2：直接取平均

In [None]:
# Unweighted indexes: each index is the row-wise mean of its items.
simple_mean_spec = {
    'online_media_pp': online_media_pp_vars,
    'voting': voting_vars,
    'offline_media_pp': offline_media_pp_vars,
    'campaign_worker_pp': campaign_worker_pp_vars,
    'anti_party': anti_party_vars,
}
for index_name, index_items in simple_mean_spec.items():
    ml_df[index_name] = ml_df[index_items].mean(axis=1)

### Index testing

In [None]:
# Descriptive statistics for the indexes built by the two methods:
# loading-weighted scores (score_vars) and simple means (mean_vars).
score_vars = ['online_media_pp_score', 'voting_score', 'offline_media_pp_score', 'campaign_worker_pp_score']
mean_vars = ['online_media_pp', 'voting', 'offline_media_pp', 'campaign_worker_pp']

score_descriptive = ml_df[score_vars].describe()
mean_descriptive = ml_df[mean_vars].describe()

# Show the weighted-score table first, then the simple-mean table.
for summary_table in (score_descriptive, mean_descriptive):
    display(summary_table)

兩種建立指標的方式之間相關性很高，代表Parallel forms reliability符合

In [None]:
# Reliability helper used for both index-construction methods.
def calculate_factor_reliability(factor_vars, factor_score):
    """Return a stepped-up reliability coefficient for one index.

    The row-wise mean of ``factor_vars`` (read from the module-level ``ml_df``)
    is correlated with ``factor_score``; the Pearson r is then passed through
    ``k * r / (1 + (k - 1) * r)`` with ``k = len(factor_vars)``.

    NOTE(review): this expression has the Spearman-Brown form applied to the
    mean-vs-score correlation; it is not Cronbach's alpha computed from item
    variances. When ``factor_score`` is itself the item mean, r is 1 and the
    result is 1 regardless of the data -- confirm this is the intended measure.

    Args:
        factor_vars: column names in ``ml_df`` that make up the index.
        factor_score: the index values, one per row of ``ml_df``.

    Returns:
        The coefficient as a float.
    """
    n_items = len(factor_vars)
    item_mean = ml_df[factor_vars].mean(axis=1)
    # Only the correlation coefficient is needed; the p-value is discarded.
    corr, _ = pearsonr(item_mean, factor_score)
    return n_items * corr / (1 + (n_items - 1) * corr)

# Reliability coefficient of every index, once for the loading-weighted scores
# and once for the simple means. Each Series is built in one step from a dict
# with an explicit float dtype: creating an empty pd.Series(index=...) without
# a dtype emits a warning on pandas 1.x and yields object dtype on pandas 2.x.
reliability_items_score = {
    'online_media_pp_score': online_media_pp_vars,
    'voting_score': voting_vars,
    'offline_media_pp_score': offline_media_pp_vars,
    'campaign_worker_pp_score': campaign_worker_pp_vars,
}
reliabilities_score = pd.Series(
    {
        score_column: calculate_factor_reliability(item_columns, ml_df[score_column])
        for score_column, item_columns in reliability_items_score.items()
    },
    dtype=float,
)

reliability_items_mean = {
    'online_media_pp': online_media_pp_vars,
    'voting': voting_vars,
    'offline_media_pp': offline_media_pp_vars,
    'campaign_worker_pp': campaign_worker_pp_vars,
}
reliabilities_mean = pd.Series(
    {
        mean_column: calculate_factor_reliability(item_columns, ml_df[mean_column])
        for mean_column, item_columns in reliability_items_mean.items()
    },
    dtype=float,
)

reliabilities_score, reliabilities_mean

兩種建立指標的方式得出Cronbach's Alpha都接近1，因此指標具有內在信度

## Modeling

In [None]:
# List every column now available for modelling (recoded items plus the indexes added above).
ml_df.columns

### DV ~ anti_party

In [None]:
# OLS models: each participation index regressed on the same predictors, with
# anti-party sentiment entered as the single averaged index `anti_party`.
dependent_variables = ['online_media_pp', 'voting', 'offline_media_pp', 'campaign_worker_pp']
independent_variables = [
    'sex',
    # Categorical with '臺灣人' as the reference level.
    'C(ethnic, Treatment(reference="臺灣人"))',
    'edu',
    'income',
    'political_knowledge',
    'TC_issue',
    'anti_party',
    # Negative-coverage items are categorical with '沒有影響' (no effect) as reference.
    'C(Negative_1, Treatment(reference="沒有影響"))',
    'C(Negative_2, Treatment(reference="沒有影響"))',
    'C(Negative_3, Treatment(reference="沒有影響"))',
]

# The right-hand side is identical for every model, so build it once.
predictors_rhs = ' + '.join(independent_variables)

# Fit one model per dependent variable and print its summary table.
for dep_var in dependent_variables:
    formula = f'{dep_var} ~ {predictors_rhs}'
    model = smf.ols(formula, data=ml_df).fit()
    print(model.summary())

### DV ~ anti_1 + anti_2 + anti_3 + anti_4 + anti_5

In [None]:
# OLS models: same specification as the anti_party models, but with the five
# anti-party items entered separately (anti_1 ... anti_5) instead of their mean.
dependent_variables = ['online_media_pp', 'voting', 'offline_media_pp', 'campaign_worker_pp']
independent_variables = [
    'sex',
    # Categorical with '臺灣人' as the reference level.
    'C(ethnic, Treatment(reference="臺灣人"))',
    'edu',
    'income',
    'political_knowledge',
    'TC_issue',
    'anti_1',
    'anti_2',
    'anti_3',
    'anti_4',
    'anti_5',
    # Negative-coverage items are categorical with '沒有影響' (no effect) as reference.
    'C(Negative_1, Treatment(reference="沒有影響"))',
    'C(Negative_2, Treatment(reference="沒有影響"))',
    'C(Negative_3, Treatment(reference="沒有影響"))',
]

# The right-hand side is identical for every model, so build it once.
item_predictors_rhs = ' + '.join(independent_variables)

# Fit one model per dependent variable and print its summary table.
for dep_var in dependent_variables:
    formula = f'{dep_var} ~ {item_predictors_rhs}'
    model = smf.ols(formula, data=ml_df).fit()
    print(model.summary())