In [7]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from dateutil.relativedelta import relativedelta
import os
import glob
import numpy as np

In [5]:


def extract_iters(ep):
    file_pattern = f'Results/EP{ep}/EP{ep}_Ideal_points_emIRT_Iteration*.csv'

    # Use glob to list all files matching the pattern
    file_list = glob.glob(file_pattern)

    # Initialize an empty list to hold the DataFrames
    df_list = []

    for file in file_list:
        df = pd.read_csv(file)
        column = df[f'EPG....EPG{ep}']
        names = df[f'MepId....names{ep}']
        df = df['d1']

        df_list.append(df)

    combined_df = pd.concat(df_list, axis=0)

    # Calculate the average across the DataFrames
    average_df = combined_df.groupby(combined_df.index).mean()
    column = column.reset_index(drop=True)
    names = names.reset_index(drop=True)
    average_df = average_df.reset_index(drop=True)

    final_df = pd.concat([names, column, average_df], axis=1)

    final_df.columns = ['MepId', 'EPG', 'Average_position']
    return final_df


avg6 = extract_iters(6)
avg7 = extract_iters(7)
avg8 = extract_iters(8)
avg9 = extract_iters(9)
mepinfo6 = pd.read_csv(os.path.join('Cleaned_data', 'EP6_clean_data', 'mep_info_EP_6.csv'))
mepinfo7 = pd.read_csv(os.path.join('Cleaned_data', 'EP7_clean_data', 'mep_info_EP_7.csv'))
mepinfo8 = pd.read_csv(os.path.join('Cleaned_data', 'EP8_clean_data', 'mep_info_EP_8.csv'))
avg7 = avg7.rename(columns={'MepId': 'FullName'})
avg8 = avg8.rename(columns={'MepId': 'FullName'})

avg7 = pd.merge(avg7, mepinfo7, on='FullName', how='left')
avg8 = pd.merge(avg8, mepinfo8, on='FullName', how='left')


def load_vote_data(ep):
    df = pd.read_csv(os.path.join('Results', f'votes_plotly_{ep}.csv'))
    epg_counts = df['MepId'].value_counts()
    epg_4_votes = df[df['Vote'] == 4]['MepId'].value_counts()
    dataframe = pd.DataFrame()
    dataframe['MepId'] = df['MepId'].unique()
    proportions = epg_4_votes / epg_counts
    dataframe['Proportion'] = proportions
    return proportions.reset_index()


proportions6 = load_vote_data(6)
proportions7 = load_vote_data(7)
proportions8 = load_vote_data(8)
proportions9 = load_vote_data(9)
final6 = pd.merge(proportions6, avg6, on='MepId', how='left')
final7 = pd.merge(proportions7, avg7, on='MepId', how='left')
final8 = pd.merge(proportions8, avg8, on='MepId', how='left')
final9 = pd.merge(proportions9, avg9, on='MepId', how='left')



In [8]:
linear7 = final7[['count', 'Average_position', 'Country', 'EPG_x', 'Gender', 'Start', 'Birthday']]

# Convert categorical variables into dummy variables
linear7 = pd.get_dummies(linear7, columns=['Country', 'EPG_x', 'Gender'])

# Function to calculate age based on Start and Birthday columns
def get_age(row):
    date1 = pd.to_datetime(row['Start'])
    date2 = pd.to_datetime(row['Birthday'])
    age = relativedelta(date1, date2).years
    return age

# Apply the age calculation and add it as a new column
linear7['Age'] = linear7.apply(get_age, axis=1)

# Define the target variable y and drop unnecessary columns
y = linear7['count']
linear7.drop(['Start', 'Birthday', 'count'], axis=1, inplace=True)

# Ensure all data is numeric
linear7 = linear7.apply(pd.to_numeric, errors='coerce')
y = pd.to_numeric(y, errors='coerce')

# Drop or fill any NaN values (fill with zero for simplicity)
linear7.fillna(0, inplace=True)
y.fillna(0, inplace=True)

# Split the data into training and testing sets
train7, test7, ytrain7, ytest7 = train_test_split(linear7, y, test_size=0.2, random_state=42)

# Convert to numpy arrays to prevent dtype issues
train7 = np.asarray(train7, dtype=float)
test7 = np.asarray(test7, dtype=float)
ytrain7 = np.asarray(ytrain7, dtype=float)
ytest7 = np.asarray(ytest7, dtype=float)

# Step 2: Add constant term for intercept
train7 = sm.add_constant(train7)
test7 = sm.add_constant(test7)

# Step 3: Fit the OLS model with robust standard errors
# Fit the model on training data
model = sm.OLS(ytrain7, train7)
robust_model = model.fit(cov_type='HC3')  # Use HC3 for robust standard errors

# Step 4: Model Summary
# Print the summary to see coefficients and robust standard errors
print(robust_model.summary())

# Step 5: Predict on Test Set
# Make predictions on the test set
ypred = robust_model.predict(test7)

# Step 6: Evaluate the Model
# Calculate Mean Squared Error and R-squared for the test set
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(ytest7, ypred)
r2 = r2_score(ytest7, ypred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.119
Model:                            OLS   Adj. R-squared:                  0.068
Method:                 Least Squares   F-statistic:                     46.56
Date:                Mon, 11 Nov 2024   Prob (F-statistic):          2.52e-158
Time:                        11:13:22   Log-Likelihood:                 604.69
No. Observations:                 682   AIC:                            -1133.
Df Residuals:                     644   BIC:                            -961.4
Df Model:                          37                                         
Covariance Type:                  HC3                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0921      0.014      6.584      0.0



In [9]:
robust_model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.119
Model:,OLS,Adj. R-squared:,0.068
Method:,Least Squares,F-statistic:,46.56
Date:,"Mon, 11 Nov 2024",Prob (F-statistic):,2.52e-158
Time:,11:14:17,Log-Likelihood:,604.69
No. Observations:,682,AIC:,-1133.0
Df Residuals:,644,BIC:,-961.4
Df Model:,37,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.0921,0.014,6.584,0.000,0.065,0.119
x1,0.0022,0.007,0.301,0.763,-0.012,0.016
x2,-0.0640,0.016,-3.984,0.000,-0.095,-0.033
x3,-0.0003,0.033,-0.008,0.993,-0.064,0.064
x4,-0.0084,0.021,-0.406,0.685,-0.049,0.032
x5,-0.0788,0.008,-9.353,0.000,-0.095,-0.062
x6,0.0439,0.037,1.186,0.236,-0.029,0.116
x7,0.0066,0.022,0.303,0.762,-0.036,0.049
x8,-0.0037,0.014,-0.262,0.793,-0.032,0.024

0,1,2,3
Omnibus:,220.743,Durbin-Watson:,1.896
Prob(Omnibus):,0.0,Jarque-Bera (JB):,779.718
Skew:,1.509,Prob(JB):,4.8599999999999996e-170
Kurtosis:,7.282,Cond. No.,9.74e+16
