In [5]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
from scipy import stats
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load the dataset
file_path = r'C:\Users\Kasper\OneDrive - Aarhus universitet\Dokumenter\Kandidat\2. Semester\Data Science Project\Data science project part 2\Football_data.csv'
football_data = pd.read_csv(file_path)

# Define independent variables (X) and dependent variable (y)
X = football_data.drop(columns=['FTR', 'Date', 'Day', 'Time', 'Home', 'Away', 'Season', 'Wk'])
y = football_data['FTR'].astype('category').cat.codes

# Add constant
X = add_constant(X, prepend=False)

# Standardize the data
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Function to calculate VIF
def calculate_vif(X):
    vif_data = pd.DataFrame()
    vif_data["feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

# Iteratively remove features with high VIF
def remove_high_vif(X, threshold=10):
    vif_data = calculate_vif(X)
    print(vif_data)
    while vif_data['VIF'].max() > threshold:
        feature_to_remove = vif_data.sort_values('VIF', ascending=False)['feature'].iloc[0]
        print(f"Removing {feature_to_remove} with VIF={vif_data['VIF'].max()}")
        X = X.drop(columns=[feature_to_remove])
        vif_data = calculate_vif(X)
        print(vif_data)
    return X

# Remove features with high VIF
X_scaled_reduced = remove_high_vif(X_scaled)

# Fit the full model with the reduced dataset
model_full = sm.MNLogit(y, X_scaled_reduced)
result_full = model_full.fit(method='lbfgs', maxiter=1000, disp=False)

# Fit the reduced model with one category removed (e.g., remove 'A')
y_reduced = y[y != 2]  # Assuming 'A' is coded as 2
X_reduced = X_scaled_reduced.loc[y_reduced.index]
model_reduced = sm.MNLogit(y_reduced, X_reduced)
result_reduced = model_reduced.fit(method='lbfgs', maxiter=1000, disp=False)

# Hausman-McFadden test
def hausman_mcfadden_test(result_full, result_reduced):
    b_full = result_full.params
    b_reduced = result_reduced.params
    cov_full = result_full.cov_params()
    cov_reduced = result_reduced.cov_params()
    
    # Align dimensions
    b_full = b_full.iloc[:, :-1]  # Remove the last column
    b_reduced = b_reduced.values.flatten()  # Flatten the reduced coefficients
    
    # Align covariance matrices
    cov_full = cov_full.iloc[:-1, :-1]  # Remove the last row and column
    cov_reduced = cov_reduced.values  # Convert to array
    
    # Calculate the difference in coefficients and covariance
    diff = b_full.values.flatten() - b_reduced
    cov_diff = cov_full.values - cov_reduced
    
    # Calculate the test statistic
    test_stat = diff.T @ np.linalg.inv(cov_diff) @ diff
    df = len(diff)
    
    p_value = 1 - stats.chi2.cdf(test_stat, df)
    
    return test_stat, p_value

# Perform the test
test_stat, p_value = hausman_mcfadden_test(result_full, result_reduced)
print(f"Hausman-McFadden Test Statistic: {test_stat}")
print(f"P-value: {p_value}")

# Interpretation
if p_value < 0.05:
    print("The IIA assumption is likely violated (reject null hypothesis).")
else:
    print("The IIA assumption is not violated (fail to reject null hypothesis).")


  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss
  vif = 1. / (1. - r_squared_i)


                      feature         VIF
0                  Attendance    2.464000
1                       B365H    7.505543
2                       B365D    7.600829
3                       B365A   11.557399
4          AttackStrengthHome    8.264137
5          AttackStrengthAway    7.162130
6         DefenseWeaknessHome    6.324634
7         DefenseWeaknessAway    8.829224
8               AvgHomePoints         inf
9               AvgAwayPoints         inf
10        AvgLosingHomePoints    6.698050
11        AvgLosingAwayPoints    6.740975
12            AvgGoalDiffHome   13.290070
13            AvgGoalDiffAway   16.753331
14              HomeWinsRatio         inf
15             HomeDrawsRatio         inf
16              AwayWinsRatio         inf
17             AwayDrawsRatio         inf
18        AvgHomeCornersLast5    2.106094
19        AvgAwayCornersLast5    1.672472
20          AvgHomeShotsLast5    4.114385
21  AvgHomeShotsOnTargetLast5    3.636302
22          AvgAwayShotsLast5    3

  return 1 - self.ssr/self.uncentered_tss


                      feature         VIF
0                  Attendance    2.464000
1                       B365H    7.505543
2                       B365D    7.600829
3                       B365A   11.557399
4          AttackStrengthHome    8.264137
5          AttackStrengthAway    7.162130
6         DefenseWeaknessHome    6.324634
7         DefenseWeaknessAway    8.829224
8               AvgHomePoints   25.110633
9               AvgAwayPoints         inf
10        AvgLosingHomePoints    6.698050
11        AvgLosingAwayPoints    6.740975
12            AvgGoalDiffHome   13.290070
13            AvgGoalDiffAway   16.753331
14              HomeWinsRatio   21.046871
15              AwayWinsRatio         inf
16             AwayDrawsRatio         inf
17        AvgHomeCornersLast5    2.106094
18        AvgAwayCornersLast5    1.672472
19          AvgHomeShotsLast5    4.114385
20  AvgHomeShotsOnTargetLast5    3.636302
21          AvgAwayShotsLast5    3.464070
22  AvgAwayShotsOnTargetLast5    3

  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss


                      feature         VIF
0                  Attendance    2.464000
1                       B365H    7.505543
2                       B365D    7.600829
3                       B365A   11.557399
4          AttackStrengthHome    8.264137
5          AttackStrengthAway    7.162130
6         DefenseWeaknessHome    6.324634
7         DefenseWeaknessAway    8.829224
8               AvgHomePoints   25.110633
9               AvgAwayPoints    6.301852
10        AvgLosingHomePoints    6.698050
11        AvgLosingAwayPoints    6.740975
12            AvgGoalDiffHome   13.290070
13            AvgGoalDiffAway   16.753331
14              HomeWinsRatio   21.046871
15             AwayDrawsRatio    1.422245
16        AvgHomeCornersLast5    2.106094
17        AvgAwayCornersLast5    1.672472
18          AvgHomeShotsLast5    4.114385
19  AvgHomeShotsOnTargetLast5    3.636302
20          AvgAwayShotsLast5    3.464070
21  AvgAwayShotsOnTargetLast5    3.364547
22                        elo  105

  return 1 - self.ssr/self.uncentered_tss


                      feature         VIF
0                  Attendance    2.464000
1                       B365H    7.505543
2                       B365D    7.600829
3                       B365A   11.557399
4          AttackStrengthHome    8.264137
5          AttackStrengthAway    7.162130
6         DefenseWeaknessHome    6.324634
7         DefenseWeaknessAway    8.829224
8               AvgHomePoints   25.110633
9               AvgAwayPoints    6.301852
10        AvgLosingHomePoints    6.698050
11        AvgLosingAwayPoints    6.740975
12            AvgGoalDiffHome   13.290070
13            AvgGoalDiffAway   16.753331
14              HomeWinsRatio   21.046871
15             AwayDrawsRatio    1.422245
16        AvgHomeCornersLast5    2.106094
17        AvgAwayCornersLast5    1.672472
18          AvgHomeShotsLast5    4.114385
19  AvgHomeShotsOnTargetLast5    3.636302
20          AvgAwayShotsLast5    3.464070
21  AvgAwayShotsOnTargetLast5    3.364547
22                        elo  105

  return 1 - self.ssr/self.uncentered_tss


                      feature        VIF
0                  Attendance   2.463750
1                       B365H   7.360435
2                       B365D   7.496011
3                       B365A  11.556077
4          AttackStrengthHome   8.256924
5          AttackStrengthAway   7.154012
6         DefenseWeaknessHome   6.320230
7         DefenseWeaknessAway   8.829219
8               AvgHomePoints  25.110633
9               AvgAwayPoints   6.300877
10        AvgLosingHomePoints   6.696354
11        AvgLosingAwayPoints   6.735314
12            AvgGoalDiffHome  13.288537
13            AvgGoalDiffAway  16.734852
14              HomeWinsRatio  21.046256
15             AwayDrawsRatio   1.419602
16        AvgHomeCornersLast5   2.097566
17        AvgAwayCornersLast5   1.671206
18          AvgHomeShotsLast5   4.114136
19  AvgHomeShotsOnTargetLast5   3.634056
20          AvgAwayShotsLast5   3.463983
21  AvgAwayShotsOnTargetLast5   3.357305
22                        elo   6.465207
23              

  return 1 - self.ssr/self.uncentered_tss


                      feature        VIF
0                  Attendance   2.463318
1                       B365H   7.343503
2                       B365D   7.476593
3                       B365A  11.553180
4          AttackStrengthHome   8.032943
5          AttackStrengthAway   7.149855
6         DefenseWeaknessHome   6.224090
7         DefenseWeaknessAway   8.825847
8               AvgAwayPoints   6.104193
9         AvgLosingHomePoints   6.287247
10        AvgLosingAwayPoints   6.579581
11            AvgGoalDiffHome  12.485006
12            AvgGoalDiffAway  16.715698
13              HomeWinsRatio   5.809825
14             AwayDrawsRatio   1.419521
15        AvgHomeCornersLast5   2.097493
16        AvgAwayCornersLast5   1.670182
17          AvgHomeShotsLast5   4.114102
18  AvgHomeShotsOnTargetLast5   3.621076
19          AvgAwayShotsLast5   3.463913
20  AvgAwayShotsOnTargetLast5   3.356233
21                        elo   6.464957
22                   elo_away   5.208143
23              

  return 1 - self.ssr/self.uncentered_tss


                      feature        VIF
0                  Attendance   2.461845
1                       B365H   7.331241
2                       B365D   7.465117
3                       B365A  11.502551
4          AttackStrengthHome   7.891380
5          AttackStrengthAway   3.625575
6         DefenseWeaknessHome   6.072580
7         DefenseWeaknessAway   3.736342
8               AvgAwayPoints   5.356570
9         AvgLosingHomePoints   6.275258
10        AvgLosingAwayPoints   6.005693
11            AvgGoalDiffHome  12.261708
12              HomeWinsRatio   5.805699
13             AwayDrawsRatio   1.367501
14        AvgHomeCornersLast5   2.096589
15        AvgAwayCornersLast5   1.667401
16          AvgHomeShotsLast5   4.113916
17  AvgHomeShotsOnTargetLast5   3.620225
18          AvgAwayShotsLast5   3.460795
19  AvgAwayShotsOnTargetLast5   3.320405
20                        elo   6.431815
21                   elo_away   5.178582
22               FormHomeTeam   2.941292
23              

  return 1 - self.ssr/self.uncentered_tss


                      feature        VIF
0                  Attendance   2.455571
1                       B365H   7.330593
2                       B365D   7.458769
3                       B365A  11.500952
4          AttackStrengthHome   3.690747
5          AttackStrengthAway   3.614092
6         DefenseWeaknessHome   3.151549
7         DefenseWeaknessAway   3.729041
8               AvgAwayPoints   5.334191
9         AvgLosingHomePoints   5.933509
10        AvgLosingAwayPoints   5.864766
11              HomeWinsRatio   5.687682
12             AwayDrawsRatio   1.362632
13        AvgHomeCornersLast5   2.092755
14        AvgAwayCornersLast5   1.667385
15          AvgHomeShotsLast5   4.110119
16  AvgHomeShotsOnTargetLast5   3.598920
17          AvgAwayShotsLast5   3.459581
18  AvgAwayShotsOnTargetLast5   3.311371
19                        elo   6.410512
20                   elo_away   5.156637
21               FormHomeTeam   2.940221
22               FormAwayTeam   3.111334
23            Pr

  return 1 - self.ssr/self.uncentered_tss


                      feature       VIF
0                  Attendance  2.449854
1                       B365H  5.343129
2                       B365D  3.401169
3          AttackStrengthHome  3.689614
4          AttackStrengthAway  3.594006
5         DefenseWeaknessHome  3.130801
6         DefenseWeaknessAway  3.729030
7               AvgAwayPoints  5.325773
8         AvgLosingHomePoints  5.901690
9         AvgLosingAwayPoints  5.863546
10              HomeWinsRatio  5.687119
11             AwayDrawsRatio  1.362482
12        AvgHomeCornersLast5  2.088351
13        AvgAwayCornersLast5  1.667130
14          AvgHomeShotsLast5  4.103884
15  AvgHomeShotsOnTargetLast5  3.577543
16          AvgAwayShotsLast5  3.458977
17  AvgAwayShotsOnTargetLast5  3.277448
18                        elo  6.408785
19                   elo_away  5.030803
20               FormHomeTeam  2.914949
21               FormAwayTeam  3.104744
22            ProbabilityDraw  4.904364
23                      const       NaN


ValueError: need covariance of parameters for computing (unnormalized) covariances