In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np

In [5]:
df5 = pd.read_csv('df5.csv')

In [6]:
print(df5.head()) 

   Year      Country  Total_Medals  Host_or_Not Country_Code   GDP_PCA  \
0  2008  Afghanistan             1            0          AFG -0.484561   
1  2012  Afghanistan             1            0          AFG -0.478765   
2  1992      Algeria             2            0          ALG -0.434253   
3  1996      Algeria             3            0          ALG -0.443431   
4  2000      Algeria             5            0          ALG -0.437493   

   population_PCA  income_PCA  athletes_PCA  population_growth_zscore  \
0       -0.286233   -1.237388     -1.416809                  0.932791   
1       -0.262288   -1.216370     -1.382949                  2.676930   
2       -0.260524   -1.122219     -0.982663                  1.202921   
3       -0.250034   -1.144281     -0.904040                  0.731920   
4       -0.243021   -1.134961     -0.745505                  0.428501   

   GDP_growth_zscore  HDI_zscore  Region_Europe and Central Asia  \
0           0.334188   -2.382543                

## ML Model 
### Y Normalized

In [14]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from scipy.stats import zscore

# Assuming df3 is your DataFrame containing all data

# Z-score normalization of the target variable
df5['Total_Medals_Z'] = zscore(df5['Total_Medals'])

# Define train and test data based on the year
train_data = df5[(df5['Year'] >= 1992) & (df5['Year'] <= 2016)]
test_data = df5[df5['Year'] == 2020]

# Define features and target variable for training
X_train = train_data.drop(columns=['Total_Medals', 'Total_Medals_Z', 'Year', 'Country', 'Country_Code'])
y_train = train_data['Total_Medals_Z']

# Initialize the Linear Regression model
model = LinearRegression()

# Fit the model
model.fit(X_train, y_train)

# Define features for testing
X_test = test_data.drop(columns=['Total_Medals', 'Total_Medals_Z', 'Year', 'Country', 'Country_Code'])

# Make predictions
test_predictions_z = model.predict(X_test)

# Convert predictions back to original scale using the mean and std from the training data
mean_total_medals = train_data['Total_Medals'].mean()
std_total_medals = train_data['Total_Medals'].std()
predictions = mean_total_medals + (test_predictions_z * std_total_medals)

# Ensure predictions are non-negative integers
predictions = np.maximum(predictions, 0)  # Set minimum value to 0
predictions = predictions.astype(int)      # Convert to integers

# Add predictions to the test_data DataFrame
test_data['Predicted_Medals'] = predictions

# Select relevant columns for final output
final_output = test_data[['Country', 'Predicted_Medals', 'Total_Medals']]
print(final_output)

# Calculate R-squared and Adjusted R-squared for the test set
y_test_actual = test_data['Total_Medals']
r_squared_test = r2_score(y_test_actual, predictions)
adjusted_r_squared_test = 1 - (1 - r_squared_test) * (len(y_test_actual) - 1) / (len(y_test_actual) - X_test.shape[1] - 1)

# Calculate Standard Error for the test set
standard_error_test = np.sqrt(mean_squared_error(y_test_actual, predictions))

# Calculate MSE for the training set
train_predictions = model.predict(X_train)
train_mse = mean_squared_error(y_train, train_predictions)

# Calculate R-squared for the training set
train_r_squared = r2_score(y_train, train_predictions)

# Print performance metrics
print(f'Test Set Multiple R: {np.sqrt(r_squared_test)}')  # R is the square root of R²
print(f'Test Set R-squared (R²): {r_squared_test}')
print(f'Test Set Adjusted R-squared: {adjusted_r_squared_test}')
print(f'Test Set Standard Error: {standard_error_test}')
print(f'Training Set Mean Squared Error (MSE): {train_mse}')

            Country  Predicted_Medals  Total_Medals
535   United States               112           113
536           China                91            88
537           Japan                54            58
538  United kingdom                35            65
539          Russia                35            71
..              ...               ...           ...
621           Ghana                 0             1
622         Grenada                 0             1
623          Kuwait                 0             1
624         Moldova                 0             1
625           Syria                 0             1

[91 rows x 3 columns]
Test Set Multiple R: 0.9283165192587096
Test Set R-squared (R²): 0.8617715599286062
Test Set Adjusted R-squared: 0.8363084262312441
Test Set Standard Error: 7.118309467164916
Training Set Mean Squared Error (MSE): 0.16008320889128913


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predicted_Medals'] = predictions


### Random Forset

In [15]:
# Z-score normalization of the target variable
df5['Total_Medals_Z'] = zscore(df5['Total_Medals'])

# Define train and test data based on the year
train_data = df5[(df5['Year'] >= 1992) & (df5['Year'] <= 2016)]
test_data = df5[df5['Year'] == 2020]

# Define features and target variable for training
X_train = train_data.drop(columns=['Total_Medals', 'Total_Medals_Z', 'Year', 'Country', 'Country_Code'])
y_train = train_data['Total_Medals_Z']

# Initialize the Linear Regression model
model = RandomForestRegressor(random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Define features for testing
X_test = test_data.drop(columns=['Total_Medals', 'Total_Medals_Z', 'Year', 'Country', 'Country_Code'])

# Make predictions
test_predictions_z = model.predict(X_test)

# Convert predictions back to original scale using the mean and std from the training data
mean_total_medals = train_data['Total_Medals'].mean()
std_total_medals = train_data['Total_Medals'].std()
predictions = mean_total_medals + (test_predictions_z * std_total_medals)

# Ensure predictions are non-negative integers
predictions = np.maximum(predictions, 0)  # Set minimum value to 0
predictions = predictions.astype(int)      # Convert to integers

# Add predictions to the test_data DataFrame
test_data['Predicted_Medals'] = predictions

# Select relevant columns for final output
final_output = test_data[['Country', 'Predicted_Medals', 'Total_Medals']]
print(final_output)

# Calculate R-squared and Adjusted R-squared for the test set
y_test_actual = test_data['Total_Medals']
r_squared_test = r2_score(y_test_actual, predictions)
adjusted_r_squared_test = 1 - (1 - r_squared_test) * (len(y_test_actual) - 1) / (len(y_test_actual) - X_test.shape[1] - 1)

# Calculate Standard Error for the test set
standard_error_test = np.sqrt(mean_squared_error(y_test_actual, predictions))

# Calculate MSE for the training set
train_predictions = model.predict(X_train)
train_mse = mean_squared_error(y_train, train_predictions)

# Calculate R-squared for the training set
train_r_squared = r2_score(y_train, train_predictions)

# Print performance metrics
print(f'Test Set Multiple R: {np.sqrt(r_squared_test)}')  # R is the square root of R²
print(f'Test Set R-squared (R²): {r_squared_test}')
print(f'Test Set Adjusted R-squared: {adjusted_r_squared_test}')
print(f'Test Set Standard Error: {standard_error_test}')
print(f'Training Set Mean Squared Error (MSE): {train_mse}')

            Country  Predicted_Medals  Total_Medals
535   United States               111           113
536           China                73            88
537           Japan                79            58
538  United kingdom                42            65
539          Russia                44            71
..              ...               ...           ...
621           Ghana                 1             1
622         Grenada                 2             1
623          Kuwait                 2             1
624         Moldova                 2             1
625           Syria                 1             1

[91 rows x 3 columns]
Test Set Multiple R: 0.9467978128213723
Test Set R-squared (R²): 0.8964260983633343
Test Set Adjusted R-squared: 0.8773466954302643
Test Set Standard Error: 6.161739443293019
Training Set Mean Squared Error (MSE): 0.015145438380448616


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predicted_Medals'] = predictions


### Lasso

In [16]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Z-score normalization of the target variable
df5['Total_Medals_Z'] = zscore(df5['Total_Medals'])

# Define train and test data based on the year
train_data = df5[(df5['Year'] >= 1992) & (df5['Year'] <= 2016)]
test_data = df5[df5['Year'] == 2020]

# Define features and target variable for training
X_train = train_data.drop(columns=['Total_Medals', 'Total_Medals_Z', 'Year', 'Country', 'Country_Code'])
y_train = train_data['Total_Medals_Z']

# Initialize the Lasso Regression model with alpha parameter for regularization
lasso_model = Lasso(alpha=0.01, random_state=42)

# Fit the model
lasso_model.fit(X_train, y_train)

# Define features for testing
X_test = test_data.drop(columns=['Total_Medals', 'Total_Medals_Z', 'Year', 'Country', 'Country_Code'])

# Make predictions
test_predictions_z = lasso_model.predict(X_test)

# Convert predictions back to original scale using the mean and std from the training data
mean_total_medals = train_data['Total_Medals'].mean()
std_total_medals = train_data['Total_Medals'].std()
predictions = mean_total_medals + (test_predictions_z * std_total_medals)

# Ensure predictions are non-negative integers
predictions = np.maximum(predictions, 0)  # Set minimum value to 0
predictions = predictions.astype(int)      # Convert to integers

# Add predictions to the test_data DataFrame
test_data['Predicted_Medals'] = predictions

# Select relevant columns for final output
final_output = test_data[['Country', 'Predicted_Medals', 'Total_Medals']]
print(final_output)

# Calculate R-squared and Adjusted R-squared for the test set
y_test_actual = test_data['Total_Medals']
r_squared_test = r2_score(y_test_actual, predictions)
adjusted_r_squared_test = 1 - (1 - r_squared_test) * (len(y_test_actual) - 1) / (len(y_test_actual) - X_test.shape[1] - 1)

# Calculate Standard Error for the test set
standard_error_test = np.sqrt(mean_squared_error(y_test_actual, predictions))

# Calculate MSE for the training set
train_predictions = lasso_model.predict(X_train)
train_mse = mean_squared_error(y_train, train_predictions)

# Calculate R-squared for the training set
train_r_squared = r2_score(y_train, train_predictions)

# Print performance metrics
print(f'Test Set Multiple R: {np.sqrt(r_squared_test)}')  # R is the square root of R²
print(f'Test Set R-squared (R²): {r_squared_test}')
print(f'Test Set Adjusted R-squared: {adjusted_r_squared_test}')
print(f'Test Set Standard Error: {standard_error_test}')
print(f'Training Set Mean Squared Error (MSE): {train_mse}')


            Country  Predicted_Medals  Total_Medals
535   United States               111           113
536           China                84            88
537           Japan                57            58
538  United kingdom                35            65
539          Russia                34            71
..              ...               ...           ...
621           Ghana                 0             1
622         Grenada                 0             1
623          Kuwait                 0             1
624         Moldova                 0             1
625           Syria                 0             1

[91 rows x 3 columns]
Test Set Multiple R: 0.9272825726634993
Test Set R-squared (R²): 0.8598529695654379
Test Set Adjusted R-squared: 0.8340364113274923
Test Set Standard Error: 7.167539771332027
Training Set Mean Squared Error (MSE): 0.1678757206168559


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predicted_Medals'] = predictions


### Ridge

In [17]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Z-score normalization of the target variable
df5['Total_Medals_Z'] = zscore(df5['Total_Medals'])

# Define train and test data based on the year
train_data = df5[(df5['Year'] >= 1992) & (df5['Year'] <= 2016)]
test_data = df5[df5['Year'] == 2020]

# Define features and target variable for training
X_train = train_data.drop(columns=['Total_Medals', 'Total_Medals_Z', 'Year', 'Country', 'Country_Code'])
y_train = train_data['Total_Medals_Z']

# Initialize the Ridge Regression model with alpha parameter for regularization
ridge_model = Ridge(alpha=1.0, random_state=42)

# Fit the model
ridge_model.fit(X_train, y_train)

# Define features for testing
X_test = test_data.drop(columns=['Total_Medals', 'Total_Medals_Z', 'Year', 'Country', 'Country_Code'])

# Make predictions
test_predictions_z = ridge_model.predict(X_test)

# Convert predictions back to original scale using the mean and std from the training data
mean_total_medals = train_data['Total_Medals'].mean()
std_total_medals = train_data['Total_Medals'].std()
predictions = mean_total_medals + (test_predictions_z * std_total_medals)

# Ensure predictions are non-negative integers
predictions = np.maximum(predictions, 0)  # Set minimum value to 0
predictions = predictions.astype(int)      # Convert to integers

# Add predictions to the test_data DataFrame
test_data['Predicted_Medals'] = predictions

# Select relevant columns for final output
final_output = test_data[['Country', 'Predicted_Medals', 'Total_Medals']]
print(final_output)

# Calculate R-squared and Adjusted R-squared for the test set
y_test_actual = test_data['Total_Medals']
r_squared_test = r2_score(y_test_actual, predictions)
adjusted_r_squared_test = 1 - (1 - r_squared_test) * (len(y_test_actual) - 1) / (len(y_test_actual) - X_test.shape[1] - 1)

# Calculate Standard Error for the test set
standard_error_test = np.sqrt(mean_squared_error(y_test_actual, predictions))

# Calculate MSE for the training set
train_predictions = ridge_model.predict(X_train)
train_mse = mean_squared_error(y_train, train_predictions)

# Calculate R-squared for the training set
train_r_squared = r2_score(y_train, train_predictions)

# Print performance metrics
print(f'Test Set Multiple R: {np.sqrt(r_squared_test)}')  # R is the square root of R²
print(f'Test Set R-squared (R²): {r_squared_test}')
print(f'Test Set Adjusted R-squared: {adjusted_r_squared_test}')
print(f'Test Set Standard Error: {standard_error_test}')
print(f'Training Set Mean Squared Error (MSE): {train_mse}')


            Country  Predicted_Medals  Total_Medals
535   United States               112           113
536           China                90            88
537           Japan                54            58
538  United kingdom                35            65
539          Russia                35            71
..              ...               ...           ...
621           Ghana                 0             1
622         Grenada                 0             1
623          Kuwait                 0             1
624         Moldova                 0             1
625           Syria                 0             1

[91 rows x 3 columns]
Test Set Multiple R: 0.9277835344371
Test Set R-squared (R²): 0.8607822867725976
Test Set Adjusted R-squared: 0.8351369185464972
Test Set Standard Error: 7.143736209643175
Training Set Mean Squared Error (MSE): 0.1601939155730181


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predicted_Medals'] = predictions


In [19]:
df5

Unnamed: 0,Year,Country,Total_Medals,Host_or_Not,Country_Code,GDP_PCA,population_PCA,income_PCA,athletes_PCA,population_growth_zscore,GDP_growth_zscore,HDI_zscore,Region_Europe and Central Asia,Region_Latin America and Caribbean,Region_Middle East and North Africa,Region_North America,Region_South Asia,Region_Sub-Saharan Africa,Total_Medals_Z
0,2008,Afghanistan,1,0,AFG,-0.484561,-0.286233,-1.237388,-1.416809,0.932791,0.334188,-2.382543,0,0,0,0,1,0,-0.570710
1,2012,Afghanistan,1,0,AFG,-0.478765,-0.262288,-1.216370,-1.382949,2.676930,2.167613,-2.043928,0,0,0,0,1,0,-0.570710
2,1992,Algeria,2,0,ALG,-0.434253,-0.260524,-1.122219,-0.982663,1.202921,-0.107168,-1.253826,0,0,1,0,0,0,-0.517059
3,1996,Algeria,3,0,ALG,-0.443431,-0.250034,-1.144281,-0.904040,0.731920,0.370539,-1.095805,0,0,1,0,0,0,-0.463407
4,2000,Algeria,5,0,ALG,-0.437493,-0.243021,-1.134961,-0.745505,0.428501,0.308230,-0.824913,0,0,1,0,0,0,-0.356105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621,2020,Ghana,1,0,GHA,-0.448708,-0.259768,-1.095365,-1.338185,0.986527,-0.374281,-1.073231,0,0,0,0,0,1,-0.570710
622,2020,Grenada,1,0,GRD,-0.490925,-0.476145,-0.632423,-1.428286,-0.109432,-3.338263,0.190932,0,1,0,0,0,0,-0.570710
623,2020,Kuwait,1,0,KWT,-0.425783,-0.447548,0.969566,-1.383236,-2.290375,-1.576435,0.394101,0,0,1,0,0,0,-0.570710
624,2020,Moldova,1,0,MDA,-0.484508,-0.459193,-0.927172,-1.196867,-1.672838,-2.199939,0.115684,1,0,0,0,0,0,-0.570710


### Weighted LR

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Z-score normalization of the target variable
df5['Total_Medals_Z'] = zscore(df5['Total_Medals'])

# Define train and test data based on the year
train_data = df5[(df5['Year'] >= 1992) & (df5['Year'] <= 2016)]
test_data = df5[df5['Year'] == 2020]

# Define features and target variable for training
X_train = train_data.drop(columns=['Total_Medals', 'Total_Medals_Z', 'Year', 'Country', 'Country_Code'])
y_train = train_data['Total_Medals_Z']

# Define weights (for example, you can use Total_Athletes as weights)
weights = train_data['Total_Medals']

# Initialize the Weighted Linear Regression model
weighted_lr_model = LinearRegression()

# Fit the model with sample weights
weighted_lr_model.fit(X_train, y_train, sample_weight=weights)

# Define features for testing
X_test = test_data.drop(columns=['Total_Medals', 'Total_Medals_Z', 'Year', 'Country', 'Country_Code'])

# Make predictions
test_predictions_z = weighted_lr_model.predict(X_test)

# Convert predictions back to original scale using the mean and std from the training data
mean_total_medals = train_data['Total_Medals'].mean()
std_total_medals = train_data['Total_Medals'].std()
predictions = mean_total_medals + (test_predictions_z * std_total_medals)

# Ensure predictions are non-negative integers
predictions = np.maximum(predictions, 0)  # Set minimum value to 0
predictions = predictions.astype(int)      # Convert to integers

# Add predictions to the test_data DataFrame
test_data['Predicted_Medals'] = predictions

# Select relevant columns for final output
final_output = test_data[['Country', 'Predicted_Medals', 'Total_Medals']]
print(final_output)

# Calculate R-squared and Adjusted R-squared for the test set
y_test_actual = test_data['Total_Medals']
r_squared_test = r2_score(y_test_actual, predictions)
adjusted_r_squared_test = 1 - (1 - r_squared_test) * (len(y_test_actual) - 1) / (len(y_test_actual) - X_test.shape[1] - 1)

# Calculate Standard Error for the test set
standard_error_test = np.sqrt(mean_squared_error(y_test_actual, predictions))

# Calculate MSE for the training set
train_predictions = weighted_lr_model.predict(X_train)
train_mse = mean_squared_error(y_train, train_predictions)

# Calculate R-squared for the training set
train_r_squared = r2_score(y_train, train_predictions)

# Print performance metrics
print(f'Test Set Multiple R: {np.sqrt(r_squared_test)}')  # R is the square root of R²
print(f'Test Set R-squared (R²): {r_squared_test}')
print(f'Test Set Adjusted R-squared: {adjusted_r_squared_test}')
print(f'Test Set Standard Error: {standard_error_test}')
print(f'Training Set Mean Squared Error (MSE): {train_mse}')


            Country  Predicted_Medals  Total_Medals
535   United States               115           113
536           China                92            88
537           Japan                52            58
538  United kingdom                36            65
539          Russia                44            71
..              ...               ...           ...
621           Ghana                 0             1
622         Grenada                 0             1
623          Kuwait                 0             1
624         Moldova                 0             1
625           Syria                 0             1

[91 rows x 3 columns]
Test Set Multiple R: 0.9195724178735452
Test Set R-squared (R²): 0.8456134317137979
Test Set Adjusted R-squared: 0.817173800713708
Test Set Standard Error: 7.522858937492221
Training Set Mean Squared Error (MSE): 0.2571602890976294


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predicted_Medals'] = predictions


In [29]:
final_Output_RF=final_output.copy()

In [30]:


# Assuming final_output is already defined
final_Output_RF = test_data[['Country', 'Predicted_Medals', 'Total_Medals']]

# Define the file path
file_path = r'C:\Users\Asus\Documents\Summer Olympics\final_Output_RF.xlsx'

# Save the DataFrame to an Excel file
final_Output_RF.to_excel(file_path, index=False)

print(f"Data saved to {file_path}")


Data saved to C:\Users\Asus\Documents\Summer Olympics\final_Output_RF.xlsx


### Poisson regression

In [25]:
import statsmodels.api as sm
import pandas as pd
import numpy as np

# Z-score normalization of the target variable
df5['Total_Medals_Z'] = zscore(df5['Total_Medals'])

# Define train and test data based on the year
train_data = df5[(df5['Year'] >= 1992) & (df5['Year'] <= 2016)]
test_data = df5[df5['Year'] == 2020]

# Define features for training
X_train = train_data.drop(columns=['Total_Medals', 'Total_Medals_Z', 'Year', 'Country', 'Country_Code'])
X_train = sm.add_constant(X_train)  # Adds a constant term for the intercept
y_train = train_data['Total_Medals']

# Fit the Poisson regression model
poisson_model = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()

# Define features for testing
X_test = test_data.drop(columns=['Total_Medals', 'Total_Medals_Z', 'Year', 'Country', 'Country_Code'])
X_test = sm.add_constant(X_test)

# Make predictions
test_predictions = poisson_model.predict(X_test)

# Ensure predictions are non-negative integers
predictions = np.maximum(test_predictions, 0)  # Set minimum value to 0
predictions = np.round(predictions).astype(int)  # Convert to integers

# Add predictions to the test_data DataFrame
test_data['Predicted_Medals'] = predictions

# Select relevant columns for final output
final_output = test_data[['Country', 'Predicted_Medals', 'Total_Medals']]
print(final_output)

# Calculate R-squared and Adjusted R-squared for the test set
y_test_actual = test_data['Total_Medals']
r_squared_test = 1 - (np.sum((y_test_actual - predictions) ** 2) / np.sum((y_test_actual - np.mean(y_test_actual)) ** 2))
adjusted_r_squared_test = 1 - (1 - r_squared_test) * (len(y_test_actual) - 1) / (len(y_test_actual) - X_test.shape[1] - 1)

# Calculate Standard Error for the test set
standard_error_test = np.sqrt(mean_squared_error(y_test_actual, predictions))

# Calculate MSE for the training set
train_predictions = poisson_model.predict(X_train)
train_mse = mean_squared_error(y_train, train_predictions)

# Calculate R-squared for the training set
train_r_squared = 1 - (np.sum((y_train - train_predictions) ** 2) / np.sum((y_train - np.mean(y_train)) ** 2))

# Print performance metrics
print(f'Test Set Multiple R: {np.sqrt(r_squared_test)}')  # R is the square root of R²
print(f'Test Set R-squared (R²): {r_squared_test}')
print(f'Test Set Adjusted R-squared: {adjusted_r_squared_test}')
print(f'Test Set Standard Error: {standard_error_test}')
print(f'Training Set Mean Squared Error (MSE): {train_mse}')


  endog_mu = self._clean(endog / mu)
  return 1. / (self.link.deriv(mu)**2 * self.variance(mu))


ValueError: NaN, inf or invalid value detected in weights, estimation infeasible.