# Code for the regression for the effect of key moments on the flow of momentum

In [None]:
#Import pitch control and PCV file
import pickle
import pandas as pd
import statsmodels.api as sm
import numpy as np
from progressbar import ProgressBar
#NOTE: pickle.load can execute arbitrary code, only load this file from a trusted source
#the context manager closes the file even when unpickling raises
with open('input PC regression key moments on Momentum.p','rb') as infile:
    df_keys = pickle.load(infile)

In [None]:
#create three additional columns for binary values for the type of key moment
#each column is 1 where 'key_moment' equals the column name and 0 otherwise;
#the comparison is done on the whole column instead of row by row with .iloc
for moment_type in ('favorable', 'unfavorable', 'neutral'):
    df_keys[moment_type] = (df_keys['key_moment'] == moment_type).astype('int64')
    

In [None]:
#min-max normalize the PCV (WPC) values with one shared scale for all four columns
wpc_columns = ['WPC home 2min before', 'WPC away 2min before', 'WPC home 2min after', 'WPC away 2min after']
#both extremes are computed over all four columns before any column is rescaled
maximum = max(df_keys[column].max() for column in wpc_columns)
minimum = min(df_keys[column].min() for column in wpc_columns)
for column in wpc_columns:
    df_keys[column] = (df_keys[column] - minimum)/(maximum-minimum)

In [None]:
#select only relevant columns
relevant_columns = [
    'team',
    'PC home 2min before',
    'PC home 2min after',
    'WPC home 2min before',
    'WPC home 2min after',
    'PC away 2min before',
    'PC away 2min after',
    'WPC away 2min before',
    'WPC away 2min after',
    'type_key',
    'favorable',
    'unfavorable',
    'neutral',
    'minute',
]
#NOTE(review): dropna runs before the selection, so a missing value in any
#column (also one that is not selected below) removes the row
df_keys = df_keys.dropna()
df = pd.DataFrame(df_keys, columns=relevant_columns)

In [None]:
#regression for pitch control and favorable key moments
#statistics collected from every fitted model
p_valuepc = []
p_valuetype = []
pc_param = []
type_param = []
rsq = []
mse = []
pbar = ProgressBar()
#iterate the regression over a hundred times
for i in pbar(range(100)):
    #resample, with replacement, 80% of each type of key moment
    df_neutral = df.loc[df['neutral'] == 1].sample(frac=0.8, replace=True)
    df_favorable = df.loc[df['favorable'] == 1].sample(frac=0.8, replace=True)
    df_unfavorable = df.loc[df['unfavorable'] == 1].sample(frac=0.8, replace=True)
    df_new = pd.concat([df_neutral, df_favorable, df_unfavorable])

    #randomly take the pitch control values for either the home or away team:
    #one coin flip per sampled row, applied per column instead of row by row
    use_home = np.random.randint(2, size=len(df_new)) == 0
    pc_before = np.where(use_home, df_new['PC home 2min before'], df_new['PC away 2min before']).tolist()
    pc_after = np.where(use_home, df_new['PC home 2min after'], df_new['PC away 2min after']).tolist()
    wpc_before = np.where(use_home, df_new['WPC home 2min before'], df_new['WPC away 2min before']).tolist()
    wpc_after = np.where(use_home, df_new['WPC home 2min after'], df_new['WPC away 2min after']).tolist()
    #when the away values are taken the favorable/unfavorable flags are swapped;
    #rows whose team is 'Neutral' get 0 for both flags
    no_team = (df_new['team'] == 'Neutral').to_numpy()
    favorable = np.where(no_team, 0, np.where(use_home, df_new['favorable'], df_new['unfavorable'])).tolist()
    unfavorable = np.where(no_team, 0, np.where(use_home, df_new['unfavorable'], df_new['favorable'])).tolist()
    neutral = df_new['neutral'].tolist()

    #create dataframe with the necessary data for regression
    data = {'pc before': pc_before,
           'pc after': pc_after,
           'wpc before': wpc_before,
           'wpc after': wpc_after,
           'favorable': favorable,
           'unfavorable': unfavorable,
           'neutral': neutral}
    df_reg = pd.DataFrame(data)

    #Execute the regression
    #NOTE(review): X has no constant column, so OLS is fitted without an intercept - confirm this is intended
    X = df_reg[['pc before', 'favorable']]
    y = df_reg['pc after']

    model = sm.OLS(y, X).fit()
    predictions = model.predict(X) # make the predictions by the model

    #store the statistics of this fit; .iloc is used because integer keys on a
    #label-indexed Series are deprecated as positional lookups in pandas
    p_valuepc.append(model.pvalues.iloc[0])
    p_valuetype.append(model.pvalues.iloc[1])
    pc_param.append(model.params.iloc[0])
    type_param.append(model.params.iloc[1])
    #the 'rsquared' column below holds the adjusted R-squared
    rsq.append(model.rsquared_adj)
    #NOTE(review): mse_model is the model (explained) mean square, not the residual MSE (mse_resid) - confirm
    mse.append(model.mse_model)
data = {'p_value pc': p_valuepc,
       'p_value type': p_valuetype,
       'beta pc': pc_param,
       'beta type': type_param,
       'rsquared': rsq,
       'MSE': mse}
#final dataframe with the statistics this regression
df_pc_favorable = pd.DataFrame(data)
     

In [None]:
#regression for pitch control and unfavorable key moments
#statistics collected from every fitted model
p_valuepc = []
p_valuetype = []
pc_param = []
type_param = []
rsq = []
mse = []
pbar = ProgressBar()
#iterate the regression over a hundred times
for i in pbar(range(100)):
    #resample, with replacement, 80% of each type of key moment
    df_neutral = df.loc[df['neutral'] == 1].sample(frac=0.8, replace=True)
    df_favorable = df.loc[df['favorable'] == 1].sample(frac=0.8, replace=True)
    df_unfavorable = df.loc[df['unfavorable'] == 1].sample(frac=0.8, replace=True)
    df_new = pd.concat([df_neutral, df_favorable, df_unfavorable])

    #randomly take the pitch control values for either the home or away team:
    #one coin flip per sampled row, applied per column instead of row by row
    use_home = np.random.randint(2, size=len(df_new)) == 0
    pc_before = np.where(use_home, df_new['PC home 2min before'], df_new['PC away 2min before']).tolist()
    pc_after = np.where(use_home, df_new['PC home 2min after'], df_new['PC away 2min after']).tolist()
    wpc_before = np.where(use_home, df_new['WPC home 2min before'], df_new['WPC away 2min before']).tolist()
    wpc_after = np.where(use_home, df_new['WPC home 2min after'], df_new['WPC away 2min after']).tolist()
    #when the away values are taken the favorable/unfavorable flags are swapped;
    #rows whose team is 'Neutral' get 0 for both flags
    no_team = (df_new['team'] == 'Neutral').to_numpy()
    favorable = np.where(no_team, 0, np.where(use_home, df_new['favorable'], df_new['unfavorable'])).tolist()
    unfavorable = np.where(no_team, 0, np.where(use_home, df_new['unfavorable'], df_new['favorable'])).tolist()
    neutral = df_new['neutral'].tolist()

    #create dataframe with the necessary data for regression
    data = {'pc before': pc_before,
           'pc after': pc_after,
           'wpc before': wpc_before,
           'wpc after': wpc_after,
           'favorable': favorable,
           'unfavorable': unfavorable,
           'neutral': neutral}
    df_reg = pd.DataFrame(data)

    #Execute the regression
    #NOTE(review): X has no constant column, so OLS is fitted without an intercept - confirm this is intended
    X = df_reg[['pc before', 'unfavorable']]
    y = df_reg['pc after']

    model = sm.OLS(y, X).fit()
    predictions = model.predict(X) # make the predictions by the model

    #store the statistics of this fit; .iloc is used because integer keys on a
    #label-indexed Series are deprecated as positional lookups in pandas
    p_valuepc.append(model.pvalues.iloc[0])
    p_valuetype.append(model.pvalues.iloc[1])
    pc_param.append(model.params.iloc[0])
    type_param.append(model.params.iloc[1])
    #the 'rsquared' column below holds the adjusted R-squared
    rsq.append(model.rsquared_adj)
    #NOTE(review): mse_model is the model (explained) mean square, not the residual MSE (mse_resid) - confirm
    mse.append(model.mse_model)
data = {'p_value pc': p_valuepc,
       'p_value type': p_valuetype,
       'beta pc': pc_param,
       'beta type': type_param,
       'rsquared': rsq,
       'MSE': mse}
#final dataframe with the statistics this regression
df_pc_unfavorable = pd.DataFrame(data)
     

In [None]:
#regression for pitch control and neutral key moments
#statistics collected from every fitted model
p_valuepc = []
p_valuetype = []
pc_param = []
type_param = []
rsq = []
mse = []
pbar = ProgressBar()
#iterate the regression over a hundred times
for i in pbar(range(100)):
    #resample, with replacement, 80% of each type of key moment
    df_neutral = df.loc[df['neutral'] == 1].sample(frac=0.8, replace=True)
    df_favorable = df.loc[df['favorable'] == 1].sample(frac=0.8, replace=True)
    df_unfavorable = df.loc[df['unfavorable'] == 1].sample(frac=0.8, replace=True)
    df_new = pd.concat([df_neutral, df_favorable, df_unfavorable])

    #randomly take the pitch control values for either the home or away team:
    #one coin flip per sampled row, applied per column instead of row by row
    use_home = np.random.randint(2, size=len(df_new)) == 0
    pc_before = np.where(use_home, df_new['PC home 2min before'], df_new['PC away 2min before']).tolist()
    pc_after = np.where(use_home, df_new['PC home 2min after'], df_new['PC away 2min after']).tolist()
    wpc_before = np.where(use_home, df_new['WPC home 2min before'], df_new['WPC away 2min before']).tolist()
    wpc_after = np.where(use_home, df_new['WPC home 2min after'], df_new['WPC away 2min after']).tolist()
    #when the away values are taken the favorable/unfavorable flags are swapped;
    #rows whose team is 'Neutral' get 0 for both flags
    no_team = (df_new['team'] == 'Neutral').to_numpy()
    favorable = np.where(no_team, 0, np.where(use_home, df_new['favorable'], df_new['unfavorable'])).tolist()
    unfavorable = np.where(no_team, 0, np.where(use_home, df_new['unfavorable'], df_new['favorable'])).tolist()
    neutral = df_new['neutral'].tolist()

    #create dataframe with the necessary data for regression
    data = {'pc before': pc_before,
           'pc after': pc_after,
           'wpc before': wpc_before,
           'wpc after': wpc_after,
           'favorable': favorable,
           'unfavorable': unfavorable,
           'neutral': neutral}
    df_reg = pd.DataFrame(data)

    #Execute the regression
    #NOTE(review): X has no constant column, so OLS is fitted without an intercept - confirm this is intended
    X = df_reg[['pc before', 'neutral']]
    y = df_reg['pc after']

    model = sm.OLS(y, X).fit()
    predictions = model.predict(X) # make the predictions by the model

    #store the statistics of this fit; .iloc is used because integer keys on a
    #label-indexed Series are deprecated as positional lookups in pandas
    p_valuepc.append(model.pvalues.iloc[0])
    p_valuetype.append(model.pvalues.iloc[1])
    pc_param.append(model.params.iloc[0])
    type_param.append(model.params.iloc[1])
    #the 'rsquared' column below holds the adjusted R-squared
    rsq.append(model.rsquared_adj)
    #NOTE(review): mse_model is the model (explained) mean square, not the residual MSE (mse_resid) - confirm
    mse.append(model.mse_model)
data = {'p_value pc': p_valuepc,
       'p_value type': p_valuetype,
       'beta pc': pc_param,
       'beta type': type_param,
       'rsquared': rsq,
       'MSE': mse}
#final dataframe with the statistics this regression
df_pc_neutral = pd.DataFrame(data)
     

In [None]:
#regression for PCV and favorable key moments
#statistics collected from every fitted model
p_valuepc = []
p_valuetype = []
pc_param = []
type_param = []
rsq = []
mse = []
pbar = ProgressBar()
#iterate the regression over a hundred times
for i in pbar(range(100)):
    #resample, with replacement, 80% of each type of key moment
    df_neutral = df.loc[df['neutral'] == 1].sample(frac=0.8, replace=True)
    df_favorable = df.loc[df['favorable'] == 1].sample(frac=0.8, replace=True)
    df_unfavorable = df.loc[df['unfavorable'] == 1].sample(frac=0.8, replace=True)
    df_new = pd.concat([df_neutral, df_favorable, df_unfavorable])

    #randomly take the PCV values for either the home or away team:
    #one coin flip per sampled row, applied per column instead of row by row
    use_home = np.random.randint(2, size=len(df_new)) == 0
    pc_before = np.where(use_home, df_new['PC home 2min before'], df_new['PC away 2min before']).tolist()
    pc_after = np.where(use_home, df_new['PC home 2min after'], df_new['PC away 2min after']).tolist()
    wpc_before = np.where(use_home, df_new['WPC home 2min before'], df_new['WPC away 2min before']).tolist()
    wpc_after = np.where(use_home, df_new['WPC home 2min after'], df_new['WPC away 2min after']).tolist()
    #when the away values are taken the favorable/unfavorable flags are swapped;
    #rows whose team is 'Neutral' get 0 for both flags
    no_team = (df_new['team'] == 'Neutral').to_numpy()
    favorable = np.where(no_team, 0, np.where(use_home, df_new['favorable'], df_new['unfavorable'])).tolist()
    unfavorable = np.where(no_team, 0, np.where(use_home, df_new['unfavorable'], df_new['favorable'])).tolist()
    neutral = df_new['neutral'].tolist()

    #create dataframe with the necessary data for regression
    data = {'pc before': pc_before,
           'pc after': pc_after,
           'wpc before': wpc_before,
           'wpc after': wpc_after,
           'favorable': favorable,
           'unfavorable': unfavorable,
           'neutral': neutral}
    df_reg = pd.DataFrame(data)

    #Execute the regression
    #NOTE(review): X has no constant column, so OLS is fitted without an intercept - confirm this is intended
    X = df_reg[['wpc before', 'favorable']]
    y = df_reg['wpc after']

    model = sm.OLS(y, X).fit()
    predictions = model.predict(X) # make the predictions by the model

    #store the statistics of this fit; .iloc is used because integer keys on a
    #label-indexed Series are deprecated as positional lookups in pandas
    p_valuepc.append(model.pvalues.iloc[0])
    p_valuetype.append(model.pvalues.iloc[1])
    pc_param.append(model.params.iloc[0])
    type_param.append(model.params.iloc[1])
    #the 'rsquared' column below holds the adjusted R-squared
    rsq.append(model.rsquared_adj)
    #NOTE(review): mse_model is the model (explained) mean square, not the residual MSE (mse_resid) - confirm
    mse.append(model.mse_model)
data = {'p_value pc': p_valuepc,
       'p_value type': p_valuetype,
       'beta pc': pc_param,
       'beta type': type_param,
       'rsquared': rsq,
       'MSE': mse}
#final dataframe with the statistics this regression
df_pcv_favorable = pd.DataFrame(data)
     

In [None]:
#regression for PCV and unfavorable key moments
#statistics collected from every fitted model
p_valuepc = []
p_valuetype = []
pc_param = []
type_param = []
rsq = []
mse = []
pbar = ProgressBar()
#iterate the regression over a hundred times
for i in pbar(range(100)):
    #resample, with replacement, 80% of each type of key moment
    df_neutral = df.loc[df['neutral'] == 1].sample(frac=0.8, replace=True)
    df_favorable = df.loc[df['favorable'] == 1].sample(frac=0.8, replace=True)
    df_unfavorable = df.loc[df['unfavorable'] == 1].sample(frac=0.8, replace=True)
    df_new = pd.concat([df_neutral, df_favorable, df_unfavorable])

    #randomly take the PCV values for either the home or away team:
    #one coin flip per sampled row, applied per column instead of row by row
    use_home = np.random.randint(2, size=len(df_new)) == 0
    pc_before = np.where(use_home, df_new['PC home 2min before'], df_new['PC away 2min before']).tolist()
    pc_after = np.where(use_home, df_new['PC home 2min after'], df_new['PC away 2min after']).tolist()
    wpc_before = np.where(use_home, df_new['WPC home 2min before'], df_new['WPC away 2min before']).tolist()
    wpc_after = np.where(use_home, df_new['WPC home 2min after'], df_new['WPC away 2min after']).tolist()
    #when the away values are taken the favorable/unfavorable flags are swapped;
    #rows whose team is 'Neutral' get 0 for both flags
    no_team = (df_new['team'] == 'Neutral').to_numpy()
    favorable = np.where(no_team, 0, np.where(use_home, df_new['favorable'], df_new['unfavorable'])).tolist()
    unfavorable = np.where(no_team, 0, np.where(use_home, df_new['unfavorable'], df_new['favorable'])).tolist()
    neutral = df_new['neutral'].tolist()

    #create dataframe with the necessary data for regression
    data = {'pc before': pc_before,
           'pc after': pc_after,
           'wpc before': wpc_before,
           'wpc after': wpc_after,
           'favorable': favorable,
           'unfavorable': unfavorable,
           'neutral': neutral}
    df_reg = pd.DataFrame(data)

    #Execute the regression
    #NOTE(review): X has no constant column, so OLS is fitted without an intercept - confirm this is intended
    X = df_reg[['wpc before', 'unfavorable']]
    y = df_reg['wpc after']

    model = sm.OLS(y, X).fit()
    predictions = model.predict(X) # make the predictions by the model

    #store the statistics of this fit; .iloc is used because integer keys on a
    #label-indexed Series are deprecated as positional lookups in pandas
    p_valuepc.append(model.pvalues.iloc[0])
    p_valuetype.append(model.pvalues.iloc[1])
    pc_param.append(model.params.iloc[0])
    type_param.append(model.params.iloc[1])
    #the 'rsquared' column below holds the adjusted R-squared
    rsq.append(model.rsquared_adj)
    #NOTE(review): mse_model is the model (explained) mean square, not the residual MSE (mse_resid) - confirm
    mse.append(model.mse_model)
data = {'p_value pc': p_valuepc,
       'p_value type': p_valuetype,
       'beta pc': pc_param,
       'beta type': type_param,
       'rsquared': rsq,
       'MSE': mse}
#final dataframe with the statistics this regression
df_pcv_unfavorable = pd.DataFrame(data)
     

In [None]:
#regression for PCV and neutral key moments
#statistics collected from every fitted model
p_valuepc = []
p_valuetype = []
pc_param = []
type_param = []
rsq = []
mse = []
pbar = ProgressBar()
#iterate the regression over a hundred times
for i in pbar(range(100)):
    #resample, with replacement, 80% of each type of key moment
    df_neutral = df.loc[df['neutral'] == 1].sample(frac=0.8, replace=True)
    df_favorable = df.loc[df['favorable'] == 1].sample(frac=0.8, replace=True)
    df_unfavorable = df.loc[df['unfavorable'] == 1].sample(frac=0.8, replace=True)
    df_new = pd.concat([df_neutral, df_favorable, df_unfavorable])

    #randomly take the PCV values for either the home or away team:
    #one coin flip per sampled row, applied per column instead of row by row
    use_home = np.random.randint(2, size=len(df_new)) == 0
    pc_before = np.where(use_home, df_new['PC home 2min before'], df_new['PC away 2min before']).tolist()
    pc_after = np.where(use_home, df_new['PC home 2min after'], df_new['PC away 2min after']).tolist()
    wpc_before = np.where(use_home, df_new['WPC home 2min before'], df_new['WPC away 2min before']).tolist()
    wpc_after = np.where(use_home, df_new['WPC home 2min after'], df_new['WPC away 2min after']).tolist()
    #when the away values are taken the favorable/unfavorable flags are swapped;
    #rows whose team is 'Neutral' get 0 for both flags
    no_team = (df_new['team'] == 'Neutral').to_numpy()
    favorable = np.where(no_team, 0, np.where(use_home, df_new['favorable'], df_new['unfavorable'])).tolist()
    unfavorable = np.where(no_team, 0, np.where(use_home, df_new['unfavorable'], df_new['favorable'])).tolist()
    neutral = df_new['neutral'].tolist()

    #create dataframe with the necessary data for regression
    data = {'pc before': pc_before,
           'pc after': pc_after,
           'wpc before': wpc_before,
           'wpc after': wpc_after,
           'favorable': favorable,
           'unfavorable': unfavorable,
           'neutral': neutral}
    df_reg = pd.DataFrame(data)

    #Execute the regression
    #NOTE(review): X has no constant column, so OLS is fitted without an intercept - confirm this is intended
    X = df_reg[['wpc before', 'neutral']]
    y = df_reg['wpc after']

    model = sm.OLS(y, X).fit()
    predictions = model.predict(X) # make the predictions by the model

    #store the statistics of this fit; .iloc is used because integer keys on a
    #label-indexed Series are deprecated as positional lookups in pandas
    p_valuepc.append(model.pvalues.iloc[0])
    p_valuetype.append(model.pvalues.iloc[1])
    pc_param.append(model.params.iloc[0])
    type_param.append(model.params.iloc[1])
    #the 'rsquared' column below holds the adjusted R-squared
    rsq.append(model.rsquared_adj)
    #NOTE(review): mse_model is the model (explained) mean square, not the residual MSE (mse_resid) - confirm
    mse.append(model.mse_model)
data = {'p_value pc': p_valuepc,
       'p_value type': p_valuetype,
       'beta pc': pc_param,
       'beta type': type_param,
       'rsquared': rsq,
       'MSE': mse}
#final dataframe with the statistics this regression
df_pcv_neutral = pd.DataFrame(data)
     

In [None]:
#Import xT file
import pickle
#NOTE: pickle.load can execute arbitrary code, only load this file from a trusted source
#the context manager closes the file even when unpickling raises
with open('input xT regression key moments on Momentum.p','rb') as infile:
    df_keys = pickle.load(infile)

In [None]:
#create three additional columns for binary values for the type of key moment
#each column is 1 where 'key_moment' equals the column name and 0 otherwise;
#the comparison is done on the whole column instead of row by row with .iloc
for moment_type in ('favorable', 'unfavorable', 'neutral'):
    df_keys[moment_type] = (df_keys['key_moment'] == moment_type).astype('int64')

In [None]:
#min-max normalize the xT values with one shared scale for all four columns
xt_columns = ['xT_prev_2min_home', 'xT_prev_2min_away', 'xT_next_2min_home', 'xT_next_2min_away']
#both extremes are computed over all four columns before any column is rescaled
maximum = max(df_keys[column].max() for column in xt_columns)
minimum = min(df_keys[column].min() for column in xt_columns)
for column in xt_columns:
    df_keys[column] = (df_keys[column] - minimum)/(maximum-minimum)

In [None]:
#only select the relevant columns in the dataframe
relevant_columns = [
    'team',
    'xT_prev_2min_home',
    'xT_next_2min_home',
    'xT_prev_2min_away',
    'xT_next_2min_away',
    'type_key',
    'favorable',
    'unfavorable',
    'neutral',
]
#NOTE(review): dropna runs before the selection, so a missing value in any
#column (also one that is not selected below) removes the row
df_keys = df_keys.dropna()
df = pd.DataFrame(df_keys, columns=relevant_columns)

In [None]:
#regression for xT and favorable key moments
#statistics collected from every fitted model
p_valuext = []
p_valuetype = []
xt_param = []
type_param = []
rsq = []
mse = []
pbar = ProgressBar()
#iterate regression over a hundred times
for i in pbar(range(100)):
    #resample, with replacement, 80% of each type of key moment
    df_neutral = df.loc[df['neutral'] == 1].sample(frac=0.8, replace=True)
    df_favorable = df.loc[df['favorable'] == 1].sample(frac=0.8, replace=True)
    df_unfavorable = df.loc[df['unfavorable'] == 1].sample(frac=0.8, replace=True)
    df_new = pd.concat([df_neutral, df_favorable, df_unfavorable])

    #randomly take the xT values for either the home or away team:
    #one coin flip per sampled row, applied per column instead of row by row
    use_home = np.random.randint(2, size=len(df_new)) == 0
    #before = previous 2 minutes, after = next 2 minutes, both of the same team
    #(the away case must read the two away columns, not a home/away mix)
    xt_before = np.where(use_home, df_new['xT_prev_2min_home'], df_new['xT_prev_2min_away']).tolist()
    xt_after = np.where(use_home, df_new['xT_next_2min_home'], df_new['xT_next_2min_away']).tolist()
    #when the away values are taken the favorable/unfavorable flags are swapped;
    #rows whose team is 'Neutral' get 0 for both flags
    no_team = (df_new['team'] == 'Neutral').to_numpy()
    favorable = np.where(no_team, 0, np.where(use_home, df_new['favorable'], df_new['unfavorable'])).tolist()
    unfavorable = np.where(no_team, 0, np.where(use_home, df_new['unfavorable'], df_new['favorable'])).tolist()
    neutral = df_new['neutral'].tolist()

    #create dataframe with the necessary data for regression
    data = {'xt before': xt_before,
           'xt after': xt_after,
           'favorable': favorable,
           'unfavorable': unfavorable,
           'neutral': neutral}
    df_reg = pd.DataFrame(data)

    #Execute the regression
    #NOTE(review): X has no constant column, so OLS is fitted without an intercept - confirm this is intended
    X = df_reg[['xt before', 'favorable']]
    y = df_reg['xt after']

    model = sm.OLS(y, X).fit()
    predictions = model.predict(X) # make the predictions by the model

    #store the statistics of this fit; .iloc is used because integer keys on a
    #label-indexed Series are deprecated as positional lookups in pandas
    p_valuext.append(model.pvalues.iloc[0])
    p_valuetype.append(model.pvalues.iloc[1])
    xt_param.append(model.params.iloc[0])
    type_param.append(model.params.iloc[1])
    #the 'rsquared' column below holds the adjusted R-squared
    rsq.append(model.rsquared_adj)
    #NOTE(review): mse_model is the model (explained) mean square, not the residual MSE (mse_resid) - confirm
    mse.append(model.mse_model)
data = {'p_value xt': p_valuext,
       'p_value type': p_valuetype,
       'beta xt': xt_param,
       'beta type': type_param,
       'rsquared': rsq,
       'MSE': mse}

#final dataframe with the statistics this regression
df_xt_favorable = pd.DataFrame(data)
     

In [None]:
#regression for xT and unfavorable key moments
#statistics collected from every fitted model
p_valuext = []
p_valuetype = []
xt_param = []
type_param = []
rsq = []
mse = []
pbar = ProgressBar()
#iterate regression over a hundred times
for i in pbar(range(100)):
    #resample, with replacement, 80% of each type of key moment
    df_neutral = df.loc[df['neutral'] == 1].sample(frac=0.8, replace=True)
    df_favorable = df.loc[df['favorable'] == 1].sample(frac=0.8, replace=True)
    df_unfavorable = df.loc[df['unfavorable'] == 1].sample(frac=0.8, replace=True)
    df_new = pd.concat([df_neutral, df_favorable, df_unfavorable])

    #randomly take the xT values for either the home or away team:
    #one coin flip per sampled row, applied per column instead of row by row
    use_home = np.random.randint(2, size=len(df_new)) == 0
    #before = previous 2 minutes, after = next 2 minutes, both of the same team
    #(the away case must read the two away columns, not a home/away mix)
    xt_before = np.where(use_home, df_new['xT_prev_2min_home'], df_new['xT_prev_2min_away']).tolist()
    xt_after = np.where(use_home, df_new['xT_next_2min_home'], df_new['xT_next_2min_away']).tolist()
    #when the away values are taken the favorable/unfavorable flags are swapped;
    #rows whose team is 'Neutral' get 0 for both flags
    no_team = (df_new['team'] == 'Neutral').to_numpy()
    favorable = np.where(no_team, 0, np.where(use_home, df_new['favorable'], df_new['unfavorable'])).tolist()
    unfavorable = np.where(no_team, 0, np.where(use_home, df_new['unfavorable'], df_new['favorable'])).tolist()
    neutral = df_new['neutral'].tolist()

    #create dataframe with the necessary data for regression
    data = {'xt before': xt_before,
           'xt after': xt_after,
           'favorable': favorable,
           'unfavorable': unfavorable,
           'neutral': neutral}
    df_reg = pd.DataFrame(data)

    #Execute the regression
    #NOTE(review): X has no constant column, so OLS is fitted without an intercept - confirm this is intended
    X = df_reg[['xt before', 'unfavorable']]
    y = df_reg['xt after']

    model = sm.OLS(y, X).fit()
    predictions = model.predict(X) # make the predictions by the model

    #store the statistics of this fit; .iloc is used because integer keys on a
    #label-indexed Series are deprecated as positional lookups in pandas
    p_valuext.append(model.pvalues.iloc[0])
    p_valuetype.append(model.pvalues.iloc[1])
    xt_param.append(model.params.iloc[0])
    type_param.append(model.params.iloc[1])
    #the 'rsquared' column below holds the adjusted R-squared
    rsq.append(model.rsquared_adj)
    #NOTE(review): mse_model is the model (explained) mean square, not the residual MSE (mse_resid) - confirm
    mse.append(model.mse_model)
data = {'p_value xt': p_valuext,
       'p_value type': p_valuetype,
       'beta xt': xt_param,
       'beta type': type_param,
       'rsquared': rsq,
       'MSE': mse}

#final dataframe with the statistics this regression
df_xt_unfavorable = pd.DataFrame(data)
     

In [None]:
#regression for xT and neutral key moments
#statistics collected from every fitted model
p_valuext = []
p_valuetype = []
xt_param = []
type_param = []
rsq = []
mse = []
pbar = ProgressBar()
#iterate regression over a hundred times
for i in pbar(range(100)):
    #resample, with replacement, 80% of each type of key moment
    df_neutral = df.loc[df['neutral'] == 1].sample(frac=0.8, replace=True)
    df_favorable = df.loc[df['favorable'] == 1].sample(frac=0.8, replace=True)
    df_unfavorable = df.loc[df['unfavorable'] == 1].sample(frac=0.8, replace=True)
    df_new = pd.concat([df_neutral, df_favorable, df_unfavorable])

    #randomly take the xT values for either the home or away team:
    #one coin flip per sampled row, applied per column instead of row by row
    use_home = np.random.randint(2, size=len(df_new)) == 0
    #before = previous 2 minutes, after = next 2 minutes, both of the same team
    #(the away case must read the two away columns, not a home/away mix)
    xt_before = np.where(use_home, df_new['xT_prev_2min_home'], df_new['xT_prev_2min_away']).tolist()
    xt_after = np.where(use_home, df_new['xT_next_2min_home'], df_new['xT_next_2min_away']).tolist()
    #when the away values are taken the favorable/unfavorable flags are swapped;
    #rows whose team is 'Neutral' get 0 for both flags
    no_team = (df_new['team'] == 'Neutral').to_numpy()
    favorable = np.where(no_team, 0, np.where(use_home, df_new['favorable'], df_new['unfavorable'])).tolist()
    unfavorable = np.where(no_team, 0, np.where(use_home, df_new['unfavorable'], df_new['favorable'])).tolist()
    neutral = df_new['neutral'].tolist()

    #create dataframe with the necessary data for regression
    data = {'xt before': xt_before,
           'xt after': xt_after,
           'favorable': favorable,
           'unfavorable': unfavorable,
           'neutral': neutral}
    df_reg = pd.DataFrame(data)

    #Execute the regression
    #NOTE(review): X has no constant column, so OLS is fitted without an intercept - confirm this is intended
    X = df_reg[['xt before', 'neutral']]
    y = df_reg['xt after']

    model = sm.OLS(y, X).fit()
    predictions = model.predict(X) # make the predictions by the model

    #store the statistics of this fit; .iloc is used because integer keys on a
    #label-indexed Series are deprecated as positional lookups in pandas
    p_valuext.append(model.pvalues.iloc[0])
    p_valuetype.append(model.pvalues.iloc[1])
    xt_param.append(model.params.iloc[0])
    type_param.append(model.params.iloc[1])
    #the 'rsquared' column below holds the adjusted R-squared
    rsq.append(model.rsquared_adj)
    #NOTE(review): mse_model is the model (explained) mean square, not the residual MSE (mse_resid) - confirm
    mse.append(model.mse_model)
data = {'p_value xt': p_valuext,
       'p_value type': p_valuetype,
       'beta xt': xt_param,
       'beta type': type_param,
       'rsquared': rsq,
       'MSE': mse}

#final dataframe with the statistics this regression
df_xt_neutral = pd.DataFrame(data)
     