In [None]:
import pandas as pd  # Importing pandas for data manipulation
import numpy as np  # Importing numpy for numerical operations


In [None]:
# Load the raw Bundesliga 2021 match data.
df = pd.read_csv('data/original_data/bundesliga_2021.csv')

# Columns retained in the cleaned season files (also used by the 2022 cell below).
columns_to_keep = ['Home Team', 'Away Team', 'Home Goals', 'Away Goals', 'Home Ast', 'Away Ast', 'Home xG', 'Away xG']

# Remove the row labelled 233.
# NOTE(review): presumably this row is malformed, since it is dropped right
# before 'Away Goals' is cast to an integer type -- confirm against the raw file.
df = df.drop(index=233)

# Select the columns and cast 'Away Goals' to int64 in one expression.
# `astype` returns a new DataFrame, so nothing is assigned into a slice of `df`
# (the original column assignment on the slice triggers SettingWithCopyWarning).
filtered_df = df[columns_to_keep].astype({'Away Goals': np.int64})

# Persist the cleaned 2021 season without the index column.
filtered_df.to_csv('data/cleaned_data/bundesliga_2021_cleaned.csv', index=False)

In [None]:
# Load the raw Bundesliga 2022 match data.
df = pd.read_csv('data/original_data/bundesliga_2022.csv')

# Restrict the frame to the columns listed in `columns_to_keep`
# (defined in the 2021 cleaning cell).
filtered_df = df.loc[:, columns_to_keep]

# Persist the cleaned 2022 season without the index column.
filtered_df.to_csv(
    'data/cleaned_data/bundesliga_2022_cleaned.csv',
    index=False,
)

# Preview the first rows of the cleaned frame as the cell output.
filtered_df.head()

In [None]:
# Load the raw Bundesliga 2023 match data.
df = pd.read_csv('data/original_data/bundesliga_2023.csv')

# Columns retained in the cleaned file: teams, goals, assists and xG per side.
columns_to_keep = [
    'Home Team', 'Away Team',
    'Home Goals', 'Away Goals',
    'Home Ast', 'Away Ast',
    'Home xG', 'Away xG',
]

# Restrict the frame to those columns.
filtered_df = df.loc[:, columns_to_keep]

# Persist the cleaned 2023 season without the index column.
filtered_df.to_csv(
    'data/cleaned_data/bundesliga_2023_cleaned.csv',
    index=False,
)

# Preview the first rows of the cleaned frame as the cell output.
filtered_df.head()

In [None]:
# Directory the per-season cleaning cells write their output to. Reading from
# the same relative location (instead of a machine-specific absolute path)
# guarantees this cell combines the files the notebook just produced and keeps
# the notebook runnable on other machines.
cleaned_data_dir = 'data/cleaned_data'

# Load the cleaned 2021, 2022 and 2023 season files.
bundesliga_2021 = pd.read_csv(f'{cleaned_data_dir}/bundesliga_2021_cleaned.csv')
bundesliga_2022 = pd.read_csv(f'{cleaned_data_dir}/bundesliga_2022_cleaned.csv')
bundesliga_2023 = pd.read_csv(f'{cleaned_data_dir}/bundesliga_2023_cleaned.csv')

# Stack the three seasons into one frame; ignore_index=True renumbers the rows
# 0..n-1 so per-season row labels do not repeat.
combined_bundesliga = pd.concat([bundesliga_2021, bundesliga_2022, bundesliga_2023], ignore_index=True)

# Persist the combined data without the index column.
combined_bundesliga.to_csv(f'{cleaned_data_dir}/bundesliga_cleaned.csv', index=False)

In [None]:
# FC Augsburg vs. SV Werder Bremen

# Load the match simulation results for this fixture.
# The path is project-relative rather than an absolute home-directory path so
# the notebook also runs on other machines.
# NOTE(review): assumes the notebook is executed from the project root (the
# directory containing 'data/' and 'Model_Jihang/') -- confirm.
match_one_Augsburg = pd.read_csv('Model_Jihang/result.csv')

# Mean simulated goals for the home and away side across all simulations.
home_goal_mean = match_one_Augsburg['Home Goals Simulations'].mean()
away_goal_mean = match_one_Augsburg['Away Goals Simulations'].mean()

def classify_result(row):
    """
    Label one simulated match as a home win, an away win, or a tie.

    A side wins when it outscores the other by at least 0.5 goals; any
    smaller margin in either direction is reported as a tie.

    Args:
        row (pd.Series): Mapping-like row providing 'Home Goals Simulations'
            and 'Away Goals Simulations'.

    Returns:
        str: 'Home Win', 'Away Win', or 'Tie'.
    """
    home, away = row['Home Goals Simulations'], row['Away Goals Simulations']

    # Guard clauses: return as soon as one side clears the 0.5-goal margin.
    if home - away >= 0.5:
        return 'Home Win'
    if away - home >= 0.5:
        return 'Away Win'

    # Neither side is ahead by the required margin.
    return 'Tie'

# Label every simulated match with its outcome (row-wise classification).
match_one_Augsburg['Outcome'] = match_one_Augsburg.apply(classify_result, axis=1)

# Number of simulations overall and per outcome. Each per-outcome count is the
# number of rows whose 'Outcome' equals the label, converted to a plain int.
total_simulations = match_one_Augsburg.shape[0]
home_wins = int((match_one_Augsburg['Outcome'] == 'Home Win').sum())
away_wins = int((match_one_Augsburg['Outcome'] == 'Away Win').sum())
ties = int((match_one_Augsburg['Outcome'] == 'Tie').sum())

# Share of each outcome, expressed as a percentage of all simulations.
home_win_percentage = home_wins / total_simulations * 100
away_win_percentage = away_wins / total_simulations * 100
tie_percentage = ties / total_simulations * 100

# One-row summary table: fixture, outcome percentages and mean simulated goals.
percentages = pd.DataFrame({
    'Home Team': ['FC Augsburg'],
    'Away Team': ['SV Werder Bremen'],
    'Home Win': [home_win_percentage],
    'Away Win': [away_win_percentage],
    'Tie': [tie_percentage],
    'Home xG': [home_goal_mean],
    'Away xG': [away_goal_mean],
})

# Show the summary table as the cell output.
percentages