In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

# Load the raw dataset.
file_path = 'Sports Dataset.csv'
data = pd.read_csv(file_path)

# Report, for every column that has at least one null, the null count and the
# share of rows it represents.
missing_values = data.isnull().sum()
missing_values = missing_values[missing_values > 0]
missing_values_percentage = (missing_values / len(data)) * 100

missing_values_df = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage': missing_values_percentage
})

print(missing_values_df)

# Split columns by dtype: numeric columns are KNN-imputed, object columns are
# filled with their most frequent value.
# NOTE(review): every float64/int64 column is fed to the imputer, including any
# row-id style column carried over from the CSV -- confirm that is intended.
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = data.select_dtypes(include=['object']).columns

knn_imputer = KNNImputer(n_neighbors=5)
data[numerical_columns] = knn_imputer.fit_transform(data[numerical_columns])

for column in categorical_columns:
    modes = data[column].mode()
    if modes.empty:
        # Entirely-null column: there is no mode to fill with, leave it as is.
        continue
    # Assign the result back instead of `fillna(..., inplace=True)` on a column
    # selection: the in-place form operates on a temporary under pandas
    # Copy-on-Write and would leave `data` unchanged.
    data[column] = data[column].fillna(modes.iloc[0])
print(data.isnull().sum())



                           Missing Values  Percentage
Height                               4625      23.125
Weight                               4761      23.805
Goals                                4760      23.800
Assists                              4745      23.725
PassCompletionRate                   2910      14.550
PressurePerformanceImpact            2107      10.535
EffectiveTraining                    3917      19.585


In [2]:
# Per-column null counts after imputation; every entry is expected to be 0.
print(data.isna().sum())

Unnamed: 0                   0
Player                       0
Team                         0
Age                          0
Height                       0
Weight                       0
Position                     0
Goals                        0
Assists                      0
YellowCards                  0
RedCards                     0
PassCompletionRate           0
DistanceCovered              0
Sprints                      0
ShotsOnTarget                0
TacklesWon                   0
CleanSheets                  0
PlayerFatigue                0
MatchPressure                0
InjuryHistory                0
TrainingHours                0
FatigueInjuryCorrelation     0
PressurePerformanceImpact    0
EffectiveTraining            0
Season                       0
dtype: int64


In [3]:
# Tukey fences: a value is an outlier when it lies more than 1.5 * IQR below
# the first quartile or above the third quartile of its column.
# NOTE(review): for a column whose IQR is 0 both fences collapse onto the
# quartile value, so every other value in that column is flagged -- verify
# this is acceptable for near-constant columns.
Q1 = data[numerical_columns].quantile(0.25)
Q3 = data[numerical_columns].quantile(0.75)
IQR = Q3 - Q1

outliers = (
    data[numerical_columns].lt(Q1 - 1.5 * IQR)
    | data[numerical_columns].gt(Q3 + 1.5 * IQR)
)

# A row is dropped as soon as any one of its numeric columns is flagged.
outliers_combined = outliers.any(axis=1)

data_cleaned = data[~outliers_combined]

# Number of rows that had at least one flagged value.
removed_outliers_count = outliers_combined.sum()
print(f"Removed {removed_outliers_count} outliers from the dataset.")

Removed 6314 outliers from the dataset.


In [5]:
def round_numerical_columns(df):
    """Round every float64/int64 column of ``df`` to two decimal places.

    The frame is modified in place and also returned, so both
    ``round_numerical_columns(df)`` and ``df = round_numerical_columns(df)``
    work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be rounded.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for col in df.select_dtypes(include=['float64', 'int64']).columns:
        # Vectorised rounding: avoids a Python-level call per element and is
        # the same operation validate_data_formats compares against.
        df[col] = df[col].round(2)
    return df

def standardize_categorical_columns(df):
    """Lower-case every object-dtype column of ``df``.

    The frame is modified in place and returned.
    """
    text_columns = df.select_dtypes(include=['object']).columns
    for name in text_columns:
        lowered = df[name].str.lower()
        df[name] = lowered
    return df

def standardize_date_columns(df):
    """Replace every datetime64 column of ``df`` with 'YYYY-MM-DD' strings.

    Only columns that already have a datetime64 dtype are touched. The frame
    is modified in place and returned.
    """
    for name in df.select_dtypes(include=['datetime64']).columns:
        as_datetime = pd.to_datetime(df[name])
        df[name] = as_datetime.dt.strftime('%Y-%m-%d')
    return df

# Normalise the full (imputed) frame.
data = round_numerical_columns(data)
data = standardize_categorical_columns(data)
data = standardize_date_columns(data)

# `data_cleaned` was split off from `data` before the formatting above ran, and
# it is the frame that is written to disk afterwards, so it needs the same
# treatment. The formatters assign columns in place, so work on an explicit
# copy to avoid SettingWithCopyWarning on the filtered frame.
data_cleaned = data_cleaned.copy()
data_cleaned = round_numerical_columns(data_cleaned)
data_cleaned = standardize_categorical_columns(data_cleaned)
data_cleaned = standardize_date_columns(data_cleaned)

In [7]:
import pandas as pd
import numpy as np

def augment_data(df, num_samples, noise_std=0.01, random_state=None):
    """Append ``num_samples`` jittered, resampled rows to ``df``.

    Rows are drawn from ``df`` with replacement; every float64/int64 column of
    the drawn rows gets zero-mean Gaussian noise added and is then rounded to
    two decimals. Non-numeric columns are copied unchanged. ``df`` itself is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame to resample from.
    num_samples : int
        Number of synthetic rows to generate.
    noise_std : float, default 0.01
        Standard deviation of the Gaussian noise added to numeric columns.
    random_state : int or None, default None
        Seed for a private random generator, making the result reproducible.
        When None, NumPy's global random state is used for both the row
        sampling and the noise.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the synthetic rows, with a fresh 0..n-1 index.
    """
    if random_state is None:
        # Draw from the global NumPy stream.
        rng = np.random
        sample_state = None
    else:
        # One private stream drives both the row sampling and the noise.
        rng = np.random.RandomState(random_state)
        sample_state = rng

    synthetic_data = df.sample(
        n=num_samples, replace=True, random_state=sample_state
    ).reset_index(drop=True)

    numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
    for col in numerical_columns:
        noise = rng.normal(0, noise_std, size=synthetic_data[col].shape)
        # Integer columns become float here because the noise is float.
        synthetic_data[col] = (synthetic_data[col] + noise).round(2)

    augmented_data = pd.concat([df, synthetic_data], ignore_index=True)

    return augmented_data

num_synthetic_samples = 1000  # Adjust as needed

# The augmentation samples rows and draws Gaussian noise from NumPy's global
# random state; seed it so re-running the notebook yields the same rows.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

augmented_data = augment_data(data, num_synthetic_samples)


In [9]:
# Write the outlier-filtered frame to CSV without the pandas index column.
cleaned_file_path = 'Sports_Dataset_Cleaned2.csv'
data_cleaned.to_csv(path_or_buf=cleaned_file_path, index=False)
print(f"Cleaned data saved to {cleaned_file_path}")

Cleaned data saved to Sports_Dataset_Cleaned2.csv


In [11]:
import pandas as pd
import numpy as np
from scipy import stats

# Re-read the cleaned CSV from disk so the checks below run against what was
# actually persisted. This rebinds `data`, replacing the in-memory frame.
data = pd.read_csv('Sports_Dataset_Cleaned2.csv')

def validate_missing_values(df):
    """Print per-column null counts for ``df`` followed by a one-line verdict.

    Returns None; the report goes to stdout only.
    """
    null_counts = df.isnull().sum()
    print("Missing values in each column:")
    print(null_counts)
    verdict = (
        "Missing values found. Please handle them."
        if null_counts.sum() != 0
        else "No missing values found."
    )
    print(verdict)

def validate_outliers(df, columns_to_check):
    """Print, for each listed column, how many values have |z-score| > 3.

    All z-scores are computed before anything is printed, so a column missing
    from ``df`` raises before any output is produced. Returns None.
    """
    outlier_counts = {}
    for column in columns_to_check:
        magnitudes = np.abs(stats.zscore(df[column]))
        outlier_counts[column] = int(np.count_nonzero(magnitudes > 3))

    print("Outliers in each column:")
    for column, count in outlier_counts.items():
        if count > 0:
            print(f"{column}: {count} outliers found.")
        else:
            print(f"{column}: No outliers found.")

def validate_data_formats(df):
    """Print one formatting verdict per column of ``df``.

    Three checks, in this order:
      * float64/int64 columns: every value equals itself rounded to 2 decimals;
      * object columns: every value equals its lower-cased form;
      * datetime columns: the column parses with the '%Y-%m-%d' format.

    Returns None; results go to stdout only.
    """
    # Numeric columns: rounding to 2 decimals must be a no-op.
    for col in df.select_dtypes(include=['float64', 'int64']).columns:
        values = df[col]
        if (values == values.round(2)).all():
            print(f"Column {col} is properly rounded to 2 decimal places.")
        else:
            print(f"Column {col} is not properly rounded to 2 decimal places.")

    # Text columns: lower-casing must be a no-op.
    for col in df.select_dtypes(include=['object']).columns:
        values = df[col]
        if (values == values.str.lower()).all():
            print(f"Column {col} is properly formatted to lower case.")
        else:
            print(f"Column {col} is not properly formatted to lower case.")

    # Datetime columns: a ValueError from parsing means the format is wrong.
    for col in df.select_dtypes(include=['datetime64', 'datetime']).columns:
        try:
            pd.to_datetime(df[col], format='%Y-%m-%d')
        except ValueError:
            print(f"Column {col} is not properly formatted as 'YYYY-MM-DD'.")
        else:
            print(f"Column {col} is properly formatted as 'YYYY-MM-DD'.")

# Columns whose values are screened with the z-score outlier check.
columns_to_check = [
    'Goals',
    'Assists',
    'YellowCards',
    'RedCards',
    'PassCompletionRate',
    'DistanceCovered',
    'Sprints',
    'ShotsOnTarget',
    'TacklesWon',
]

# Run the three checks against the reloaded frame: nulls, outliers, formats.
validate_missing_values(data)
validate_outliers(data, columns_to_check)
validate_data_formats(data)


Missing values in each column:
Unnamed: 0                   0
Player                       0
Team                         0
Age                          0
Height                       0
Weight                       0
Position                     0
Goals                        0
Assists                      0
YellowCards                  0
RedCards                     0
PassCompletionRate           0
DistanceCovered              0
Sprints                      0
ShotsOnTarget                0
TacklesWon                   0
CleanSheets                  0
PlayerFatigue                0
MatchPressure                0
InjuryHistory                0
TrainingHours                0
FatigueInjuryCorrelation     0
PressurePerformanceImpact    0
EffectiveTraining            0
Season                       0
dtype: int64
No missing values found.
Outliers in each column:
Goals: 160 outliers found.
Assists: No outliers found.
YellowCards: No outliers found.
RedCards: No outliers found.
PassCompletionR