In [4]:
import pandas as pd
# Read the dataset CSV (machine-specific absolute Windows path) into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="C:\\Users\\madhu\\OneDrive\\Desktop\\Sports_analysis_capstone\\new_dataset.csv"
)

def data_validation(df, output_path="DataValidated_dataset.csv"):
    """Run basic data-quality checks on ``df`` and return them as a report.

    The report contains, in order: missing values, duplicate rows, IQR-based
    outlier counts per numeric column, column dtypes, an ``Age`` range check
    (0 to 120 inclusive) and the number of unique values per column.

    Args:
        df: The pandas DataFrame to validate.
        output_path: Destination of the CSV copy of the report. Defaults to
            ``"DataValidated_dataset.csv"`` (relative to the current working
            directory). Pass ``None`` to skip writing a file.

    Returns:
        A DataFrame with one row per check and two columns: ``Validation``
        (the check name) and ``Details`` (a summary string, or a dict keyed
        by column name).
    """
    validation_results = []

    # Missing values. The percentage is taken over every cell in the frame;
    # summing per-column percentages would overstate it by a factor of the
    # column count and could exceed 100%.
    missing_total = int(df.isnull().sum().sum())
    total_cells = df.size
    # An empty frame has no cells, so report 0% instead of dividing by zero.
    missing_percentage = (missing_total / total_cells) * 100 if total_cells else 0.0
    validation_results.append({
        'Validation': 'Missing Values',
        'Details': f'{missing_total} missing values ({missing_percentage:.2f}% of total data)'
    })

    # Fully duplicated rows (first occurrence is not counted).
    duplicates = int(df.duplicated().sum())
    validation_results.append({
        'Validation': 'Duplicates',
        'Details': f'{duplicates} duplicate rows'
    })

    # Outliers via the 1.5 * IQR rule. 'number' covers every numeric width
    # (int32, float32, unsigned, ...) rather than only int64/float64.
    outlier_results = {}
    for column in df.select_dtypes(include='number').columns:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # Plain int keeps the dict readable once it is serialized to CSV.
        outlier_results[column] = int(((values < lower_bound) | (values > upper_bound)).sum())

    validation_results.append({
        'Validation': 'Outliers',
        'Details': outlier_results
    })

    # Column dtypes, keyed by column name.
    column_types = df.dtypes.to_dict()
    validation_results.append({
        'Validation': 'Column Types',
        'Details': column_types
    })

    # Age range check (bounds inclusive); only applicable if the column exists.
    if 'Age' in df.columns:
        age_check = df['Age'].between(0, 120).all()
        age_check_result = "Yes, the age column values lie between 0 and 120" if age_check else "No, some age column values do not lie between 0 and 120"
    else:
        age_check_result = "Age column is not present in the DataFrame"

    validation_results.append({
        'Validation': 'Age Range',
        'Details': age_check_result
    })

    # Number of distinct values per column.
    unique_values = {column: df[column].nunique() for column in df.columns}
    validation_results.append({
        'Validation': 'Unique Values',
        'Details': unique_values
    })

    validation_df = pd.DataFrame(validation_results)

    # Persist the report unless the caller opted out.
    if output_path is not None:
        validation_df.to_csv(output_path, index=False)

    return validation_df



# Run every validation check on the loaded DataFrame and display the report.
validation_results = data_validation(df=df)
print(validation_results)


       Validation                                            Details
0  Missing Values             0 missing values (0.00% of total data)
1      Duplicates                                   0 duplicate rows
2        Outliers  {'Played_Id': 0, 'Age': 0, 'Assists': 0, 'Yell...
3    Column Types  {'Played_Id': int64, 'Player': object, 'Team':...
4       Age Range   Yes, the age column values lie between 0 and 120
5   Unique Values  {'Played_Id': 5000, 'Player': 4, 'Team': 3, 'A...
