!!! will have to recheck if all cells have proper logging !!!

In [4]:
# Import necessary libraries
import pandas as pd
import logging
import numpy as np

### Loading the Sampled Data

In [2]:
# Set up logging configuration
logging.basicConfig(filename='data_validation.log', 
                    level=logging.DEBUG, 
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Create a custom logger for this notebook
logger = logging.getLogger('DataValidation')

# Function to log exceptions
def log_exception(e):
    logger.exception("An error occurred: %s", str(e))

# Log the start of the data validation process
logger.info('Starting the data validation process.')

# Function to load and validate the data schema
def load_and_validate_data(file_path):
    """
    Loads the CSV file and prints out the data schema.

    :param file_path: Path to the CSV file.
    :return: Pandas DataFrame containing the data.
    """
    try:
        # Load the data
        logger.info('Loading data from %s', file_path)
        df = pd.read_csv(file_path)

        # Log and display the first few rows of the dataframe
        logger.info('Data loaded successfully. Displaying the first few rows.')
        display(df.head())
        
        # Log and display the schema (column names and data types)
        logger.info('Data schema: %s', df.dtypes)
        display(df.dtypes)

        return df
    except Exception as e:
        # Log any errors that occur during data loading
        log_exception(e)

In [3]:
# Call the function to load the data
file_path = '/Users/vallimeenaa/Desktop/Group 3 Project/Data Collection/sampled_2020_df.csv'
df = load_and_validate_data(file_path)

Unnamed: 0,review_month,rating,parent_asin,asin,helpful_vote,text,timestamp,title,user_id,verified_purchase,review_date_timestamp,main_category,product_name,categories,price,average_rating,rating_number,year
0,2,1.0,B001DTE5Q2,B001DTE5Q2,0,Very flimsy hooks. Purses fall off constantly....,1581880140993,Waste of money,AFAHC6E2UT4DJT4E2GBGNE5C2LPQ,True,2020-02-16 14:09:00,Amazon Home,Perfect Curve Purse Rack HPC | Over The Door P...,"Home & Kitchen,Storage & Organization,Clothing...",,4.1,1973,2020
1,2,1.0,B003Z3URR0,B001G3ZP8W,1,This incense has no scent. Also does not burn ...,1582731248091,No scent incense.,AHB5U3ZMDUW7SKKDGVIKAOVCFY5Q,True,2020-02-26 10:34:08,Amazon Home,"Hem Precious Musk Fragrance Incense Sticks, 12...","Home & Kitchen,Home Décor Products,Home Fragra...",6.98,4.6,1229,2020
2,2,1.0,B00PILE4SK,B00FZ1JR6M,0,"This is a good brand, and for sure I'll buy it...",1582822571741,I paid for a new model and received a used one,AGIEN7AW4DV6OMUK25PCPFZAU6KQ,True,2020-02-27 11:56:11,Amazon Home,KitchenAid KSB1575BU 5-Speed Diamond Blender w...,"Home & Kitchen,Kitchen & Dining,Small Applianc...",,4.5,3905,2020
3,2,1.0,B00U8QGU32,B00Q7OFEM2,1,Was overall pretty happy with this comforter u...,1580683343441,Careful when washing,AGEWDSIDG4LO3ZM26G74F7MVKNHQ,True,2020-02-02 17:42:23,Amazon Home,Amazon Basics Reversible Lightweight Microfibe...,"Home & Kitchen,Bedding,Comforters & Sets",36.95,4.5,40403,2020
4,2,1.0,B078MPFN55,B078MPFN55,0,This product is a piece of junk. Since I firs...,1580923330717,Get A Roomba instead.,AHHKVMSOCT6J6Q3US2WDMSXK3TKQ,True,2020-02-05 12:22:10,Amazon Home,Neato Robotics Botvac D7 Connected Robot Vacuu...,"Home & Kitchen,Vacuums & Floor Care,Vacuums,Ro...",395.51,4.0,2821,2020


review_month               int64
rating                   float64
parent_asin               object
asin                      object
helpful_vote               int64
text                      object
timestamp                  int64
title                     object
user_id                   object
verified_purchase           bool
review_date_timestamp     object
main_category             object
product_name              object
categories                object
price                    float64
average_rating           float64
rating_number              int64
year                       int64
dtype: object

### General/Initial Validation Checks 
Duplicate records, valid data in the numerical columns, outliers in the numerical columns, incorrect data types

In [7]:
# Function to perform general validation checks
def validate_data_initial(df):
    """
    Performs extended data validation checks, including:
    - Ensuring numerical columns contain valid data.
    - Checking for outliers in numerical columns.
    - Checking for incorrect data types.
    
    :param df: Pandas DataFrame containing the data.
    :return: None
    """
    try:
        logger.info('Performing general validation checks.')

        # Check for invalid numerical values (e.g., negative prices)
        if 'price' in df.columns:
            invalid_prices = df[df['price'] < 0]
            if not invalid_prices.empty:
                logger.warning('Found invalid prices: %s', invalid_prices)
            else:
                logger.info('No invalid prices found.')
            display(invalid_prices)

        # Check for incorrect data types
        logger.info('Checking for incorrect data types.')

        # we need to expand/reduce this list to cover/remove any other columns that might be crucial for analysis or model training
        expected_types = {
            'review_month': 'int64',
            'rating': 'float64',
            'parent_asin': 'object',
            'asin': 'object',
            'helpful_vote': 'int64',  # assuming it's an integer
            'text': 'object',  # string
            'timestamp': 'int64',  # Unix timestamp; can also be datetime
            'title': 'object',  # string
            'user_id': 'object',  # string
            'verified_purchase': 'object',  # typically 'TRUE' or 'FALSE', can be bool or object
            'review_date_timestamp': 'datetime64[ns]',  # if parsing dates correctly
            'main_category': 'object',  # string
            'product_name': 'object',  # string
            'categories': 'object',  # string or list of strings
            'price': 'float64',
            'average_rating': 'float64',
            'rating_number': 'int64',  # assuming count
            'year': 'int64'
        }

        for col, expected_type in expected_types.items():
            if col in df.columns:
                if df[col].dtype != expected_type:
                    logger.warning(f'Column {col} has incorrect type: {df[col].dtype}. Expected: {expected_type}')
                    display(f'Incorrect type for column {col}: {df[col].dtype}')
                else:
                    logger.info(f'Column {col} has correct type: {expected_type}')

        # Convert 'timestamp' to datetime for easier analysis
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
            logger.info('Converted timestamp to datetime.')

        # Check for outliers in numerical columns (using IQR)
        numerical_columns = ['price', 'average_rating', 'rating']
        for col in numerical_columns:
            if col in df.columns:
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
                
                if not outliers.empty:
                    logger.warning(f'Outliers detected in column {col}: %s', outliers)
                    display(f'Outliers in {col}', outliers)
                else:
                    logger.info(f'No outliers detected in column {col}.')
        
        logger.info('Extended data validation completed successfully.')

    except Exception as e:
        log_exception(e)

validate_data_initial(df)

# Review the logs in the data_validation.log file, which will include all validation steps, 
# issues found (like incorrect data types and outliers), and the handling of exceptions.

review_month                 0
rating                       0
parent_asin                  0
asin                         0
helpful_vote                 0
text                        30
timestamp                    0
title                       37
user_id                      0
verified_purchase            0
review_date_timestamp        0
main_category             2504
product_name                 3
categories                4476
price                    29973
average_rating               0
rating_number                0
year                         0
dtype: int64

np.int64(20)

Unnamed: 0,review_month,rating,parent_asin,asin,helpful_vote,text,timestamp,title,user_id,verified_purchase,review_date_timestamp,main_category,product_name,categories,price,average_rating,rating_number,year


'Incorrect type for column timestamp: datetime64[ns]'

'Incorrect type for column verified_purchase: bool'

'Incorrect type for column review_date_timestamp: object'

'Outliers in price'

Unnamed: 0,review_month,rating,parent_asin,asin,helpful_vote,text,timestamp,title,user_id,verified_purchase,review_date_timestamp,main_category,product_name,categories,price,average_rating,rating_number,year
4,2,1.0,B078MPFN55,B078MPFN55,0,This product is a piece of junk. Since I firs...,2020-02-05 17:22:10.717,Get A Roomba instead.,AHHKVMSOCT6J6Q3US2WDMSXK3TKQ,True,2020-02-05 12:22:10,Amazon Home,Neato Robotics Botvac D7 Connected Robot Vacuu...,"Home & Kitchen,Vacuums & Floor Care,Vacuums,Ro...",395.51,4.0,2821,2020
8,2,1.0,B08W3NLPL3,B076J1K34N,1,i'm not a very large guy and haven't been too ...,2020-02-10 01:53:37.756,4 months and it's already shot,AE234C7W4LUKNW3J5TPLKR3ICJEQ,True,2020-02-09 20:53:37,Amazon Home,"RESPAWN 200 Racing Style Gaming Chair, adjusta...","Home & Kitchen,Furniture,Game & Recreation Roo...",260.99,4.5,288,2020
11,2,1.0,B00YCH235C,B00YCH235C,5,Really? I paid a lot of money for a quality pr...,2020-02-29 21:22:43.082,poor quality.,AGEFMSSXR7OWRXR7G6QOQYOQLK3A,True,2020-02-29 16:22:43,Amazon Home,All-Clad D3 3-Ply Stainless Steel and Nonstick...,"Home & Kitchen,Kitchen & Dining,Cookware,Pots ...",159.95,4.5,466,2020
12,2,1.0,B07CBLGGW5,B076J9M6RW,0,Returned did not fit. Still looking for one th...,2020-02-17 20:20:40.710,Did not fit. Glue handle in the way.,AGDDE5QHZPF7BBMX7M7A4IQKASMQ,True,2020-02-17 15:20:40,Amazon Home,"ClassicFlame 26"" 3D Infrared Quartz Electric F...","Home & Kitchen,Heating, Cooling & Air Quality,...",249.99,4.6,1170,2020
20,2,1.0,B0BR2L3Z7P,B07S5R3HHV,24,First of all it’s extremely noisy! I read no n...,2020-02-28 17:29:32.669,Extremely Noisy!,AGT7A53OTPVV76DTLXS5YP7ZEZCQ,True,2020-02-28 12:29:32,Amazon Home,Meat Slicer 200W Electric Deli Food Slicer wit...,"Home & Kitchen,Kitchen & Dining,Small Applianc...",101.99,4.4,7714,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105162,6,2.0,B0BL2N9XB4,B01AC5F1SI,0,The plant does not look as real as. Had expect...,2020-06-04 21:29:34.336,Really artificial.,AEI6E2MXZAFAXPHWLL56ZPHTXOUQ,True,2020-06-04 17:29:34,Amazon Home,Pure Garden 50-10016 Giant Agave Fake Plant-52...,"Home & Kitchen,Home Décor Products,Artificial ...",119.95,4.7,120,2020
105163,6,2.0,B0787Y17P2,B07Q5XRMLD,1,Only had this 1 day and the heater and massage...,2020-06-20 15:00:04.355,Uhhh ohhh,AERZJXWQGHOAMI5ULG2WRBMXJY4A,True,2020-06-20 11:00:04,Amazon Home,MCombo Electric Power Lift Recliner Chair Sofa...,"Home & Kitchen,Furniture,Living Room Furniture...",599.90,4.5,3785,2020
105167,6,2.0,B01K5D5ALU,B008H4SLV6,4,"From the start, we just didn't see that this d...",2020-06-02 11:37:15.560,I would not buy this again,AES2RKZ4PJNVEHY5KRW2JJTRFNEA,False,2020-06-02 07:37:15,Amazon Home,"Vitamix 64 oz. Container, 5200 Blender, Profes...","Home & Kitchen,Kitchen & Dining,Small Applianc...",549.99,4.7,6741,2020
105168,6,2.0,B07CRN64ZR,B07CRN64ZR,0,The shelves are great but I did not receive th...,2020-06-16 00:03:38.660,Missing hardware,AGA63IBZXPOX6SINPVXYILMFVBFQ,True,2020-06-15 20:03:38,Amazon Home,"Wallniture Denver 46"" White Floating Shelves f...","Home & Kitchen,Home Décor Products,Home Décor ...",104.73,4.7,340,2020


'Outliers in average_rating'

Unnamed: 0,review_month,rating,parent_asin,asin,helpful_vote,text,timestamp,title,user_id,verified_purchase,review_date_timestamp,main_category,product_name,categories,price,average_rating,rating_number,year
5,2,1.0,B07PQLLRSK,B01M5IYH11,0,This set looked really nice in the picture but...,2020-02-18 03:57:10.099,Don’t buy..bad quality!,AG535LRXQUGO6A2OIYEJQCJBGSSQ,True,2020-02-17 22:57:10,,Amrapur Overseas | Ella 24-Piece Pintuck Comfo...,"Home & Kitchen,Bedding,Comforters & Sets,Comfo...",,3.7,75,2020
6,2,1.0,B07YD9P72T,B07YD9P72T,1,This item is not stainless steel i got mine an...,2020-02-21 22:10:35.930,Dont buy its not stainless steel,AFAW2PDHERRWTRA7YTYPLXPFNJ2Q,True,2020-02-21 17:10:35,,"Dish Drying Rack, 2 Tier Stainless Steel Dish ...","Home & Kitchen,Kitchen & Dining,Storage & Orga...",,3.2,17,2020
13,2,1.0,B07FCWQ8V2,B07FCWQ8V2,1,The silverware was not cut smoothly so it's no...,2020-02-15 12:42:01.016,Looks cool but isn't,AFXMXR6YJYY63NT4QEFRANUS7PGQ,True,2020-02-15 07:42:01,Amazon Home,"Matte-Black-Silverware-Set, 45 Piece Stainless...","Home & Kitchen,Kitchen & Dining,Dining & Enter...",,3.4,757,2020
14,2,1.0,B07HD4SN93,B07HD4SN93,0,Lo compré pagué $20.00 tardo casi un mes la en...,2020-02-05 05:24:54.542,No trabaja,AGE52RVBIRQP7KOPBLKSN2OFHDQQ,False,2020-02-05 00:24:54,Tools & Home Improvement,Corner Storage Holder Shelves,"Home & Kitchen,Home Décor Products,Home Décor ...",,3.1,23,2020
15,2,1.0,B07T3QJH1G,B07T3QJH1G,0,"I never write reviews, however this clock is a...",2020-02-13 14:39:49.910,Terrible,AH4QCHZWT737SQBKXRXP6T2FGKVA,True,2020-02-13 09:39:49,Amazon Home,"PEMOTech Projection Alarm Clocks for Bedrooms,...","Home & Kitchen,Home Décor Products,Clocks,Spec...",,3.7,283,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105177,6,2.0,B0C5DY4HLP,B01CPAL5VK,0,This is literally tiny. Smaller than the chair...,2020-06-15 19:43:32.953,Very disappointing,AH5PPAM4L62X67SD5EJMXOWOXXXA,True,2020-06-15 15:43:32,Amazon Home,Furinno Turn-N-Tube No Tools 2-Tier Elevated T...,"Home & Kitchen,Furniture,Living Room Furniture...",18.96,3.6,10062,2020
105181,6,2.0,B07QNPLKZF,B07QNPLKZF,0,Very small and cheap! Pain to put together!,2020-06-06 12:28:03.924,Not worth it!,AFJPMNZ45LKHY2DKYQI5FRS65XKA,True,2020-06-06 08:28:03,Amazon Home,JS HOME 4-Tier Kitchen Storage Rack Wire Shelv...,"Home & Kitchen,Storage & Organization,Racks, S...",,3.6,159,2020
105193,6,2.0,B07S38NNC5,B07S38NNC5,0,Bought a refurbished one said it was fixed and...,2020-06-28 18:12:42.452,Big dent was happy to return it,AFZSF4ZLGYZJGTEPGZXUAKXF7O5A,True,2020-06-28 14:12:42,Amazon Home,Farberware 201362 4-Slice Waffle Maker One Siz...,"Home & Kitchen,Kitchen & Dining,Small Applianc...",,3.0,6,2020
105213,6,2.0,B07MV2N9BP,B07MM661CX,0,This looks really nice and works pretty well i...,2020-06-15 21:00:34.712,"Looks nice, breaks down quickly",AGG6FHTYFH5LLUDQDN55FBPM26OQ,True,2020-06-15 17:00:34,Amazon Home,Vornado HELIX2 Personal Tower Fan with 2 Speed...,"Home & Kitchen,Heating, Cooling & Air Quality,...",40.02,3.3,1185,2020


'Outliers in rating'

Unnamed: 0,review_month,rating,parent_asin,asin,helpful_vote,text,timestamp,title,user_id,verified_purchase,review_date_timestamp,main_category,product_name,categories,price,average_rating,rating_number,year
0,2,1.0,B001DTE5Q2,B001DTE5Q2,0,Very flimsy hooks. Purses fall off constantly....,2020-02-16 19:09:00.993,Waste of money,AFAHC6E2UT4DJT4E2GBGNE5C2LPQ,True,2020-02-16 14:09:00,Amazon Home,Perfect Curve Purse Rack HPC | Over The Door P...,"Home & Kitchen,Storage & Organization,Clothing...",,4.1,1973,2020
1,2,1.0,B003Z3URR0,B001G3ZP8W,1,This incense has no scent. Also does not burn ...,2020-02-26 15:34:08.091,No scent incense.,AHB5U3ZMDUW7SKKDGVIKAOVCFY5Q,True,2020-02-26 10:34:08,Amazon Home,"Hem Precious Musk Fragrance Incense Sticks, 12...","Home & Kitchen,Home Décor Products,Home Fragra...",6.98,4.6,1229,2020
2,2,1.0,B00PILE4SK,B00FZ1JR6M,0,"This is a good brand, and for sure I'll buy it...",2020-02-27 16:56:11.741,I paid for a new model and received a used one,AGIEN7AW4DV6OMUK25PCPFZAU6KQ,True,2020-02-27 11:56:11,Amazon Home,KitchenAid KSB1575BU 5-Speed Diamond Blender w...,"Home & Kitchen,Kitchen & Dining,Small Applianc...",,4.5,3905,2020
3,2,1.0,B00U8QGU32,B00Q7OFEM2,1,Was overall pretty happy with this comforter u...,2020-02-02 22:42:23.441,Careful when washing,AGEWDSIDG4LO3ZM26G74F7MVKNHQ,True,2020-02-02 17:42:23,Amazon Home,Amazon Basics Reversible Lightweight Microfibe...,"Home & Kitchen,Bedding,Comforters & Sets",36.95,4.5,40403,2020
4,2,1.0,B078MPFN55,B078MPFN55,0,This product is a piece of junk. Since I firs...,2020-02-05 17:22:10.717,Get A Roomba instead.,AHHKVMSOCT6J6Q3US2WDMSXK3TKQ,True,2020-02-05 12:22:10,Amazon Home,Neato Robotics Botvac D7 Connected Robot Vacuu...,"Home & Kitchen,Vacuums & Floor Care,Vacuums,Ro...",395.51,4.0,2821,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105228,6,2.0,B01ITE7948,B01ITE7948,0,I didn't keep this around long. After only a c...,2020-06-04 14:43:16.099,Curls up on the edges and becomes a trip hazard,AGFI6ESPJZMQI5ZN6YIIUHUO7VEQ,True,2020-06-04 10:43:16,Amazon Home,casa pura Kitchen Mat | Anti-Fatigue Standing ...,"Home & Kitchen,Kitchen & Dining,Kitchen Utensi...",,4.6,273,2020
105229,6,2.0,B09C9ZP156,B01FFYKQ5Y,0,Not at all like advertised. Doesn't chop. Nuts...,2020-06-25 16:54:29.132,Doesn't chop very well,AGVHU2LEJ466Z6WIAGGJLGJQWN2Q,True,2020-06-25 12:54:29,Amazon Home,Original Slap Chop Slicer with Stainless Steel...,"Home & Kitchen,Kitchen & Dining,Kitchen Utensi...",29.95,3.7,2089,2020
105230,6,2.0,B06XGN2J7P,B06XGN2J7P,1,Product pops up in the middle so all the food ...,2020-07-01 00:41:18.642,Not what I expected,AHGPVXYKVUEFJXFNMPCFXW2MQTKA,True,2020-06-30 20:41:18,Amazon Home,MT Products Cafeteria Food Tray White Rectangu...,"Home & Kitchen,Kitchen & Dining,Dining & Enter...",26.99,4.4,239,2020
105231,6,2.0,B07MNXFCCD,B07MNXFCCD,0,"I have to hand it to these spoon, they met the...",2020-06-24 16:50:45.709,Enjoy splinters and want your food tasting lik...,AHFSXMBSFDFSFBE6TXOJMIUTWW5A,True,2020-06-24 12:50:45,Amazon Home,Klee Utensils 3-Piece Natural Wood Kitchen Spo...,"Home & Kitchen,Kitchen & Dining,Kitchen Utensi...",6.29,4.2,59,2020


how to treat/remove outliers?

### Range Check

In [8]:
def perform_range_checks(df):
    """
    Performs range checks on numerical and text fields in the DataFrame.
    
    :param df: Pandas DataFrame containing the data.
    :return: None
    """
    logger.info('Performing range checks on numerical and text fields.')

    # Range check for ratings
    invalid_ratings = df[(df['rating'] < 1) | (df['rating'] > 5)]
    if not invalid_ratings.empty:
        logger.warning(f'Found {len(invalid_ratings)} invalid ratings outside the range 1-5.')
        display(invalid_ratings[['rating', 'text']])
    else:
        logger.info('All ratings are within the valid range (1-5).')

    # Range check for helpful votes
    negative_helpful_votes = df[df['helpful_vote'] < 0]
    if not negative_helpful_votes.empty:
        logger.warning(f'Found {len(negative_helpful_votes)} records with negative helpful votes.')
        display(negative_helpful_votes[['helpful_vote', 'text']])
    else:
        logger.info('All helpful votes are non-negative.')

    # Text field checks for empty or overly short reviews
    # decide on a limit for minimum review length, this is just temporary
    min_review_length = 10  # minimum length for review text
    short_reviews = df[df['text'].str.len() < min_review_length]
    if not short_reviews.empty:
        logger.warning(f'Found {len(short_reviews)} reviews that are shorter than {min_review_length} characters.')
        display(short_reviews[['text']])
    else:
        logger.info('All reviews meet the minimum length requirement.')

    # Text field check for title
    short_titles = df[df['title'].str.len() < min_review_length]
    if not short_titles.empty:
        logger.warning(f'Found {len(short_titles)} titles that are shorter than {min_review_length} characters.')
        display(short_titles[['title']])
    else:
        logger.info('All titles meet the minimum length requirement.')


### Data Completedness Function
Missing values, duplicates

In [10]:
def check_data_completeness(df):
    """
    Checks the completeness of the DataFrame by validating for missing values and null entries.
    
    :param df: Pandas DataFrame containing the data.
    :return: None
    """
    logger.info('Checking data completeness for missing values and null entries.')

    # Check for missing values in each column
    missing_values = df.isnull().sum()

    # Log the missing values
    for column, count in missing_values.items():
        if count > 0:
            logger.warning(f'Column "{column}" has {count} missing values.')
        else:
            logger.info(f'Column "{column}" has no missing values.')

    # Optional: Summarize total missing values
    total_missing = missing_values.sum()
    if total_missing > 0:
        logger.warning(f'Total missing values in the DataFrame: {total_missing}.')
        # You can also choose to drop or impute missing values here
        # Example of dropping missing values
        # df = df.dropna()
        # logger.info('Dropped rows with missing values.')
    else:
        logger.info('No missing values found in the DataFrame.')

    # Check for duplicates
    duplicates = df.duplicated().sum()
    if duplicates > 0:
        logger.warning(f'Total duplicate entries found: {duplicates}.')
        # Optional: Drop duplicates
        # df = df.drop_duplicates()
        # logger.info('Dropped duplicate entries.')
    else:
        logger.info('No duplicate entries found in the DataFrame.')


Suggestions to drop or impute missing values and drop duplicates are commented out, have to check.
We can customize the actions for handling missing values based on the requirements of your project. This may include:
- Imputation: Filling missing values with a specific statistic (mean, median, mode, etc.).
- Exclusion: Removing rows with missing values.

### Consistency Checks Function

In [12]:
def check_data_consistency(df):
    """
    Checks the consistency of the DataFrame across different fields.
    
    :param df: Pandas DataFrame containing the data.
    :return: None
    """
    logger.info('Checking data consistency across different fields.')

    # Check for consistency in ASIN for products with multiple reviews
    if 'asin' in df.columns:
        inconsistent_asin_count = df.groupby('asin').filter(lambda x: len(x) > 1)['asin'].nunique()
        if inconsistent_asin_count > 0:
            logger.warning(f'Found {inconsistent_asin_count} products with inconsistent ASINs across multiple reviews.')
            display(df[df.duplicated('asin', keep=False)])  # Displaying inconsistent ASINs
        else:
            logger.info('All ASINs are consistent across reviews.')

    # Check for expected values in categorical fields
    if 'verified_purchase' in df.columns:
        expected_verified_purchase_values = ['TRUE', 'FALSE']  # assuming string format
        invalid_verified_purchase = df[~df['verified_purchase'].isin(expected_verified_purchase_values)]
        
        if not invalid_verified_purchase.empty:
            logger.warning(f'Found {len(invalid_verified_purchase)} records with invalid verified_purchase values.')
            display(invalid_verified_purchase[['verified_purchase', 'text']])
        else:
            logger.info('All verified_purchase values are valid.')

    # Additional categorical field checks can be added here (if applicable)


Expand the function to include checks for other categorical fields or consistency checks relevant to our dataset.

What to do with inconsistent ASINs or invalid verified_purchase values?

### Data Imbalance Detection

In [16]:
import seaborn as sns
import matplotlib.pyplot as plt

def detect_data_imbalance(df):
    """
    Detects and visualizes the imbalance in the target variable (e.g., ratings) to identify class imbalance issues.
    
    :param df: Pandas DataFrame containing the data.
    :return: None
    """
    logger.info('Checking for class imbalance in the ratings.')

    if 'rating' in df.columns:
        # Count the occurrences of each rating
        rating_counts = df['rating'].value_counts()

        # Log and display the class distribution
        logger.info('Class distribution of ratings:\n%s', rating_counts)
        display(rating_counts)

        # Visualize the imbalance
        plt.figure(figsize=(8, 5))
        sns.barplot(x=rating_counts.index, y=rating_counts.values, palette="viridis")
        plt.title("Class Distribution of Ratings")
        plt.xlabel("Rating")
        plt.ylabel("Number of Reviews")
        plt.show()

        # Optionally, define a threshold for imbalance and flag it
        imbalance_threshold = 0.1  # Example: A class should not be less than 10% of the total
        total_reviews = len(df)
        minority_classes = rating_counts[rating_counts < imbalance_threshold * total_reviews]

        if not minority_classes.empty:
            logger.warning('Detected imbalance in the following classes:\n%s', minority_classes)
        else:
            logger.info('No significant class imbalance detected.')
    else:
        logger.error('Column "rating" not found in the DataFrame.')

what to do, given there is class imbalance? in the data_validation log, classes 3 and 2 seem to be imbalanced according to the   threshold (10%) set in the above code.

can easily modify this function to detect imbalances in other categorical columns or sentiment labels (???)

to handle class imbalance:
- Oversampling/Undersampling: Add more samples to the minority class (e.g., using SMOTE) or reduce samples from the majority class.
- Weighted Loss Functions: Modify your model's loss function to give more importance to the minority class during training.

### Data Privacy Compliance Check

focus on detecting PII and ensuring that sensitive information is either anonymized or excluded

In [29]:
import hashlib
import re

def detect_pii(df):
    """
    Detects potential PII fields in the DataFrame and anonymizes or flags them.

    :param df: Pandas DataFrame containing the data.
    :return: DataFrame with anonymized PII fields where necessary.
    """
    logging.info("Starting PII detection process.")

    # Step 1: Anonymize user_id (if necessary for compliance)
    if 'user_id' in df.columns:
        df['user_id'] = df['user_id'].apply(lambda x: hashlib.sha256(str(x).encode()).hexdigest())
        logging.info("user_id anonymized.")

    # Step 2: Flag review_text for potential PII patterns like emails, phone numbers
    def flag_pii_in_text(text):
        if pd.isna(text):  # Check if the input is NaN
            return 'No PII detected'  # Handle NaN gracefully

        email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
        phone_pattern = r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
        
        if re.search(email_pattern, text) or re.search(phone_pattern, text):
            logging.warning(f"Potential PII detected in text: {text}")
            return 'Potential PII detected'
        
        return 'No PII detected'
    
    if 'text' in df.columns:
        df['pii_flag'] = df['text'].apply(flag_pii_in_text)
        logging.info("PII detection in review_text completed.")

    # # Step 3: Handle timestamps (optional, if needed)
    # if 'timestamp' in df.columns:
    #     df['timestamp'] = df['timestamp'].apply(lambda x: round(x, -4))  # Example of rounding for privacy
    #     logging.info("Timestamp anonymization completed.")

    logging.info("PII detection process completed.")
    return df


If the review texts start flagging issues or require deeper scrutiny, we can improve the regex patterns or add checks for things like addresses

### Main Function to Execute All Validation Checks

In [30]:
def main_validation(df):
    """
    Main function to execute all validation steps.
    
    :param df: Pandas DataFrame containing the data.
    :return: None
    """
    logging.info("Starting main validation process.")

    # validate_data_initial(df)
    # perform_range_checks(df)
    # check_data_completeness(df)
    # check_data_consistency(df)
    # detect_data_imbalance(df)
    detect_pii(df)

    logging.info("Main validation process completed.")

# Load your data here
# df = pd.read_csv('your_data.csv')

# Run the validation
main_validation(df)
