In [None]:
!pip install --upgrade 'tensorflow_data_validation[visualization]<2'

In [1]:
import tensorflow as tf
import tensorflow_data_validation as tfdv
import pandas as pd

## original dataset

In [None]:
user_reviews = []
with open("/content/australian_user_reviews.json") as f:
    for line in f:
        json_data = eval(line)

        user_id = json_data['user_id']
        user_url = json_data['user_url']
        # Process each review
        for review in json_data['reviews']:
            row = {
                'user_id': user_id,
                'user_url': user_url,
                'item_id': review['item_id'],
                'review': review['review'],
                'recommend': review['recommend'],
                'posted': review['posted'],
                'last_edited': review['last_edited'],
                'funny': review['funny'],
                'helpful': review['helpful'],
            }
            user_reviews.append(row)

user_reviews = pd.DataFrame(user_reviews)

In [None]:
user_reviews_stats = tfdv.generate_statistics_from_dataframe(user_reviews)
tfdv.visualize_statistics(user_reviews_stats)

In [3]:
class DataQualityEvaluator:
    def __init__(self):
        pass

    def analyze_dataset(self, data):
        # Generate and return descriptive statistics for the dataset
        stats = tfdv.generate_statistics_from_dataframe(data)
        return stats

    def evaluate_data_quality(self, data):
        # Generate and return a data quality evaluation report
        # This report includes checks for missing values, anomalies, and duplicates
        stats = tfdv.generate_statistics_from_dataframe(data)
        schema = tfdv.infer_schema(stats)
        anomalies = tfdv.validate_statistics(stats, schema)
        return anomalies

if __name__ == "__main__":
    # Example usage
    evaluator = DataQualityEvaluator()

    # Load your dataset into a pandas DataFrame
    data = pd.read_csv("/content/sample_data/california_housing_train.csv")
    #data = user_reviews

    # Perform dataset analysis
    stats = evaluator.analyze_dataset(data)
    print("Dataset Descriptive Statistics:")
    print(stats)

    # Perform data quality evaluation
    anomalies = evaluator.evaluate_data_quality(data)
    print("Data Quality Evaluation Report:")
    print(anomalies)

Dataset Descriptive Statistics:
datasets {
  num_examples: 17000
  features {
    type: FLOAT
    num_stats {
      common_stats {
        num_non_missing: 17000
        min_num_values: 1
        max_num_values: 1
        avg_num_values: 1.0
        num_values_histogram {
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 1700.0
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 1700.0
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 1700.0
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 1700.0
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 1700.0
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 1700.0
          }
       

In [None]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

X = user_reviews
def classify_column_types(df):
    numerical_columns = []
    categorical_columns = []
    datetime_columns = []
    short_text_columns = []
    long_text_columns = []

    unique_threshold = 0.2  # Threshold for unique values to consider a column categorical
    word_count_threshold = 30  # Threshold for word count to distinguish between short and long text

    # First, identify columns that can be parsed as datetime
    for col in df.columns:
        # Sample non-empty values for efficiency
        sample_size = min(1000, len(df[col].dropna()))
        sample_data = df[col].dropna().sample(n=sample_size)

        converted_sample = pd.to_datetime(sample_data, errors='coerce')
        # If all sampled non-empty values can be parsed as datetime, consider the column as datetime
        if not converted_sample.isna().any():
            if sample_data.dtype in ['int64', 'float64']:
                # For numeric columns, check if values are in the typical Unix timestamp range
                if sample_data.between(1e9, 2e9).any() and "time" in col.lower():
                    datetime_columns.append(col)
                    continue  # Skip further checks for this column
            else:
                # Non-numeric columns parsed as datetime are added to datetime columns
                datetime_columns.append(col)
                continue

        # For non-datetime columns, determine other types
        if df[col].dtype in ['int64', 'float64']:
            # Determine if a numeric column is categorical based on unique values and threshold
            proportion_unique = df[col].nunique() / df[col].notnull().sum()
            if df[col].nunique() <= 100 and proportion_unique <= unique_threshold:
                categorical_columns.append(col)
            else:
                numerical_columns.append(col)

        elif df[col].dtype in ['bool', 'category']:
            categorical_columns.append(col)

        elif df[col].dtype in ['object', 'string']:
            df[col] = df[col].astype(str).replace('nan', np.nan)  # Ensure all data is string for analysis
            sample_data = sample_data.astype(str)  # Ensure sample data is string for word count
            proportion_unique = df[col].nunique() / df[col].notnull().sum()
            if df[col].nunique() <= 100 and proportion_unique <= unique_threshold:
                categorical_columns.append(col)
            else:
                # Calculate word count for text analysis
                word_counts = sample_data.str.split().str.len().fillna(0)
                max_word_count = word_counts.max()
                if max_word_count <= word_count_threshold:
                    short_text_columns.append(col)
                else:
                    long_text_columns.append(col)

    return numerical_columns, categorical_columns, datetime_columns, short_text_columns, long_text_columns


numerical_columns, categorical_columns, datetime_columns, short_text_columns, long_text_columns = classify_column_types(X)
text_columns = short_text_columns + long_text_columns
print("Numerical columns: ", numerical_columns, "\n")
print("Categorical columns: ", categorical_columns, "\n")
print("DateTime columns:", datetime_columns, "\n")
print("Short text columns:", short_text_columns, "\n")
print("Long text columns:", long_text_columns, "\n")
print("Text columns: ", text_columns, "\n")
print("Data after classifying columns:\n", X.tail(10), "\n")

Numerical columns:  [] 

Categorical columns:  ['recommend'] 

DateTime columns: [] 

Short text columns: ['user_id', 'user_url', 'item_id', 'posted', 'last_edited', 'funny', 'helpful'] 

Long text columns: ['review'] 

Text columns:  ['user_id', 'user_url', 'item_id', 'posted', 'last_edited', 'funny', 'helpful', 'review'] 

Data after classifying columns:
                  user_id                                           user_url  \
59295  76561198306599751  http://steamcommunity.com/profiles/76561198306...   
59296           Ghoustik              http://steamcommunity.com/id/Ghoustik   
59297  76561198310819422  http://steamcommunity.com/profiles/76561198310...   
59298  76561198312638244  http://steamcommunity.com/profiles/76561198312...   
59299  76561198312638244  http://steamcommunity.com/profiles/76561198312...   
59300  76561198312638244  http://steamcommunity.com/profiles/76561198312...   
59301  76561198312638244  http://steamcommunity.com/profiles/76561198312...   
59302   

## corruption

In [None]:
# Randomly insert duplicates
def insert_duplicates(df):
    # Add a column to preserve the original order of the dataframe
    df['original_order'] = range(len(df))

    num_duplicates = np.random.randint(1, 10001)
    num_duplicates = min(num_duplicates, len(df))  # Ensure the number of duplicates does not exceed the original size
    duplicates = df.sample(n=num_duplicates)  # Randomly select records to duplicate

    # Assign new 'original_order' positions for duplicates to insert them at random positions
    duplicates['original_order'] = np.random.choice(df['original_order'], size=num_duplicates, replace=False)

    df_modified = pd.concat([df, duplicates])

    # Sort the modified dataframe by 'original_order' to mix in the duplicates at their new positions
    df_modified = df_modified.sort_values(by='original_order').reset_index(drop=True)
    df_modified.drop(columns=['original_order'], inplace=True)  # Drop the 'original_order' column
    df.drop(columns=['original_order'], inplace=True)  # Drop the 'original_order' column from the original DataFrame

    return df_modified


# Randomly introduce missing values
def introduce_nan(df, numerical_columns, categorical_columns, datetime_columns, text_columns, missing_ratio=0.02):
    df_modified = df.copy()  # Create a copy of the DataFrame to modify

    # Randomly select one column from each type of columns
    selected_num_col = np.random.choice(numerical_columns) if numerical_columns else None
    selected_cat_col = np.random.choice(categorical_columns) if categorical_columns else None
    selected_dt_col = np.random.choice(datetime_columns) if datetime_columns else None
    selected_text_col = np.random.choice(text_columns) if text_columns else None

    for col in [selected_num_col, selected_cat_col, selected_dt_col, selected_text_col]:
        if col is not None:
            total_values = len(df_modified)  # Total number of entries in the column
            existing_missing = df_modified[col].isnull().sum()  # Count existing missing values
            # Calculate the number of new missing values to introduce based on the specified ratio
            new_missing_count = int(total_values * missing_ratio) - existing_missing
            new_missing_count = max(new_missing_count, 0)  # Ensure new_missing_count is non-negative

            if new_missing_count > 0:
                # Get the indices of non-missing values
                non_missing_indices = df_modified[col][df_modified[col].notnull()].index.tolist()
                # Randomly select indices to introduce missing values
                missing_indices = np.random.choice(non_missing_indices, size=new_missing_count, replace=False)
                # Set the selected indices to NaN
                df_modified.loc[missing_indices, col] = np.nan

    return df_modified


# Randomly introduce outliers
def introduce_outliers(df, numerical_columns):
    df_modified = df.copy()

    # Ensure the number of outliers to introduce does not exceed one-tenth of the original DataFrame
    num_outliers = np.random.randint(1, 10001)
    num_outliers = min(num_outliers, len(df_modified)//10)

    for col in numerical_columns:
        # Calculate the max and min values for the column
        col_max = df_modified[col].max()
        col_min = df_modified[col].min()

        # Check if the entire column can be considered of integer type before looping
        is_integer = (df_modified[col].dropna() % 1 == 0).all()

        # Select random indices to introduce outliers
        outlier_indices = np.random.choice(df_modified.index, size=num_outliers, replace=False)
        for idx in outlier_indices:
            # Generate a random factor between 2 and 10
            random_factor = np.random.uniform(2, 10)
            # Randomly decide to set a high or low outlier value
            if np.random.rand() > 0.5:
                df_modified.at[idx, col] = col_max * random_factor  # Multiply the max by a random factor
            else:
                df_modified.at[idx, col] = col_min / random_factor  # Divide the min by a random factor

        # If the column is of integer type, convert the entire column to int after introducing all outliers
        if is_integer:
            df_modified[col] = df_modified[col].apply(lambda x: int(x) if pd.notnull(x) else x).astype('Int64')

    return df_modified


# Data corruption
X_random_nans = introduce_nan(X, numerical_columns, categorical_columns, datetime_columns, text_columns)
X_random_nans = introduce_nan(X_random_nans, numerical_columns, categorical_columns, datetime_columns, text_columns)
X_random_outliers = introduce_outliers(X_random_nans, numerical_columns)
X_random_outliers = introduce_outliers(X_random_outliers, numerical_columns)
X_random_duplicates = insert_duplicates(X_random_outliers)
X_random_duplicates = insert_duplicates(X_random_duplicates)

X_corruption = X_random_duplicates
print("Corrupted data:\n", X_random_duplicates.tail(10), "\n")

Corrupted data:
                  user_id                                           user_url  \
63595           Ghoustik              http://steamcommunity.com/id/Ghoustik   
63596  76561198310819422  http://steamcommunity.com/profiles/76561198310...   
63597  76561198312638244  http://steamcommunity.com/profiles/76561198312...   
63598  76561198312638244  http://steamcommunity.com/profiles/76561198312...   
63599  76561198312638244  http://steamcommunity.com/profiles/76561198312...   
63600  76561198312638244  http://steamcommunity.com/profiles/76561198312...   
63601             ysarge                http://steamcommunity.com/id/ysarge   
63602        LydiaMorley           http://steamcommunity.com/id/LydiaMorley   
63603        LydiaMorley           http://steamcommunity.com/id/LydiaMorley   
63604        LydiaMorley           http://steamcommunity.com/id/LydiaMorley   

      item_id                                             review recommend  \
63595     730  Gra naprawdę fajna.A

In [None]:
user_reviews_corruption_stats = tfdv.generate_statistics_from_dataframe(X_corruption)
tfdv.visualize_statistics(user_reviews_corruption_stats)

In [None]:
schema = tfdv.infer_schema(statistics=user_reviews_corruption_stats)
tfdv.display_schema(schema=schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'user_id',BYTES,required,,-
'user_url',BYTES,required,,-
'item_id',BYTES,required,,-
'review',BYTES,optional,single,-
'recommend',INT,optional,single,-
'posted',BYTES,required,,-
'last_edited',BYTES,optional,single,-
'funny',BYTES,required,,-
'helpful',BYTES,required,,-


In [None]:
anomalies = tfdv.validate_statistics(statistics=user_reviews_corruption_stats, schema=schema)
tfdv.display_anomalies(anomalies)

## cleaning

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.pipeline import Pipeline
from timeit import default_timer as timer
import logging
logger = logging.getLogger('dataCleaning')


def handle_duplicates(df):
    logger.info('Started handling of duplicates...')
    start_time = timer()
    # Record the number of rows before handling duplicates
    rows_before = df.shape[0]
    try:
        # Remove duplicate rows and reset index
        df = df.drop_duplicates().reset_index(drop=True)
        rows_after = df.shape[0]
        num_duplicates = rows_before - rows_after
        # Log the outcome of duplicates handling
        if num_duplicates > 0:
            logger.debug(f'Deletion of {num_duplicates} duplicate(s) succeeded')
        else:
            logger.debug('No duplicates found')
        end_time = timer()
        logger.info(f'Handling of duplicates completed in {end_time - start_time:.5f} seconds')
    except:
        logger.warning('Handling of duplicates failed')

    return df


def handle_missing_values(df, numerical_columns, categorical_columns, datetime_columns, short_text_columns, long_text_columns):
    logger.info('Started handling of missing values...')
    start_time = timer()
    num_missing = df.isnull().sum().sum()  # Calculate the total number of missing values
    logger.info(f'Found a total of {num_missing} missing value(s)')

    # Fill missing values in text columns with an empty string immediately
    for col in (short_text_columns + long_text_columns):
        if df[col].isnull().any():  # Check if the column has any missing values
            logger.info(f'Processing missing values for column: {col}')
            df[col] = df[col].fillna('')
            logger.info(f'Imputed missing values for column: {col}')

    # Convert datetime columns to Unix timestamps
    for col in datetime_columns:
        if df[col].dtype in ['int64', 'float64']:
            df[col] = pd.to_datetime(df[col], unit='s', errors='coerce')
        else:
            df[col] = pd.to_datetime(df[col], infer_datetime_format=True, errors='coerce')

        df[col] = [int(dt.timestamp()) if pd.notnull(dt) else np.nan for dt in df[col]]

    # Loop through numerical, categorical and datetime columns to handle missing values
    for col in (numerical_columns + categorical_columns + datetime_columns):
        if df[col].isnull().any():
            logger.info(f'Processing missing values for column: {col}')

            X = df.copy()  # Create a copy of the DataFrame to avoid modifying the original
            y = X.pop(col)
            X_train = X[y.notnull()]
            y_train = y[y.notnull()]
            X_test = X[y.isnull()]

            transformers = []  # Initialize a list to store transformers

            # Define a transformer for categorical features
            categorical_transformer = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
            ])

            # Append transformers for different types of features, excluding the current target column
            transformers.append(('num', KNNImputer(n_neighbors=3), [c for c in (numerical_columns + datetime_columns) if c != col]))
            transformers.append(('categ', categorical_transformer, [c for c in categorical_columns if c != col]))
            # Add a separate transformer for each text column
            for c in short_text_columns:
                transformers.append((f'text_{c}', TfidfVectorizer(max_features=500), c))
            for c in long_text_columns:
                transformers.append((f'text_{c}', TfidfVectorizer(max_features=1000), c))

            # ColumnTransformer to apply the appropriate transformations to each column type
            feature_transformer = ColumnTransformer(transformers=transformers, remainder="drop")

            # Use predictive modeling for numerical, categorical and datetime columns
            if col in (numerical_columns + datetime_columns):
                pipeline = Pipeline(steps=[
                    ('features', feature_transformer),
                    ('learner', RandomForestRegressor())
                ])
                final_model = pipeline.fit(X_train, y_train)  # Fit the pipeline to the training data
                predicted_values = final_model.predict(X_test)  # Predict missing values for numerical columns
                df.loc[y.isnull(), col] = predicted_values  # Impute the predicted values into the original DataFrame

                if col in datetime_columns:
                    df[col] = pd.to_datetime(df[col], unit='s')

            else:
                le = LabelEncoder()  # Encode labels for categorical target
                encoded_y = le.fit_transform(y_train)

                pipeline = Pipeline(steps=[
                    ('features', feature_transformer),
                    ('learner', RandomForestClassifier())
                ])
                final_model = pipeline.fit(X_train, encoded_y)
                predicted_values = final_model.predict(X_test)
                predicted_values = le.inverse_transform(predicted_values)  # Decode the predictions
                df.loc[y.isnull(), col] = predicted_values

            logger.info(f'Imputed missing values for column: {col}')

    end_time = timer()
    logger.info(f'Handling of missing values completed in {end_time - start_time:.5f} seconds')

    return df


# Function for outlier winsorization
def handle_outliers(df, numerical_columns):
    logger.info('Started handling of outliers...')
    start_time = timer()

    for col in numerical_columns:
        # Calculate the bounds for identifying outliers
        outlier_param = 1.5
        q1, q3 = np.percentile(df[col].dropna(), [25, 75])
        iqr = q3 - q1
        lower_bound = q1 - (outlier_param * iqr)
        upper_bound = q3 + (outlier_param * iqr)

        # Check if the entire column can be considered of integer type before looping
        is_integer = (df[col].dropna() % 1 == 0).all()

        counter = 0  # Initialize a counter to track the number of outliers handled
        for row_index, row_val in enumerate(df[col]):
            # Skip NaN values
            if pd.isna(row_val):
                continue

            # Check if the value is an outlier (outside the bounds)
            if row_val < lower_bound or row_val > upper_bound:
                # Replace outliers with the corresponding bound
                df.at[row_index, col] = lower_bound if row_val < lower_bound else upper_bound
                counter += 1

        # If the column is of integer type, convert the entire column to int after handling all outliers
        if is_integer:
            df[col] = df[col].apply(lambda x: int(x) if pd.notnull(x) else x).astype('Int64')

        if counter > 0:
            logger.debug(f'Outlier imputation of {counter} value(s) succeeded for numerical column "{col}"')

    end_time = timer()
    logger.info(f'Handling of outliers completed in {end_time - start_time:.5f} seconds')

    return df

In [None]:
!pip install sentence_transformers

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
from timeit import default_timer as timer
import logging
logger = logging.getLogger('dataPreprocessing')


# Function for encoding of categorical features in the data
def encode_categorical(df, categorical_columns):
    logger.info('Started encoding of CATEGORICAL columns...')
    start_time = timer()

    features_df = pd.DataFrame(index=df.index)  # Initialize a new DataFrame for added features
    one_hot_encoded_frames = []  # Store DataFrames with one-hot encoded columns to be joined later

    for col in categorical_columns:
        try:
            # Convert column to 'category' data type
            temp_col = df[col].astype('category')

            # Perform OneHot Encoding if the column has 10 or fewer unique values
            if temp_col.nunique() <= 10:
                one_hot = pd.get_dummies(temp_col, prefix=col)
                one_hot_encoded_frames.append(one_hot)  # Append the one-hot encoded DataFrame to the list

                logger.debug(f'OneHot Encoding succeeded for column "{col}"')
            # Perform Label Encoding if there are more than 10 unique values in the column
            else:
                features_df[col + '_label'] = temp_col.cat.codes

                logger.debug(f'Label Encoding succeeded for column "{col}"')
        except:
            logger.warning(f'Encoding failed for column "{col}"')

    # Join all one-hot encoded frames to the new features DataFrame
    if one_hot_encoded_frames:
        features_df = features_df.join(one_hot_encoded_frames)

    end_time = timer()
    logger.info(f'Completed encoding of CATEGORICAL columns in {end_time - start_time:.5f} seconds')

    return features_df


# Function for extracting of datetime values in the data
def convert_datetime(df, datetime_columns):
    logger.info('Started conversion of DATETIME columns...')
    start_time = timer()

    features_df = pd.DataFrame(index=df.index)  # Initialize a new DataFrame for added features
    for col in datetime_columns:
        try:
            # Convert columns with Unix timestamps to datetime format
            if df[col].dtype in ['int64', 'float64']:
                temp_col = pd.to_datetime(df[col], unit='s')
            # For other columns, try to infer the datetime format automatically
            else:
                temp_col = pd.to_datetime(df[col], infer_datetime_format=True)

            # Extract and add new columns for year, month, day, and weekday from the datetime column to features_df
            features_df[f'{col}_year'] = temp_col.dt.year
            features_df[f'{col}_month'] = temp_col.dt.month
            features_df[f'{col}_day'] = temp_col.dt.day
            features_df[f'{col}_weekday'] = temp_col.dt.weekday

            # Perform sinusoidal encoding for the weekday to capture its cyclical nature
            features_df[f'{col}_weekday_sin'] = np.sin(2 * np.pi * features_df[f'{col}_weekday'] / 7)
            features_df[f'{col}_weekday_cos'] = np.cos(2 * np.pi * features_df[f'{col}_weekday'] / 7)

            logger.debug(f'Conversion to DATETIME succeeded for column "{col}"')

            try:
                # Check if extracted dates are non-NULL; if all are 0, plan to drop the added columns
                check_features = [f'{col}_{feature}' for feature in ['year', 'month', 'day']]
                if all((features_df[feature] == 0).all() for feature in check_features):
                    drop_features = [f'{col}_{feature}' for feature in
                                     ['year', 'month', 'day', 'weekday', 'weekday_sin', 'weekday_cos']]
                    features_df.drop(columns=drop_features, inplace=True)
            except:
                pass
        except:
            logger.warning(f'Conversion to DATETIME failed for column "{col}"')

    end_time = timer()
    logger.info(f'Completed conversion of DATETIME columns in {end_time - start_time:.5f} seconds')

    return features_df


# Function for extracting features from text columns in the data
def extract_text_features(df, text_columns):
    logger.info('Started extraction of features from TEXT columns...')
    start_time = timer()

    model = SentenceTransformer("all-MiniLM-L6-v2")  # Load the pre-trained model for text embeddings
    pca = PCA(n_components=100)  # Initialize PCA for dimensionality reduction

    features_df = pd.DataFrame(index=df.index)  # Initialize a DataFrame to hold all new features
    for col in text_columns:
        try:
            # Encode the text data to get the embeddings, and directly convert to float32 to save memory
            embeddings = model.encode(df[col]).astype('float32')

            # Apply PCA on embeddings and reduce memory usage by converting to float32
            reduced_embeddings = pca.fit_transform(embeddings).astype('float32')

            # Create new columns for each principal component in the separate features DataFrame
            for i in range(100):
                features_df[f'{col}_PC{i+1}'] = reduced_embeddings[:, i]

            # Aggregate features based on embeddings and add them to the features DataFrame
            features_df[f'{col}_embedding_mean'] = np.mean(embeddings, axis=1)  # Mean across each dimension
            features_df[f'{col}_embedding_max'] = np.max(embeddings, axis=1)  # Max across each dimension
            features_df[f'{col}_embedding_min'] = np.min(embeddings, axis=1)  # Min across each dimension
            features_df[f'{col}_embedding_std'] = np.std(embeddings, axis=1)  # Standard deviation across each dimension
            features_df[f'{col}_embedding_median'] = np.median(embeddings, axis=1)  # Median across each dimension
            features_df[f'{col}_embedding_l2'] = np.linalg.norm(embeddings, axis=1)  # L2 norm (Euclidean) of each embedding

            logger.debug(f'Feature extraction succeeded for column "{col}"')
        except:
            logger.warning(f'Feature extraction failed for column "{col}"')

    end_time = timer()
    logger.info(f'Completed extraction of features from TEXT columns in {end_time - start_time:.5f} seconds')

    return features_df

In [None]:
# Data cleaning
X_random_duplicates = X_random_duplicates.dropna(how='all').reset_index(drop=True)

X_distinct = handle_duplicates(X_random_duplicates)
print("Data after handling duplicates:\n", X_distinct.tail(10), "\n")
X_without_outliers = handle_outliers(X_distinct, numerical_columns)
print("Data after handling outliers:\n", X_without_outliers.tail(10), "\n")
X_cleaned = handle_missing_values(X_without_outliers, numerical_columns, categorical_columns, datetime_columns,
                                  short_text_columns, long_text_columns)
print("Data after cleaning:\n", X_cleaned.tail(10), "\n")

Data after handling duplicates:
                  user_id  \
58510  76561198306599751   
58511           Ghoustik   
58512  76561198310819422   
58513  76561198312638244   
58514  76561198312638244   
58515  76561198312638244   
58516  76561198312638244   
58517        LydiaMorley   
58518        LydiaMorley   
58519        LydiaMorley   

                                                   user_url item_id  \
58510  http://steamcommunity.com/profiles/76561198306599751  261030   
58511                 http://steamcommunity.com/id/Ghoustik     730   
58512  http://steamcommunity.com/profiles/76561198310819422     570   
58513  http://steamcommunity.com/profiles/76561198312638244  233270   
58514  http://steamcommunity.com/profiles/76561198312638244     130   
58515  http://steamcommunity.com/profiles/76561198312638244      70   
58516  http://steamcommunity.com/profiles/76561198312638244  362890   
58517              http://steamcommunity.com/id/LydiaMorley  273110   
58518              

In [None]:
user_reviews_cleaned_stats = tfdv.generate_statistics_from_dataframe(X_cleaned)
tfdv.visualize_statistics(user_reviews_cleaned_stats)

In [None]:
anomalies = tfdv.validate_statistics(statistics=user_reviews_cleaned_stats, schema=schema)
tfdv.display_anomalies(anomalies)

In [None]:
tfdv.visualize_statistics(lhs_statistics=user_reviews_corruption_stats, rhs_statistics=user_reviews_cleaned_stats,
                          lhs_name='corruption_dataset', rhs_name='cleaned_dataset')

## model

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

### use origin dataset to train baseline model

In [None]:
# Data preprocessing
categorical_features_df = encode_categorical(X, categorical_columns)
print("Categorical features encoded:\n", categorical_features_df.tail(10), "\n")
datetime_features_df = convert_datetime(X, datetime_columns)
print("DateTime features extracted:\n", datetime_features_df.tail(10), "\n")
text_features_df = extract_text_features(X, text_columns)
print("Text features extracted:\n", text_features_df.tail(10), "\n")

# Concatenate all DataFrames horizontally (axis=1) to form a complete feature set
modeling_df = pd.concat([X[numerical_columns], categorical_features_df,
                         datetime_features_df, text_features_df], axis=1)
print("Data used for modeling:\n", modeling_df.tail(10), "\n")


# Model training and evaluation
logger.info('Started model training and evaluation...')
start_time = timer()
selected_col = np.random.choice(categorical_columns) if categorical_columns else None
if selected_col is not None:
    print("Selected column for classification: ", selected_col, "\n")

    # Identify columns in modeling_df that start with "selected_col_"
    cols_to_drop = [col for col in modeling_df.columns if col.startswith(f'{selected_col}_')]

    X_train = modeling_df.drop(columns=cols_to_drop)  # Drop these columns from modeling_df
    y_train = X[selected_col]  # Set y_train to the column from X_cleaned corresponding to selected_col

    # Fit and transform y to have consecutive class labels
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)

    clf = RandomForestClassifier()

    # Define a grid of hyperparameter values for tuning the classifier
    param_grid = {
        'n_estimators': [200],  # [100, 200, 300, 400, 500]
        'max_depth': [None],  # [None, 10, 20, 30, 40, 50]
        'min_samples_split': [2],  # [2, 5, 10]
        'min_samples_leaf': [1],  # [1, 2, 4]
        'max_features': ['sqrt'],  # ['auto', 'sqrt', 'log2']
        'bootstrap': [True],  # [True, False]
        'criterion': ['gini'],  # ['gini', 'entropy']
    }

    # Set up GridSearchCV to find the best model parameters using 5-fold cross-validation
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')

    grid_search.fit(X_train, y_train)

    # Extract the best model
    best_model = grid_search.best_estimator_  # the best estimator (the trained model with the best parameters)
    logger.info(f'Final model:\n {best_model}')

    # Access the best parameters and the best score after fitting
    print("Best parameters found: ", grid_search.best_params_)
    print("Best accuracy found: ", grid_search.best_score_)
    print("Average accuracy: ", grid_search.cv_results_['mean_test_score'].mean())

Categorical features encoded:
        recommend_False  recommend_True
59295                0               1
59296                0               1
59297                0               1
59298                0               1
59299                0               1
59300                0               1
59301                0               1
59302                0               1
59303                0               1
59304                0               1 

DateTime features extracted:
 Empty DataFrame
Columns: []
Index: [59295, 59296, 59297, 59298, 59299, 59300, 59301, 59302, 59303, 59304] 



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  features_df[f'{col}_embedding_mean'] = np.mean(embeddings, axis=1)  # Mean across each dimension
  features_df[f'{col}_embedding_max'] = np.max(embeddings, axis=1)  # Max across each dimension
  features_df[f'{col}_embedding_min'] = np.min(embeddings, axis=1)  # Min across each dimension
  features_df[f'{col}_embedding_std'] = np.std(embeddings, axis=1)  # Standard deviation across each dimension
  features_df[f'{col}_embedding_median'] = np.median(embeddings, axis=1)  # Median across each dimension
  features_df[f'{col}_embedding_l2'] = np.linalg.norm(embeddings, axis=1)  # L2 n

Text features extracted:
        user_id_PC1  user_id_PC2  user_id_PC3  user_id_PC4  user_id_PC5  \
59295    -0.530149    -0.016263     0.005980     0.034567    -0.002278   
59296     0.379025    -0.100167     0.138776     0.168733     0.022356   
59297    -0.499301     0.011519     0.015179     0.037321    -0.056216   
59298    -0.501930    -0.039187    -0.013999     0.008977    -0.017334   
59299    -0.501929    -0.039188    -0.013999     0.008977    -0.017334   
59300    -0.501929    -0.039188    -0.013999     0.008977    -0.017334   
59301    -0.501929    -0.039188    -0.013999     0.008977    -0.017334   
59302     0.330307    -0.008449    -0.000802     0.017280    -0.039840   
59303     0.330307    -0.008449    -0.000802     0.017280    -0.039840   
59304     0.330307    -0.008449    -0.000802     0.017280    -0.039840   

       user_id_PC6  user_id_PC7  user_id_PC8  user_id_PC9  user_id_PC10  ...  \
59295     0.011823     0.025235    -0.050870    -0.033611     -0.000167  ...   