In [1]:
pip install tensorflow_data_validation



In [2]:
import tensorflow as tf
import tensorflow_data_validation as tfdv
import pandas as pd
import gdown
# Record the library versions used for this run alongside the outputs.
print(f'TFDV Version: {tfdv.__version__}')
print(f'Tensorflow Version: {tf.__version__}')

TFDV Version: 1.16.1
Tensorflow Version: 2.16.2


In [3]:
def load_data(file_id: str, file_path: str = "dataset.csv") -> "pd.DataFrame | None":
    """Download a CSV from Google Drive and load it into a DataFrame.

    Args:
        file_id: Google Drive file ID, inserted into the
            ``https://drive.google.com/uc?id=...`` download URL.
        file_path: Local path the download is written to. Defaults to
            ``"dataset.csv"``; pass a distinct path per dataset to avoid
            overwriting a previously downloaded file.

    Returns:
        The parsed DataFrame, or ``None`` when the download produced no file
        or the file could not be found.
    """
    url = f"https://drive.google.com/uc?id={file_id}"
    try:
        downloaded_path = gdown.download(url, file_path, quiet=False)
        if downloaded_path is None:
            # Some gdown versions report a failed download by returning None
            # instead of raising. Without this check, a stale file left at
            # `file_path` by an earlier call would be read silently.
            print("File not found.")
            return None
        # low_memory=False makes pandas infer each column's dtype from the
        # whole file instead of per chunk, avoiding mixed-type columns (and the
        # DtypeWarning they trigger).
        df = pd.read_csv(downloaded_path, low_memory=False)
    except FileNotFoundError:
        print("File not found.")
        return None

    print("Data loaded successfully.")
    return df

In [4]:
# Google Drive file ID of the dataset to analyse.
file_id = "1zckGHmd_tJfyMqePfol0L-lIScstOCh9"
# Download the CSV and load it; load_data returns None if the file is not found.
final_df = load_data(file_id)

Downloading...
From: https://drive.google.com/uc?id=1zckGHmd_tJfyMqePfol0L-lIScstOCh9
To: /content/dataset.csv
100%|██████████| 13.6M/13.6M [00:00<00:00, 54.0MB/s]


Data loaded successfully.


  df = pd.read_csv(file_path)


In [5]:
# Preview the first rows to sanity-check that the CSV parsed as expected.
final_df.head()

Unnamed: 0.1,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,id,uri,track_href,analysis_url,duration_ms,time_signature,genre,song_name,Unnamed: 0,title
0,0.831,0.814,2,-7.364,1,0.42,0.0598,0.0134,0.0556,0.389,...,2Vc6NJ9PW9gD9q343XFRKx,spotify:track:2Vc6NJ9PW9gD9q343XFRKx,https://api.spotify.com/v1/tracks/2Vc6NJ9PW9gD...,https://api.spotify.com/v1/audio-analysis/2Vc6...,124539,4,Dark Trap,Mercury: Retrograde,,
1,0.719,0.493,8,-7.23,1,0.0794,0.401,0.0,0.118,0.124,...,7pgJBLVz5VmnL7uGHmRj6p,spotify:track:7pgJBLVz5VmnL7uGHmRj6p,https://api.spotify.com/v1/tracks/7pgJBLVz5Vmn...,https://api.spotify.com/v1/audio-analysis/7pgJ...,224427,4,Dark Trap,Pathology,,
2,0.85,0.893,5,-4.783,1,0.0623,0.0138,4e-06,0.372,0.0391,...,0vSWgAlfpye0WCGeNmuNhy,spotify:track:0vSWgAlfpye0WCGeNmuNhy,https://api.spotify.com/v1/tracks/0vSWgAlfpye0...,https://api.spotify.com/v1/audio-analysis/0vSW...,98821,4,Dark Trap,Symbiote,,
3,0.476,0.781,0,-4.71,1,0.103,0.0237,0.0,0.114,0.175,...,0VSXnJqQkwuH2ei1nOQ1nu,spotify:track:0VSXnJqQkwuH2ei1nOQ1nu,https://api.spotify.com/v1/tracks/0VSXnJqQkwuH...,https://api.spotify.com/v1/audio-analysis/0VSX...,123661,3,Dark Trap,ProductOfDrugs (Prod. The Virus and Antidote),,
4,0.798,0.624,2,-7.668,1,0.293,0.217,0.0,0.166,0.591,...,4jCeguq9rMTlbMmPHuO7S3,spotify:track:4jCeguq9rMTlbMmPHuO7S3,https://api.spotify.com/v1/tracks/4jCeguq9rMTl...,https://api.spotify.com/v1/audio-analysis/4jCe...,123298,4,Dark Trap,Venom,,


In [6]:
# Compute TFDV statistics from the loaded DataFrame.
data_stats = tfdv.generate_statistics_from_dataframe(final_df)
# Echo the statistics object as the cell output (the raw proto text is long;
# the visualisation in the next cell is easier to read).
data_stats

datasets {
  num_examples: 42305
  features {
    type: FLOAT
    num_stats {
      common_stats {
        num_non_missing: 42305
        min_num_values: 1
        max_num_values: 1
        avg_num_values: 1.0
        num_values_histogram {
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 4230.5
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 4230.5
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 4230.5
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 4230.5
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 4230.5
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 4230.5
          }
          buckets {
            low_val

In [7]:
# Render the computed statistics with TFDV's built-in visualisation.
tfdv.visualize_statistics(data_stats)


In [8]:
# Infer a schema from the statistics computed above.
schema = tfdv.infer_schema(statistics=data_stats)

# Display the inferred schema (feature table followed by the value domains).
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'danceability',FLOAT,required,,-
'energy',FLOAT,required,,-
'key',INT,required,,-
'loudness',FLOAT,required,,-
'mode',INT,required,,-
'speechiness',FLOAT,required,,-
'acousticness',FLOAT,required,,-
'instrumentalness',FLOAT,required,,-
'liveness',FLOAT,required,,-
'valence',FLOAT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'type','audio_features'
'genre',"'Dark Trap', 'Emo', 'Hiphop', 'Pop', 'Rap', 'RnB', 'Trap Metal', 'Underground Rap', 'dnb', 'hardstyle', 'psytrance', 'techhouse', 'techno', 'trance', 'trap'"


In [9]:
# Validate the statistics against the schema.
# NOTE(review): `schema` was inferred from these same statistics, so this
# checks the data against itself — presumably anomalies only become
# meaningful when a different dataset is validated against this schema.
anomalies =  tfdv.validate_statistics(statistics=data_stats, schema=schema)

# Visualize anomalies
tfdv.display_anomalies(anomalies)

In [10]:
# Inspect the Python type of the validation result.
type(anomalies)

tensorflow_metadata.proto.v0.anomalies_pb2.Anomalies

In [None]:
import tensorflow as tf
import tensorflow_data_validation as tfdv
import pandas as pd
import gdown

In [None]:
def getSchema(df):
    """Infer, display and return a TFDV schema for a DataFrame.

    Args:
        df: DataFrame to compute statistics from.

    Returns:
        The schema object produced by ``tfdv.infer_schema``.
    """
    inferred_schema = tfdv.infer_schema(
        statistics=tfdv.generate_statistics_from_dataframe(df)
    )
    tfdv.display_schema(inferred_schema)
    return inferred_schema

In [11]:
def getAnomalies(df, schema=None):
    """Validate a DataFrame's statistics against a schema and report anomalies.

    Args:
        df: DataFrame to compute TFDV statistics from.
        schema: Optional reference schema to validate against (for example one
            inferred from a trusted dataset). When omitted, a schema is
            inferred from ``df`` itself, which matches the original behaviour
            but means the data is only checked against its own statistics.

    Returns:
        None. Anomalies, if any, are displayed and a status line is printed.
    """
    data_stats = tfdv.generate_statistics_from_dataframe(df)
    if schema is None:
        schema = tfdv.infer_schema(statistics=data_stats)
    anomalies = tfdv.validate_statistics(statistics=data_stats, schema=schema)
    if anomalies.anomaly_info:
        # Display anomalies if present
        tfdv.display_anomalies(anomalies)
        print("Anomalies detected in the data!")
    else:
        print("No anomalies detected.")

In [None]:
# NOTE(review): this cell redefines getAnomalies, which is already defined in
# an earlier cell; on a full run this later definition is the one in effect.
# Keep only one of the two.
def getAnomalies(df, schema=None):
    """Report TFDV anomalies for a DataFrame.

    Args:
        df: DataFrame whose statistics are generated and validated.
        schema: Optional schema to validate against, e.g. one inferred from a
            reference dataset. If not given, the schema is inferred from
            ``df``'s own statistics (the original behaviour), so the data is
            only compared with itself.

    Returns:
        None. Detected anomalies are displayed and a status message is printed.
    """
    # Generate data statistics from the input DataFrame
    data_stats = tfdv.generate_statistics_from_dataframe(df)

    # Fall back to a schema inferred from these statistics when none is supplied
    if schema is None:
        schema = tfdv.infer_schema(statistics=data_stats)

    anomalies = tfdv.validate_statistics(statistics=data_stats, schema=schema)

    # Check if any anomalies are detected
    if anomalies.anomaly_info:
        tfdv.display_anomalies(anomalies)
        print("Anomalies detected in the data!")
    else:
        print("No anomalies detected.")


In [12]:
# Run the anomaly check on the first dataset.
getAnomalies(final_df)

No anomalies detected.


In [13]:
# Download a second dataset and run the same anomaly check on it.
# NOTE(review): load_data returns None when the file is not found, which would
# be passed straight into getAnomalies here — confirm that is acceptable.
# The download also overwrites the dataset.csv written for the first dataset.
getAnomalies(load_data("1QywNP548AN2OIQo_o-goK3EBqQH3desz"))

Downloading...
From (original): https://drive.google.com/uc?id=1QywNP548AN2OIQo_o-goK3EBqQH3desz
From (redirected): https://drive.google.com/uc?id=1QywNP548AN2OIQo_o-goK3EBqQH3desz&confirm=t&uuid=342289d4-9a83-4593-9420-930dba6fa71f
To: /content/dataset.csv
100%|██████████| 301M/301M [00:08<00:00, 34.9MB/s]


Data loaded successfully.
No anomalies detected.
