In [1]:
import pandas as pd

# Load the CSV file
csv_file = "animals.csv"
df = pd.read_csv(csv_file)

# Display the header of the CSV file
print("Header:", df.columns.tolist())

# Display the first 5 rows of the CSV file
print("First 5 rows:\n", df.head())


Header: ['animal_type', 'name', 'age', 'color', 'months_in_shelter', 'behavior', 'health', 'vaccinated', 'target_audience']
First 5 rows:
   animal_type   name  age  color  months_in_shelter             behavior  \
0         dog  Bella    3  black                 12  friendly, energetic   
1         cat   Milo    2  white                  6    shy, affectionate   
2         dog    Max    5  brown                 24    protective, loyal   
3         cat   Luna    1   gray                  3     playful, curious   
4         dog  Rocky    4  black                  8  energetic, obedient   

      health  vaccinated     target_audience  
0  excellent        True            families  
1       good        True             singles  
2  excellent       False             couples  
3       good        True            families  
4        bad       False  active individuals  


In [2]:
# Check volume by displaying the number of rows and columns
num_rows, num_columns = df.shape
print(f"Volume of Data - Number of Records: {num_rows}, Number of Features: {num_columns}")


Volume of Data - Number of Records: 200, Number of Features: 9


In [3]:
# If the dataset has a timestamp column, e.g., 'timestamp'
if 'timestamp' in df.columns:
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values('timestamp')
    
    # Calculate velocity by finding the difference between timestamps
    df['time_diff'] = df['timestamp'].diff()
    avg_velocity = df['time_diff'].mean()
    print(f"Average time interval between records: {avg_velocity}")
else:
    print("No timestamp column found in the dataset for velocity analysis.")


No timestamp column found in the dataset for velocity analysis.


In [5]:
# Check the data types of each column to analyze variety
print("Variety of Data - Data Types of Features:")
print(df.dtypes)

# Optionally, you can also count categorical and numerical columns
num_categorical = df.select_dtypes(include=['object']).shape[1]
num_numerical = df.select_dtypes(include=['int64', 'float64']).shape[1]
print(f"Number of Categorical Features: {num_categorical}, Number of Numerical Features: {num_numerical}")


Variety of Data - Data Types of Features:
animal_type          object
name                 object
age                   int64
color                object
months_in_shelter     int64
behavior             object
health               object
vaccinated             bool
target_audience      object
dtype: object
Number of Categorical Features: 6, Number of Numerical Features: 2


In [7]:
# Check the summary statistics of numerical columns
summary_statistics = df.describe()
print("Velocity Analysis - Summary Statistics of Numerical Features:")
print(summary_statistics)

# Optionally, you can also check the unique values in categorical columns
unique_values = df.select_dtypes(include=['object']).nunique()
print("Variety Analysis - Unique Values in Categorical Features:")
print(unique_values)

Velocity Analysis - Summary Statistics of Numerical Features:
              age  months_in_shelter
count  200.000000         200.000000
mean     6.610000          25.975000
std      4.304713          17.596379
min      1.000000           1.000000
25%      3.000000           9.000000
50%      6.000000          24.000000
75%     10.000000          41.000000
max     15.000000          60.000000
Variety Analysis - Unique Values in Categorical Features:
animal_type          2
name                46
color                4
behavior           122
health               3
target_audience      5
dtype: int64


In [8]:
# Check for missing values
missing_values = df.isnull().sum()
print("Veracity Analysis - Missing Values in Each Feature:")
print(missing_values)

# Optionally, you can check for duplicated rows
num_duplicates = df.duplicated().sum()
print(f"Number of Duplicated Rows: {num_duplicates}")


Veracity Analysis - Missing Values in Each Feature:
animal_type          0
name                 0
age                  0
color                0
months_in_shelter    0
behavior             0
health               0
vaccinated           0
target_audience      0
dtype: int64
Number of Duplicated Rows: 0
