In [1]:
import pandas as pd
import numpy as np

In [2]:
dog_df = pd.read_csv('allDogDescriptions.csv')
breed_info = pd.read_csv('dog_breeds.csv')
supplies_df = pd.read_csv('aliexpress_pet_supplies.csv')

In [3]:
print("Dog Descriptions columns:", list(dog_df.columns))
print(dog_df.head(1).T)
print("\nBreed Info columns:", list(breed_info.columns))
print(breed_info.head(1).T)
print("\nPet Supplies columns:", list(supplies_df.columns))
print(supplies_df.head(1).T)

Dog Descriptions columns: ['index', 'id', 'org_id', 'url', 'type.x', 'species', 'breed_primary', 'breed_secondary', 'breed_mixed', 'breed_unknown', 'color_primary', 'color_secondary', 'color_tertiary', 'age', 'sex', 'size', 'coat', 'fixed', 'house_trained', 'declawed', 'special_needs', 'shots_current', 'env_children', 'env_dogs', 'env_cats', 'name', 'status', 'posted', 'contact_city', 'contact_state', 'contact_zip', 'contact_country', 'stateQ', 'accessed', 'type.y', 'description']
                                                                 0
index                                                            0
id                                                        46042150
org_id                                                       NV163
url              https://www.petfinder.com/dog/harley-46042150/...
type.x                                                         Dog
species                                                        Dog
breed_primary                       American 

In [4]:
# Remove exact duplicate rows
dog_df = dog_df.drop_duplicates()
print("Dog df size after drop_duplicates:", dog_df.shape)

# Drop rows missing primary breed or essential fields (if any)
dog_df = dog_df.dropna(subset=['breed_primary'])
print("Dog df size after dropping missing primary breed:", dog_df.shape)

# Standardize column names: lowercase, replace spaces/dots
dog_df.columns = (dog_df.columns.str.lower()
                               .str.replace(' ', '_')
                               .str.replace('.', '_')
                               .str.replace('x$', '', regex=True))

Dog df size after drop_duplicates: (58180, 36)
Dog df size after dropping missing primary breed: (58180, 36)


In [5]:
if 'type_y' in dog_df.columns:
    dog_df.drop(columns=['type_y'], inplace=True)
dog_df.rename(columns={'type_x':'type'}, inplace=True)

In [6]:
dog_df['posted'] = pd.to_datetime(dog_df['posted'], errors='coerce')
dog_df['accessed'] = pd.to_datetime(dog_df['accessed'], errors='coerce')

In [7]:
breed_info = breed_info.drop_duplicates(subset=['Breed'])
breed_info.columns = (breed_info.columns.str.strip()
                                  .str.lower()
                                  .str.replace(' ', '_')
                                  .str.replace(r'\(.*\)', '', regex=True))

In [10]:
# Parse height to numeric (take first number)
breed_info['height_in'] = breed_info['height__in_'].str.split('-').str[0].astype(float, errors='ignore')


KeyError: 'height__in_'

In [11]:
print(breed_info.columns)

Index(['breed', 'country_of_origin', 'fur_color', 'height_', 'color_of_eyes',
       'longevity_', 'character_traits', 'common_health_problems'],
      dtype='object')


In [12]:
breed_info.columns = (breed_info.columns
                      .str.strip()
                      .str.lower()
                      .str.replace(' ', '_'))

print(breed_info.columns)  # confirm cleaned names

Index(['breed', 'country_of_origin', 'fur_color', 'height_', 'color_of_eyes',
       'longevity_', 'character_traits', 'common_health_problems'],
      dtype='object')


In [13]:
# Parse the height values
breed_info['height_in'] = (breed_info['height_']
                           .astype(str)  # make sure it's string
                           .str.split('-')  # split on dash if range
                           .str[0]          # take the first number
                           .str.extract('(\d+)')  # extract digits only
                           .astype(float))  # convert to float

  .str.extract('(\d+)')  # extract digits only


In [14]:
breed_info['height_in'] = (breed_info['height_']
                           .astype(str)                  # make sure it's string
                           .str.split('-')               # split on dash if range
                           .str[0]                       # take the first number
                           .str.extract(r'(\d+)')        # ✅ raw string for regex
                           .astype(float))               # convert to float

In [15]:
supplies_df = supplies_df.drop_duplicates()
supplies_df.columns = (supplies_df.columns.str.lower()
                                     .str.replace(' ', '_'))

In [16]:
# Clean 'wishedcount' and 'quantity'
supplies_df['quantity'] = pd.to_numeric(supplies_df['quantity'], errors='coerce')
supplies_df['wishedcount'] = pd.to_numeric(supplies_df['wishedcount'], errors='coerce')

# Extract number of items sold from 'tradeamount'
supplies_df['sold_count'] = supplies_df['tradeamount'].str.replace(r'\D', '', regex=True)
supplies_df['sold_count'] = pd.to_numeric(supplies_df['sold_count'], errors='coerce')


In [17]:
dog_df['breed_primary_clean'] = dog_df['breed_primary'].str.lower()
breed_info['breed_clean'] = breed_info['breed'].str.lower()


In [18]:
dog_merged = pd.merge(dog_df, breed_info, left_on='breed_primary_clean', right_on='breed_clean', how='left')
print("Merged dog DataFrame columns:", dog_merged.columns)


Merged dog DataFrame columns: Index(['inde', 'id', 'org_id', 'url', 'type_', 'species', 'breed_primary',
       'breed_secondary', 'breed_mixed', 'breed_unknown', 'color_primary',
       'color_secondary', 'color_tertiary', 'age', 'se', 'size', 'coat',
       'fixed', 'house_trained', 'declawed', 'special_needs', 'shots_current',
       'env_children', 'env_dogs', 'env_cats', 'name', 'status', 'posted',
       'contact_city', 'contact_state', 'contact_zip', 'contact_country',
       'stateq', 'accessed', 'description', 'breed_primary_clean', 'breed',
       'country_of_origin', 'fur_color', 'height_', 'color_of_eyes',
       'longevity_', 'character_traits', 'common_health_problems', 'height_in',
       'breed_clean'],
      dtype='object')


In [19]:
def size_category(h):
    if pd.isna(h): return np.nan
    if h <= 15: return 'Small'
    elif h <= 22: return 'Medium'
    else: return 'Large'

breed_info['size_category'] = breed_info['height_in'].apply(size_category)
dog_merged['size_category'] = pd.merge(dog_merged, breed_info[['breed_clean','size_category']],
                                      left_on='breed_primary_clean', right_on='breed_clean', how='left')['size_category']


In [20]:
supplies_df['pet_type'] = 'Other'
supplies_df.loc[supplies_df['title'].str.contains('dog', case=False, na=False), 'pet_type'] = 'Dog'
supplies_df.loc[supplies_df['title'].str.contains('cat', case=False, na=False), 'pet_type'] = 'Cat'
supplies_df['pet_type'].value_counts()


pet_type
Cat      1279
Dog       658
Other      60
Name: count, dtype: int64

In [22]:
# Example: count of pet types
print(supplies_df['pet_type'].value_counts())


pet_type
Cat      1279
Dog       658
Other      60
Name: count, dtype: int64


In [23]:
breed_info['height_in'] = (breed_info['height_']
                           .astype(str)                  # make sure it's string
                           .str.split('-')               # split on dash if range
                           .str[0]                       # take the first number
                           .str.extract(r'(\d+)')        # ✅ raw string for regex
                           .astype(float))               # convert to float

In [24]:
# Example: count of pet types
print(supplies_df['pet_type'].value_counts())

# Example: histogram of sold_count
```python
# import matplotlib.pyplot as plt
# supplies_df['sold_count'].hist(bins=20)
# plt.title("Distribution of Units Sold")
# plt.xlabel("Sold Count")
# plt.ylabel("Frequency")


SyntaxError: invalid syntax (4125285532.py, line 5)

In [25]:
breed_info['height_in'] = (breed_info['height_']
                           .astype(str)                  
                           .str.split('-')               
                           .str[0]                       
                           .str.extract(r'(\d+)')        
                           .astype(float)) 

In [26]:
supplies_df.sort_values('sold_count', ascending=False).head(5)[['title','sold_count']]


Unnamed: 0,title,sold_count
850,Pet Placemat Dog Food bowl Mat Cat Feed Mat Ca...,10000
1053,Soft Plush Pet Bed with Cover Round Cat Bed Pe...,10000
113,Dogs Interactive Toys Soft TPR Toys for Dog P...,10000
297,Ultra-Quiet Cat Water Fountain Filter Smart Au...,10000
706,Smart Dog Toy Ball Electronic Interactive Pet ...,10000


In [27]:
top_breeds = dog_merged['breed_primary'].value_counts().head(10)
print(top_breeds)


breed_primary
Pit Bull Terrier                  7890
Labrador Retriever                7198
Chihuahua                         3766
Mixed Breed                       3242
Terrier                           2641
Hound                             2282
German Shepherd Dog               2122
Boxer                             2050
Shepherd                          1972
American Staffordshire Terrier    1862
Name: count, dtype: int64


In [28]:
from collections import Counter
text = " ".join(dog_merged['description'].dropna().str.lower().values)
words = [w for w in text.split() if len(w)>3]
common = Counter(words).most_common(10)
print("Top words in descriptions:", common)


Top words in descriptions: [('with', 106600), ('will', 54966), ('adoption', 51885), ('that', 46774), ('please', 43927), ('dogs', 41185), ('have', 39385), ('this', 38948), ('home', 34000), ('your', 33712)]


In [29]:
breed_mention_counts = {}
for breed in top_breeds.index:
    cnt = supplies_df['title'].str.contains(breed, case=False, na=False).sum()
    breed_mention_counts[breed] = cnt
print(breed_mention_counts)


{'Pit Bull Terrier': 0, 'Labrador Retriever': 0, 'Chihuahua': 45, 'Mixed Breed': 0, 'Terrier': 1, 'Hound': 1, 'German Shepherd Dog': 0, 'Boxer': 0, 'Shepherd': 5, 'American Staffordshire Terrier': 0}


In [30]:
dog_merged.to_csv('cleaned_dog_descriptions.csv', index=False)
breed_info.to_csv('cleaned_dog_breeds.csv', index=False)
supplies_df.to_csv('cleaned_pet_supplies.csv', index=False)


In [31]:
# Quick stats
dogs['breed'].value_counts().head(10)   # top 10 breeds
dogs['age'].value_counts()
dogs['location'].value_counts().head(10)


NameError: name 'dogs' is not defined

In [32]:
# Load your adoption dataset
dogs = pd.read_csv("allDogDescriptions.csv")

# Peek at the data
print(dogs.head())
print(dogs.columns)

   index        id org_id                                                url  \
0      0  46042150  NV163  https://www.petfinder.com/dog/harley-46042150/...   
1      1  46042002  NV163  https://www.petfinder.com/dog/biggie-46042002/...   
2      2  46040898   NV99  https://www.petfinder.com/dog/ziggy-46040898/n...   
3      3  46039877  NV202  https://www.petfinder.com/dog/gypsy-46039877/n...   
4      4  46039306  NV184  https://www.petfinder.com/dog/theo-46039306/nv...   

  type.x species                   breed_primary breed_secondary  breed_mixed  \
0    Dog     Dog  American Staffordshire Terrier     Mixed Breed         True   
1    Dog     Dog                Pit Bull Terrier     Mixed Breed         True   
2    Dog     Dog                        Shepherd             NaN        False   
3    Dog     Dog             German Shepherd Dog             NaN        False   
4    Dog     Dog                       Dachshund             NaN        False   

   breed_unknown  ...     status

In [33]:
# Top 10 breeds
dogs['breed'].value_counts().head(10)

# Age distribution
dogs['age'].value_counts()

# Top 10 locations
dogs['location'].value_counts().head(10)


KeyError: 'breed'

In [34]:
dogs.columns


Index(['index', 'id', 'org_id', 'url', 'type.x', 'species', 'breed_primary',
       'breed_secondary', 'breed_mixed', 'breed_unknown', 'color_primary',
       'color_secondary', 'color_tertiary', 'age', 'sex', 'size', 'coat',
       'fixed', 'house_trained', 'declawed', 'special_needs', 'shots_current',
       'env_children', 'env_dogs', 'env_cats', 'name', 'status', 'posted',
       'contact_city', 'contact_state', 'contact_zip', 'contact_country',
       'stateQ', 'accessed', 'type.y', 'description'],
      dtype='object')

In [35]:
dogs['breed_primary'].value_counts().head(10)


breed_primary
Pit Bull Terrier                  7890
Labrador Retriever                7198
Chihuahua                         3766
Mixed Breed                       3242
Terrier                           2641
Hound                             2282
German Shepherd Dog               2122
Boxer                             2050
Shepherd                          1972
American Staffordshire Terrier    1862
Name: count, dtype: int64

In [36]:
dogs['age'].value_counts()


age
Adult     27955
Young     16194
Baby       9397
Senior     4634
Name: count, dtype: int64

In [37]:
dogs['sex'].value_counts()


sex
Male       30294
Female     27883
Unknown        3
Name: count, dtype: int64

In [38]:
dogs['status'].value_counts()


status
adoptable                   58147
2017-06-16T18:44:33+0000        1
2019-07-25T19:53:21+0000        1
2015-06-29T23:58:09+0000        1
2015-02-07T13:06:43+0000        1
2019-09-20T01:50:51+0000        1
2019-02-18T12:02:48+0000        1
2013-10-10T13:57:39+0000        1
2015-12-25T12:28:30+0000        1
2018-04-05T05:18:31+0000        1
2019-09-13T06:08:17+0000        1
2019-09-10T16:43:35+0000        1
2019-09-09T17:05:17+0000        1
2019-08-24T16:32:31+0000        1
2019-07-31T16:21:07+0000        1
2019-07-29T21:50:16+0000        1
2019-09-08T18:21:21+0000        1
2018-04-18T13:45:46+0000        1
2019-04-25T01:56:52+0000        1
2019-09-13T19:39:28+0000        1
2019-09-07T04:09:12+0000        1
2017-10-07T23:48:20+0000        1
2018-07-05T00:55:08+0000        1
2015-09-07T12:57:27+0000        1
2016-12-15T13:33:43+0000        1
2019-05-14T21:09:27+0000        1
2019-08-10T16:00:35+0000        1
2019-07-11T14:16:38+0000        1
2019-07-11T20:34:42+0000        1
2019-07

In [39]:
dogs['contact_city'].value_counts().head(10)


contact_city
Phoenix        756
Atlanta        710
Las Vegas      604
New York       589
Chamblee       495
Albuquerque    452
Seattle        436
Columbia       432
Washington     417
Columbus       414
Name: count, dtype: int64

In [40]:
def size_category(h):
    if h < 15:
        return 'Small'
    elif h < 25:
        return 'Medium'
    else:
        return 'Large'

breed_info['size_category'] = breed_info['height_in'].apply(size_category)
breed_info[['breed', 'height_in', 'size_category']].head()


Unnamed: 0,breed,height_in,size_category
0,Labrador Retriever,21.0,Medium
1,German Shepherd,22.0,Medium
2,Bulldog,12.0,Small
3,Poodle,10.0,Small
4,Beagle,13.0,Small


In [41]:
spending['category'].value_counts()
spending.groupby('category')['price'].mean().sort_values(ascending=False)


NameError: name 'spending' is not defined

In [42]:
spending = pd.read_csv("aliexpress_pet_supplies.csv")

# Peek at the data
print(spending.head())
print(spending.columns)

                                               title  averageStar  quantity  \
0  Mesh Litter Spatula Poop Remover Pet Cleaning ...          0.0      9993   
1  Rechargeable Mini Pet Communication Small Reco...          5.0      1986   
2  Dog Cooling Bed Mat Summer Puppy Cushion Soft ...          4.5       529   
3  Automatic Pet Feeder with Active RFID Technolo...          0.0        10   
4  Replace Plush Cat Toy Accessories Worms Replac...          4.9    102222   

   tradeAmount  wishedCount  
0       5 sold            0  
1      28 sold           64  
2      20 sold           38  
3       0 sold            1  
4  1,000+ sold          418  
Index(['title', 'averageStar', 'quantity', 'tradeAmount', 'wishedCount'], dtype='object')


In [47]:
# Load datasets
dogs = pd.read_csv("allDogDescriptions.csv")
breed_info = pd.read_csv("dog_breeds.csv")
spending = pd.read_csv("aliexpress_pet_supplies.csv")

# Check structure
print(dogs.columns)
print(breed_info.columns)
print(spending.columns)



Index(['index', 'id', 'org_id', 'url', 'type.x', 'species', 'breed_primary',
       'breed_secondary', 'breed_mixed', 'breed_unknown', 'color_primary',
       'color_secondary', 'color_tertiary', 'age', 'sex', 'size', 'coat',
       'fixed', 'house_trained', 'declawed', 'special_needs', 'shots_current',
       'env_children', 'env_dogs', 'env_cats', 'name', 'status', 'posted',
       'contact_city', 'contact_state', 'contact_zip', 'contact_country',
       'stateQ', 'accessed', 'type.y', 'description'],
      dtype='object')
Index(['Breed', 'Country of Origin', 'Fur Color', 'Height (in)',
       'Color of Eyes', 'Longevity (yrs)', 'Character Traits',
       'Common Health Problems'],
      dtype='object')
Index(['title', 'averageStar', 'quantity', 'tradeAmount', 'wishedCount'], dtype='object')


In [48]:
# Drop duplicates & NA values
dogs = dogs.drop_duplicates()
dogs = dogs.dropna(subset=['breed_primary', 'status'])



In [49]:
# Clean height and convert to numeric
breed_info['height_in'] = (breed_info['height_']
                           .astype(str)
                           .str.split('-').str[0]
                           .str.extract(r'(\d+)')
                           .astype(float))

# Create size category
def size_category(h):
    if h < 15: return 'Small'
    elif h < 25: return 'Medium'
    else: return 'Large'

breed_info['size_category'] = breed_info['height_in'].apply(size_category)


KeyError: 'height_'

In [50]:
breed_info.columns


Index(['Breed', 'Country of Origin', 'Fur Color', 'Height (in)',
       'Color of Eyes', 'Longevity (yrs)', 'Character Traits',
       'Common Health Problems'],
      dtype='object')

In [51]:
# Convert height to numeric (strip non-numeric characters if any)
breed_info['height_in'] = (
    breed_info['Height (in)']
    .astype(str)
    .str.extract(r'(\d+)')  # extract numbers
    .astype(float)
)


In [52]:
def size_category(h):
    if pd.isna(h):
        return "Unknown"
    elif h < 15:
        return "Small"
    elif 15 <= h < 25:
        return "Medium"
    else:
        return "Large"

breed_info['size_category'] = breed_info['height_in'].apply(size_category)


In [53]:
breed_info[['Breed', 'height_in', 'size_category']].head()


Unnamed: 0,Breed,height_in,size_category
0,Labrador Retriever,21.0,Medium
1,German Shepherd,22.0,Medium
2,Bulldog,12.0,Small
3,Poodle,10.0,Small
4,Beagle,13.0,Small


In [54]:
# Drop duplicates
spending = spending.drop_duplicates()

# Quick clean: remove rows with 0 trade amount or quantity
spending = spending[(spending['tradeAmount'] > 0) & (spending['quantity'] > 0)]


TypeError: '>' not supported between instances of 'str' and 'int'

In [55]:
# Ensure numeric conversion
spending['tradeAmount'] = pd.to_numeric(spending['tradeAmount'], errors='coerce')
spending['quantity'] = pd.to_numeric(spending['quantity'], errors='coerce')

spending = spending[(spending['tradeAmount'] > 0) & (spending['quantity'] > 0)]

# Drop duplicates
spending = spending.drop_duplicates()

# Convert to numeric
spending['tradeAmount'] = pd.to_numeric(spending['tradeAmount'], errors='coerce')
spending['quantity'] = pd.to_numeric(spending['quantity'], errors='coerce')

# Quick clean: remove rows with 0 trade amount or quantity
spending = spending[(spending['tradeAmount'] > 0) & (spending['quantity'] > 0)]


In [56]:
dogs['breed_primary'].value_counts().head(10)    # Top breeds
dogs['age'].value_counts()                       # Age distribution
dogs['sex'].value_counts()                       # Gender distribution
dogs['status'].value_counts()                    # Adoption status
dogs['contact_city'].value_counts().head(10)     # Top adoption cities


contact_city
Phoenix        756
Atlanta        710
Las Vegas      604
New York       589
Chamblee       495
Albuquerque    452
Seattle        436
Columbia       432
Washington     417
Columbus       414
Name: count, dtype: int64

In [57]:
breed_info['size_category'].value_counts()       # Size category distribution
breed_info.groupby('size_category')['longevity_'].mean()


KeyError: 'Column not found: longevity_'

In [58]:
# Standardize column names
breed_info = breed_info.rename(columns={
    'Height (in)': 'height_in',
    'Longevity (yrs)': 'longevity_'
})


In [59]:
breed_info['size_category'].value_counts()       # Size category distribution
breed_info.groupby('size_category')['longevity_'].mean()


TypeError: agg function failed [how->mean,dtype->object]

In [60]:
# Clean longevity column (convert ranges like "8-10" into average)
breed_info['longevity_'] = (
    breed_info['longevity_']
    .astype(str)
    .str.extract(r'(\d+)-?(\d+)?')  # capture first and optional second number
)

# Convert to numeric and average if there are two numbers
breed_info['longevity_'] = breed_info['longevity_'].apply(
    lambda x: (pd.to_numeric(x[0]) + pd.to_numeric(x[1])) / 2 if pd.notnull(x[1]) else pd.to_numeric(x[0])
)


ValueError: Columns must be same length as key

In [61]:
# Extract numbers from longevity ranges like "8-10"
longevity_nums = breed_info['longevity_'].astype(str).str.extract(r'(\d+)-?(\d+)?')

# Convert both parts to numeric
longevity_nums = longevity_nums.apply(pd.to_numeric, errors='coerce')

# Take average if two numbers exist, else just the first
breed_info['longevity_'] = longevity_nums.apply(
    lambda x: (x[0] + x[1]) / 2 if pd.notnull(x[1]) else x[0],
    axis=1
)


In [62]:
# Clean longevity column (convert ranges like "8-10" into average)
breed_info['longevity_'] = (
    breed_info['longevity_']
    .astype(str)
    .str.extract(r'(\d+)-?(\d+)?')  # capture first and optional second number
)

# Convert to numeric and average if there are two numbers
breed_info['longevity_'] = breed_info['longevity_'].apply(
    lambda x: (pd.to_numeric(x[0]) + pd.to_numeric(x[1])) / 2 if pd.notnull(x[1]) else pd.to_numeric(x[0])
)


ValueError: Columns must be same length as key

In [63]:
# Size category distribution
print(breed_info['size_category'].value_counts())

# Average longevity by size
print(breed_info.groupby('size_category')['longevity_'].mean())


size_category
Medium    53
Small     46
Large     18
Name: count, dtype: int64
size_category
Large     10.444444
Medium    12.556604
Small     13.315217
Name: longevity_, dtype: float64


In [64]:
# --- Clean Height ---
# Extract numbers from ranges like "10-12"
height_nums = breed_info['Height (in)'].astype(str).str.extract(r'(\d+)-?(\d+)?')

# Convert to numeric
height_nums = height_nums.apply(pd.to_numeric, errors='coerce')

# Take average if two numbers exist, else just the first
breed_info['height_in'] = height_nums.apply(
    lambda x: (x[0] + x[1]) / 2 if pd.notnull(x[1]) else x[0],
    axis=1
)

# --- Clean Longevity ---
# Extract numbers from ranges like "8-10"
longevity_nums = breed_info['longevity_'].astype(str).str.extract(r'(\d+)-?(\d+)?')

# Convert to numeric
longevity_nums = longevity_nums.apply(pd.to_numeric, errors='coerce')

# Take average if two numbers exist, else just the first
breed_info['longevity_'] = longevity_nums.apply(
    lambda x: (x[0] + x[1]) / 2 if pd.notnull(x[1]) else x[0],
    axis=1
)

# --- Define Size Category based on height ---
def size_category(h):
    if pd.isnull(h): 
        return "Unknown"
    elif h < 15:
        return "Small"
    elif h < 25:
        return "Medium"
    else:
        return "Large"

breed_info['size_category'] = breed_info['height_in'].apply(size_category)

# --- Check results ---
print(breed_info[['Breed', 'height_in', 'size_category', 'longevity_']].head())


KeyError: 'Height (in)'