In [11]:
import pandas as pd

# Loading dataset
df = pd.read_csv("anime-dataset-2023.csv")

print(df.head())

print(df.info())

print(df.describe())


   anime_id                             Name             English name  \
0         1                     Cowboy Bebop             Cowboy Bebop   
1         5  Cowboy Bebop: Tengoku no Tobira  Cowboy Bebop: The Movie   
2         6                           Trigun                   Trigun   
3         7               Witch Hunter Robin       Witch Hunter Robin   
4         8                   Bouken Ou Beet   Beet the Vandel Buster   

                         Other name Score  \
0                         カウボーイビバップ  8.75   
1                    カウボーイビバップ 天国の扉  8.38   
2                             トライガン  8.22   
3  Witch Hunter ROBIN (ウイッチハンターロビン)  7.25   
4                            冒険王ビィト  6.94   

                                 Genres  \
0         Action, Award Winning, Sci-Fi   
1                        Action, Sci-Fi   
2             Action, Adventure, Sci-Fi   
3  Action, Drama, Mystery, Supernatural   
4      Adventure, Fantasy, Supernatural   

                               

In [12]:
# missing values
print(df.isnull().sum())

anime_id        0
Name            0
English name    0
Other name      0
Score           0
Genres          0
Synopsis        0
Type            0
Episodes        0
Aired           0
Premiered       0
Status          0
Producers       0
Licensors       0
Studios         0
Source          0
Duration        0
Rating          0
Rank            0
Popularity      0
Favorites       0
Scored By       0
Members         0
Image URL       0
dtype: int64


In [13]:
# Data cleaning
df['Score'] = pd.to_numeric(df['Score'], errors='coerce')
df['Episodes'] = pd.to_numeric(df['Episodes'], errors='coerce')
df['Rank'] = pd.to_numeric(df['Rank'], errors='coerce')
df['Scored By'] = pd.to_numeric(df['Scored By'], errors='coerce')

df['Genres'] = df['Genres'].astype(str).str.strip().str.lower()

df['Aired'] = pd.to_datetime(df['Aired'], errors='coerce')

print(df.dtypes)

  df['Aired'] = pd.to_datetime(df['Aired'], errors='coerce')


anime_id                 int64
Name                    object
English name            object
Other name              object
Score                  float64
Genres                  object
Synopsis                object
Type                    object
Episodes               float64
Aired           datetime64[ns]
Premiered               object
Status                  object
Producers               object
Licensors               object
Studios                 object
Source                  object
Duration                object
Rating                  object
Rank                   float64
Popularity               int64
Favorites                int64
Scored By              float64
Members                  int64
Image URL               object
dtype: object


In [14]:
df['Aired_From'] = split_cols[0]

# Only assign Aired_To if second column exists
if split_cols.shape[1] > 1:
    df['Aired_To'] = split_cols[1]
else:
    df['Aired_To'] = None

# Convert both to datetime
df['Aired_From'] = pd.to_datetime(df['Aired_From'], errors='coerce')
df['Aired_To'] = pd.to_datetime(df['Aired_To'], errors='coerce')

# Extract Release Year
df['Release_Year'] = df['Aired_From'].dt.year

# Check the result
print(df[['Aired', 'Aired_From', 'Aired_To', 'Release_Year']].head(10))





       Aired Aired_From Aired_To  Release_Year
0        NaT        NaT      NaT           NaN
1 2001-09-01 2001-09-01      NaT        2001.0
2        NaT        NaT      NaT           NaN
3        NaT        NaT      NaT           NaN
4        NaT        NaT      NaT           NaN
5        NaT        NaT      NaT           NaN
6        NaT        NaT      NaT           NaN
7        NaT        NaT      NaT           NaN
8        NaT        NaT      NaT           NaN
9        NaT        NaT      NaT           NaN


In [15]:
# animes have multiple genres so I am sorting the main anime genre
# Split genres into a list
df['Genre_List'] = df['Genres'].apply(lambda x: [g.strip() for g in x.split(',')])

# Extract the main  genre for simple analysis
df['Main_Genre'] = df['Genre_List'].apply(lambda x: x[0] if len(x) > 0 else None)

print(df[['Name', 'Genres', 'Main_Genre']].head())

                              Name                                Genres  \
0                     Cowboy Bebop         action, award winning, sci-fi   
1  Cowboy Bebop: Tengoku no Tobira                        action, sci-fi   
2                           Trigun             action, adventure, sci-fi   
3               Witch Hunter Robin  action, drama, mystery, supernatural   
4                   Bouken Ou Beet      adventure, fantasy, supernatural   

  Main_Genre  
0     action  
1     action  
2     action  
3     action  
4  adventure  


In [16]:
#here i'm going to remove duplicates and invalid values
df = df.drop_duplicates(subset=['Name'])
df = df[(df['Score'] > 0) & (df['Episodes'] > 0)]
df = df.reset_index(drop=True)

print("✅ After cleaning:")
print(df.shape)

✅ After cleaning:
(15604, 29)


In [17]:
# Info summary
print(df.info())

print(df.head())

# Quick descriptive statistics
print(df[['Score', 'Episodes', 'Rank', 'Members']].describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15604 entries, 0 to 15603
Data columns (total 29 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   anime_id      15604 non-null  int64         
 1   Name          15604 non-null  object        
 2   English name  15604 non-null  object        
 3   Other name    15604 non-null  object        
 4   Score         15604 non-null  float64       
 5   Genres        15604 non-null  object        
 6   Synopsis      15604 non-null  object        
 7   Type          15604 non-null  object        
 8   Episodes      15604 non-null  float64       
 9   Aired         7761 non-null   datetime64[ns]
 10  Premiered     15604 non-null  object        
 11  Status        15604 non-null  object        
 12  Producers     15604 non-null  object        
 13  Licensors     15604 non-null  object        
 14  Studios       15604 non-null  object        
 15  Source        15604 non-null  object

In [18]:
# Export to CSV
df.to_csv("cleaned_anime.csv", index=False)

print("✅ Cleaned data successfully saved to cleaned_anime.csv")


✅ Cleaned data successfully saved to cleaned_anime.csv
