Encode the studios column: count how often each studio appears, then one-hot encode 15 of the most frequent studios.

In [1]:
import pandas as pd
from collections import Counter

In [2]:
# Load the dataset; the cells below expect a 'studios' column holding comma-separated studio names.
df = pd.read_csv('Data/studios.csv')


In [6]:
def count_studios(df, column='studios'):
    """Count studio occurrences in a column of comma-separated studio names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the studio strings.
    column : str, default 'studios'
        Name of the column whose values are comma-separated studio names.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'Studio'`` and ``'Count'``, ordered by ``'Count'``
        descending with a fresh 0..n-1 index. Studios with equal counts keep
        the order in which they were first encountered.

    Notes
    -----
    Missing values in ``column`` are skipped, and empty names (from blank
    cells or stray commas such as ``"A,,B"``) are not counted.
    """
    studio_counter = Counter()

    # dropna() skips missing entries, which would otherwise fail on .split().
    for studios in df[column].dropna():
        names = (name.strip() for name in studios.split(','))
        # Discard empty tokens so '' is never reported as a studio.
        studio_counter.update(name for name in names if name)

    # most_common() yields (studio, count) pairs already sorted by count
    # descending, with ties in first-seen order, so no extra sort is needed.
    return pd.DataFrame(studio_counter.most_common(), columns=['Studio', 'Count'])

# Tally every studio in the 'studios' column and preview the 15 most frequent.
studio_counts = count_studios(df, column='studios')
studio_counts.head(15)

Unnamed: 0,Studio,Count
0,Toei Animation,825
1,Sunrise,517
2,J.C.Staff,407
3,Madhouse,360
4,TMS Entertainment,338
5,Production I.G,329
6,Studio Deen,306
7,Pierrot,268
8,OLM,263
9,Shin-Ei Animation,217


In [7]:
# Studios that each get their own boolean indicator column.
top_studios = [
    "Toei Animation", "Sunrise", "J.C.Staff", "Madhouse", 
    "TMS Entertainment", "Production I.G", "Studio Deen", 
    "Pierrot", "OLM", "Shin-Ei Animation", "A-1 Pictures", 
    "Nippon Animation", "AIC", "DLE", "Tatsunoko Production"
]

# Indicator column name per studio: spaces become underscores and the name is
# lower-cased; other punctuation is kept (e.g. 'studio_j.c.staff').
studio_columns = {
    studio: f'studio_{studio.replace(" ", "_").lower()}' for studio in top_studios
}

# Parse each row once into a set of stripped studio names. Missing values are
# treated as an empty entry, so they match no studio instead of failing on .split().
studios_per_row = df['studios'].fillna('').map(
    lambda entry: {name.strip() for name in entry.split(',')}
)

# Build each indicator as a whole column (exact-name membership test) rather than
# writing individual cells inside a row-by-row loop.
for studio, column_name in studio_columns.items():
    df[column_name] = studios_per_row.map(
        lambda names, studio=studio: studio in names
    ).astype(bool)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13649 entries, 0 to 13648
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   anime_id                     13649 non-null  int64  
 1   title                        13649 non-null  object 
 2   episodes                     13649 non-null  float64
 3   studios                      13649 non-null  object 
 4   score                        13649 non-null  float64
 5   popularity                   13649 non-null  object 
 6   rank                         13649 non-null  object 
 7   studio_toei_animation        13649 non-null  bool   
 8   studio_sunrise               13649 non-null  bool   
 9   studio_j.c.staff             13649 non-null  bool   
 10  studio_madhouse              13649 non-null  bool   
 11  studio_tms_entertainment     13649 non-null  bool   
 12  studio_production_i.g        13649 non-null  bool   
 13  studio_studio_de

In [8]:
# The raw 'studios' text is no longer needed once the indicator columns exist.
# Reassign instead of using inplace=True, which gives no benefit and prevents chaining.
df = df.drop(columns='studios')

In [9]:
# Re-inspect the schema to confirm the 'studios' column is gone.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13649 entries, 0 to 13648
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   anime_id                     13649 non-null  int64  
 1   title                        13649 non-null  object 
 2   episodes                     13649 non-null  float64
 3   score                        13649 non-null  float64
 4   popularity                   13649 non-null  object 
 5   rank                         13649 non-null  object 
 6   studio_toei_animation        13649 non-null  bool   
 7   studio_sunrise               13649 non-null  bool   
 8   studio_j.c.staff             13649 non-null  bool   
 9   studio_madhouse              13649 non-null  bool   
 10  studio_tms_entertainment     13649 non-null  bool   
 11  studio_production_i.g        13649 non-null  bool   
 12  studio_studio_deen           13649 non-null  bool   
 13  studio_pierrot  

In [10]:
# Save the encoded frame as CSV without writing the index.
# NOTE(review): the input was read from 'Data/studios.csv' but this writes 'studios.csv'
# relative to the working directory — confirm the intended output location.
df.to_csv('studios.csv', index = False)