In [1]:
import re
import pandas as pd
import numpy as np

<h4>Functions:</h4><BR>
<div>*fix_genre - receives the dataframe with the unorganized genres column, creates a column for each genre from a list, runs a regex on the existing rows to separate the genres (each genre starts with a capital letter), and then marks the column of each genre found in a row with 1</div><BR>
    <div>*numerize_col - receives the dataframe and a column name, turns said column from categorical to numerical value</div>

In [2]:
def fix_genre(df):
    """Expand the concatenated 'Genres' column into one 0/1 column per genre.

    Each value of ``df['Genres']`` is split with a regex into fragments that
    start at a capital letter. Fragments that name a known genre set that
    genre's column to 1 for the row; every other cell in the genre columns
    is 0.

    Rows are addressed by position, so the result is correct for any index
    (non-zero-based, non-contiguous or duplicated labels). Values that are
    not strings (e.g. NaN) are skipped and leave the row all zeros.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Genres' column. It is modified in place: one
        int64 column is added (or overwritten) per known genre.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the genre columns filled in.
    """
    pattern = '[A-Z][^A-Z]*'
    genre_list = ['Action','Adventure','Comedy','Drama','Ecchi','Fantasy', 'Horror', 'Mahou Shoujo', 'Mecha', 'Music','Mystery'
                 ,'Psychological', 'Romance','Sci-Fi', 'Slice of Life', 'Sports','Supernatural','Thriller']
    # The regex cuts at every capital letter, so genre names containing more
    # than one capital arrive as a leading fragment plus a trailing fragment.
    # The leading fragment is mapped back to the full name and the trailing
    # fragment is discarded.
    split_prefixes = {'Mahou ': 'Mahou Shoujo', 'Sci-': 'Sci-Fi', 'Slice of ': 'Slice of Life'}
    split_suffixes = {'Shoujo', 'Fi', 'Life'}
    genre_pos = {genre: pos for pos, genre in enumerate(genre_list)}

    # Collect the flags positionally, then attach them as columns in one go.
    flags = np.zeros((len(df), len(genre_list)), dtype='int64')
    for row_pos, genres in enumerate(df['Genres']):
        if not isinstance(genres, str):
            continue
        for check in re.findall(pattern, genres):
            if check in split_suffixes:
                continue
            check = split_prefixes.get(check, check)
            # Only known genres are flagged; unrelated columns of df are never touched.
            if check in genre_pos:
                flags[row_pos, genre_pos[check]] = 1

    for genre, pos in genre_pos.items():
        df[genre] = flags[:, pos]
    return df

In [3]:
def numerize_col(df, col_name):
    """Replace a categorical column with integer codes.

    The column ``col_name`` of ``df`` is overwritten in place with the codes
    produced by ``Series.factorize`` (one integer per distinct value).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode; modified in place.
    col_name : str
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``col_name`` now numeric.
    """
    # factorize() returns (codes, uniques); only the codes are kept.
    df[col_name] = df[col_name].factorize()[0]
    return df

<h4>Main Body:</h4>
    <p>The main purpose of the main body is to clean the dataframe and convert non-numerical values. I start by loading the dataframes from two CSVs and combining them into one dataframe; after that, I drop columns that are unnecessary or have too many missing values. In the next stage, after checking which rows have too many missing values, I get rid of them as well. I fill the missing 'Source' rows with 'Other'.
    
after cleaning the dataframe of missing values, I reset the indexes, and send the dataframe to fix_genre function to get a new dataframe with organized genre columns, after that I drop the old 'Genres' column.

Afterwards I clean 'Mean Score' and 'Format' of unnecessary characters, and convert 'Source' and 'Format' to numerical values using the numerize_col function.

Lastly I convert the 'Favorites', 'Popularity' and 'Mean Score' columns from whatever dtypes they were to int64 and save the clean dataframe in a new .csv file.</p>

In [4]:
# Load the two raw data files that are combined in the next step.
df_1, df_2 = (
    pd.read_csv(csv_path)
    for csv_path in ('./Anime_Data_Frame_1', './Anime_Data_Frame_2')
)

In [9]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way. Row labels of both inputs are kept as-is, so labels can repeat
# until the index is reset further down.
complete_df = pd.concat([df_1, df_2])

<div>Dropping unnecessary columns</div>

In [None]:
# Columns that are not carried into the cleaned data set.
drop_list = [
    'Unnamed: 0', 'Average Score', 'Duration', 'English', 'EpisodeDuration',
    'Episodes', 'Studios', 'Hashtag', 'Synonyms', 'Producers', 'Status',
    'Romaji', 'Native',
]
col_dropped_df = complete_df.drop(columns=drop_list)

In [None]:
# Display the frame that remains after the column drop.
col_dropped_df

<div>Checking null values, and dropping rows accordingly.

I prefer to fill Source rather than dropping those extra rows because all unknown values can be filled with 'Other'</div>

In [None]:
# Count of missing values per column, before any rows are dropped.
col_dropped_df.isnull().sum()

In [None]:
# A row is discarded when any of these columns is missing.
row_drop_list = ['Mean Score', 'Popularity', 'Favorites', 'Genres']
preprocessed_df = col_dropped_df.dropna(subset=row_drop_list)


In [None]:
# Count of missing values per column after the row drop.
preprocessed_df.isnull().sum()

In [None]:
# Fill missing 'Source' values at frame level and rebind the result. An
# in-place fillna on a selected column may act on a temporary object and leave
# the frame unchanged (it is a no-op under pandas Copy-on-Write).
preprocessed_df = preprocessed_df.fillna({'Source': 'Other'})
preprocessed_df.isnull().sum()

In [None]:
# List the distinct 'Source' values present after the fill.
preprocessed_df['Source'].unique()

In [None]:
# Count of missing values per column (same check as in the fill cell above).
preprocessed_df.isnull().sum()

In [None]:
# Renumber the rows 0..n-1 and discard the old labels, then list the raw
# 'Format' values.
preprocessed_df = preprocessed_df.reset_index(drop=True)
preprocessed_df['Format'].unique()

In [None]:
# Map each distinct 'Format' value to its position in order of first appearance.
format_dict = {
    format_name: format_code
    for format_code, format_name in enumerate(preprocessed_df['Format'].unique())
}

In [None]:
format_dict ## displayed to keep a record of each format's number

In [None]:
# Add the per-genre indicator columns, then discard the raw 'Genres' text column.
genred_df = fix_genre(preprocessed_df).drop(columns='Genres')

<p>'Mean Score' and 'Format' have unnecessary characters that are in the way of numerizing the values, so we will clean them up before turning them into numbers.</p>

In [None]:
# Strip the percent sign so 'Mean Score' can later be cast to an integer.
genred_df['Mean Score'] = genred_df['Mean Score'].str.replace("%", "", regex=False)
# Both removals must go through the .str accessor: a plain Series.replace only
# matches cells whose whole value equals "\n", so embedded newlines would
# survive. regex=False keeps the patterns literal.
genred_df['Format'] = (
    genred_df['Format']
    .str.replace("/n", "", regex=False)
    .str.replace("\n", "", regex=False)
)

In [None]:
# Inspect the cleaned values. genred_df is a separate frame produced by drop(),
# so the cleanup is not visible through preprocessed_df.
genred_df['Format'].unique()

In [None]:
# Encode 'Source' first, then 'Format', as integer codes and show the result.
numerized_df = numerize_col(numerize_col(genred_df, 'Source'), 'Format')
numerized_df

In [None]:
# Cast the remaining numeric columns to int64; a value that cannot be
# converted raises (the astype default), then show the resulting dtypes.
int64_columns = ('Favorites', 'Popularity', 'Mean Score')
numerized_df = numerized_df.astype({column: 'int64' for column in int64_columns})
numerized_df.dtypes

In [None]:
# Display the final frame before it is written to disk.
numerized_df

In [None]:
# Write the cleaned frame to disk (the file name carries no extension).
# NOTE(review): index_label=False only omits the header field for the index;
# the index values themselves are presumably still written as the first
# column. Confirm whether index=False was intended.
numerized_df.to_csv('clean_anime_dataframe', index_label = False)