### Import Libraries

In [55]:
import pandas as pd
import pyodbc
import numpy as np
import ast

### Load cleaned CSV files

In [56]:
anime_df = pd.read_csv('../data/cleaned_anime.csv')
ratings_df = pd.read_csv('../data/cleaned_ratings.csv')

In [57]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12269 entries, 0 to 12268
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   anime_id    12269 non-null  int64  
 1   name        12269 non-null  object 
 2   genre       12269 non-null  object 
 3   type        12269 non-null  object 
 4   episodes    12269 non-null  object 
 5   rating      12064 non-null  float64
 6   members     12269 non-null  int64  
 7   genre_list  12269 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 766.9+ KB


In [58]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813736 entries, 0 to 7813735
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


In [59]:
anime_values_df = anime_df.copy()

In [60]:
anime_values_df.drop(['genre','genre_list'],axis=1,inplace=True)
anime_values_df.reset_index(drop=True,inplace=True)


In [61]:
# Strip spaces and convert invalid entries to NaN
anime_values_df['rating'] = anime_values_df['rating'].astype(str).str.strip()
anime_values_df['rating'] = anime_values_df['rating'].replace(
    ['', ' ', 'nan', 'NaN', 'None', 'NONE', 'null', 'NULL', '?'], 
    np.nan
)
# Convert to float
anime_values_df['rating'] = pd.to_numeric(anime_values_df['rating'], errors='coerce')
# Replace NaN with Python None for SQL insertion
anime_values_df['rating'] = anime_values_df['rating'].where(pd.notnull(anime_df['rating']), None)


### Connect to sql server

In [62]:
conn = pyodbc.connect(
    'DRIVER={ODBC Driver 17 for SQL Server};'
    'SERVER=LAPTOP-RES15OPT\SQLEXPRESS;'
    'DATABASE=AnimeDB;'
    'Trusted_Connection=yes;'
)

In [63]:
cursor = conn.cursor()

In [64]:
cursor.execute("select DB_NAME()")
print(cursor.fetchone())

('AnimeDB',)


### Insert into Anime Table

In [81]:

cursor.fast_executemany = True

anime_values = list(anime_df[['anime_id','name','type','episodes','rating','members']].itertuples(index=False, name=None))
count = 0 
for row in anime_values:
    try:
        cursor.execute("""
           truncate table anime
        """, row)
    except Exception as e:
        print(f" Problem row:{count+1}", row)
        print("Error:", e)
        continue  # skip bad row

conn.commit()
print(" Anime table ETL complete!")


 Problem row:1 (32281, 'Kimi no Na wa.', 'Movie', '1', 9.37, 200630)
Error: ('The SQL contains 0 parameter markers, but 6 parameters were supplied', 'HY000')
 Problem row:1 (5114, 'Fullmetal Alchemist: Brotherhood', 'TV', '64', 9.26, 793665)
Error: ('The SQL contains 0 parameter markers, but 6 parameters were supplied', 'HY000')
 Problem row:1 (28977, 'Gintama°', 'TV', '51', 9.25, 114262)
Error: ('The SQL contains 0 parameter markers, but 6 parameters were supplied', 'HY000')
 Problem row:1 (9253, 'Steins;Gate', 'TV', '24', 9.17, 673572)
Error: ('The SQL contains 0 parameter markers, but 6 parameters were supplied', 'HY000')
 Problem row:1 (9969, 'Gintama&#039;', 'TV', '51', 9.16, 151266)
Error: ('The SQL contains 0 parameter markers, but 6 parameters were supplied', 'HY000')
 Problem row:1 (32935, 'Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou', 'TV', '10', 9.15, 93351)
Error: ('The SQL contains 0 parameter markers, but 6 parameters were supplied', 'HY000')
 Problem row:1 

### Insert into genres table

In [76]:
anime_df['genre_list'] = anime_df['genre_list'].apply(ast.literal_eval)

all_genres = anime_df.explode('genre_list').reset_index(drop=True)
genres_df = pd.DataFrame({'genre_name':all_genres['genre_list']})
genres_df.head()

Unnamed: 0,genre_name
0,drama
1,romance
2,school
3,supernatural
4,action


In [77]:
genres_df['genre_name'] = genres_df['genre_name'].str.strip()
genres_df.drop_duplicates(ignore_index=True,inplace=True)

In [78]:
genres_df= genres_df.sort_values(by='genre_name',axis=0)

In [79]:
genres_df.reset_index(drop=True,inplace=True)
genres_df

Unnamed: 0,genre_name
0,action
1,adventure
2,cars
3,comedy
4,dementia
5,demons
6,drama
7,ecchi
8,fantasy
9,game


In [None]:
print("inserting genres...")

genre_values = list(genres_df.itertuples(index=False,name=None))

cursor.executemany(
    '''
        INSERT INTO Genre(genre_name)
        VALUES (?)
    ''',genre_values
)

conn.commit()

inserting genres...


ProgrammingError: ('42000', "[42000] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Cannot truncate table 'genres' because it is being referenced by a FOREIGN KEY constraint. (4712) (SQLExecDirectW)")

In [82]:
cursor.execute('select * from genres')
cursor.fetchall()

[(1, "['action', ' adventure', ' cars', ' comedy', ' sci-fi', ' shounen']"),
 (2, "['action', ' adventure', ' cars', ' mecha', ' sci-fi', ' shounen', ' sports']"),
 (3, "['action', ' adventure', ' cars', ' sci-fi']"),
 (4, "['action', ' adventure', ' comedy', ' demons', ' drama', ' ecchi', ' horror', ' mystery', ' romance', ' sci-fi']"),
 (5, "['action', ' adventure', ' comedy', ' demons', ' fantasy', ' magic', ' romance', ' shounen', ' supernatural']"),
 (6, "['action', ' adventure', ' comedy', ' demons', ' fantasy', ' magic']"),
 (7, "['action', ' adventure', ' comedy', ' demons', ' fantasy', ' martial arts', ' shounen', ' super power']"),
 (8, "['action', ' adventure', ' comedy', ' demons', ' magic', ' super power']"),
 (9, "['action', ' adventure', ' comedy', ' demons', ' shounen', ' supernatural']"),
 (10, "['action', ' adventure', ' comedy', ' demons', ' supernatural', ' vampire']"),
 (11, "['action', ' adventure', ' comedy', ' drama', ' ecchi', ' fantasy', ' harem', ' magic', ' 