## Examples - AutoAD

In [14]:
# Install the Kaggle API client; the %pip magic installs into the environment of the running kernel.
%pip install kaggle

Note: you may need to restart the kernel to use updated packages.


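You can find your Kaggle API username and key in your Kaggle account under:
1. Settings
2. API - Create New Token

Store them as `KAGGLE_USERNAME` and `KAGGLE_KEY` in the `../.env` file, which the next cell loads.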

In [15]:
import os
from dotenv import load_dotenv

# Create the dataset directory if it doesn't exist yet. `exist_ok=True` makes
# the call a no-op when the directory is already there, so no separate
# existence check is needed and re-running this cell is safe.
os.makedirs("datasets", exist_ok=True)

# Load the API credentials from the .env file at ../.env (resolved relative to
# the current working directory) into the process environment.
load_dotenv(dotenv_path="../.env")

# Each value is None when the corresponding environment variable is not set.
# NOTE(review): no visible cell reads these two names — presumably the kaggle
# client picks the credentials up from the environment itself; confirm.
kaggle_username = os.getenv("KAGGLE_USERNAME")
kaggle_password = os.getenv("KAGGLE_KEY")

In [16]:
# Import the Kaggle client and authenticate it.
# NOTE(review): presumably this relies on the credentials loaded from ../.env
# in the previous cell, so run that cell first — confirm against the kaggle
# client documentation.
import kaggle
kaggle.api.authenticate()

### IMDB Top 1000 Movies and TV Shows - Example

In [17]:
# Download the IMDB dataset archive into ./datasets and unpack it there.
kaggle.api.dataset_download_files(
    'harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows',
    unzip=True,
    path='./datasets',
)

Dataset URL: https://www.kaggle.com/datasets/harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows


In [18]:

import pandas as pd
from pyod.models.iforest import IForest
from sklearn import set_config

from autoad.autoad import AutoAD

# Ask scikit-learn transformers to return pandas objects.
set_config(transform_output="pandas")

# IMDB table that the previous cell downloaded into ./datasets.
X = pd.read_csv("./datasets/imdb_top_1000.csv")

# Fit the AutoAD pipeline with a seeded Isolation Forest as the detector.
pipeline_ad = AutoAD()
pipeline_ad.fit(clf_ad=IForest(random_state=2), X=X)

# Run the fitted pipeline over the same data.
X_output = pipeline_ad.transform(X=X)

In [19]:
# Show the `feature_importances` attribute of the pipeline fitted above
# (rendered below as a Feature / Importance table).
pipeline_ad.feature_importances

Unnamed: 0,Feature,Importance
3,Meta_score,0.013148
51,categorical__Genre_6,0.011810
105,categorical__Star4_2,0.011657
26,categorical__Released_Year_1,0.011553
4,No_of_Votes,0.011525
...,...,...
134,missingindicator_Star2,0.000000
135,missingindicator_Star3,0.000000
136,missingindicator_Star4,0.000000
137,missingindicator_No_of_Votes,0.000000


In [20]:
# Preview up to the first 15 rows of the transformed IMDB frame; the output
# shows the AD_score, MAD_Total and Tukey_Total columns next to the input columns.
X_output.head(15)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross,AD_score,MAD_Total,Tukey_Total
867,https://m.media-amazon.com/images/M/MV5BYTExYj...,Invasion of the Body Snatchers,1956,Approved,80 min,"Drama, Horror, Sci-Fi",7.7,A small-town doctor learns that the population...,92.0,Don Siegel,Kevin McCarthy,Dana Wynter,Larry Gates,King Donovan,44839,,0.019019,1,0
991,https://m.media-amazon.com/images/M/MV5BMzAyND...,Kelly's Heroes,1970,GP,144 min,"Adventure, Comedy, War",7.6,A group of U.S. soldiers sneaks across enemy l...,50.0,Brian G. Hutton,Clint Eastwood,Telly Savalas,Don Rickles,Carroll O'Connor,45338,1378435.0,0.017185,1,0
942,https://m.media-amazon.com/images/M/MV5BODNiZm...,The Butterfly Effect,2004,U,113 min,"Drama, Sci-Fi, Thriller",7.6,Evan Treborn suffers blackouts during signific...,30.0,Eric Bress,J. Mackye Gruber,Ashton Kutcher,Amy Smart,Melora Walters,451479,57938693.0,0.015501,1,1
909,https://m.media-amazon.com/images/M/MV5BMjI2OD...,Celda 211,2009,,113 min,"Action, Adventure, Crime",7.6,The story of two men on different sides of a p...,,Daniel Monzón,Luis Tosar,Alberto Ammann,Antonio Resines,Manuel Morón,63882,,0.015366,1,0
566,https://m.media-amazon.com/images/M/MV5BZTY3Yj...,King Kong,1933,Passed,100 min,"Adventure, Horror, Sci-Fi",7.9,A film crew goes to a tropical island for an e...,90.0,Merian C. Cooper,Ernest B. Schoedsack,Fay Wray,Robert Armstrong,Bruce Cabot,78991,10000000.0,0.015299,1,0
924,https://m.media-amazon.com/images/M/MV5BMTI5Mj...,Huo Yuan Jia,2006,PG-13,104 min,"Action, Biography, Drama",7.6,A biography of Chinese Martial Arts Master Huo...,70.0,Ronny Yu,Jet Li,Li Sun,Yong Dong,Yun Qu,72863,24633730.0,0.012414,1,0
568,https://m.media-amazon.com/images/M/MV5BMTAxYj...,Nosferatu,1922,,94 min,"Fantasy, Horror",7.9,Vampire Count Orlok expresses interest in a ne...,,F.W. Murnau,Max Schreck,Alexander Granach,Gustav von Wangenheim,Greta Schröder,88794,,0.012252,1,0
913,https://m.media-amazon.com/images/M/MV5BMzc0Zm...,Die Welle,2008,,107 min,"Drama, Thriller",7.6,A high school teacher's experiment to demonstr...,,Dennis Gansel,Jürgen Vogel,Frederick Lau,Max Riemelt,Jennifer Ulrich,102742,,0.011459,1,0
959,https://m.media-amazon.com/images/M/MV5BMGExOG...,Dark City,1998,A,100 min,"Mystery, Sci-Fi, Thriller",7.6,"A man struggles with memories of his past, whi...",66.0,Alex Proyas,Rufus Sewell,Kiefer Sutherland,Jennifer Connelly,William Hurt,187927,14378331.0,0.010952,1,1
495,https://m.media-amazon.com/images/M/MV5BMzQ5NG...,The Man from Earth,2007,,87 min,"Drama, Fantasy, Mystery",7.9,An impromptu goodbye party for Professor John ...,,Richard Schenkman,David Lee Smith,Tony Todd,John Billingsley,Ellen Crawford,174125,,0.010801,1,1


### FIFA - Example

In [21]:
import pandas as pd
from pyod.models.pca import PCA
from sklearn import set_config

from autoad.autoad import AutoAD

# FIFA - Dataset: download the archive into ./datasets and unpack it there.
kaggle.api.dataset_download_files(
    'maso0dahmed/football-players-data',
    unzip=True,
    path='./datasets',
)

# Ask scikit-learn transformers to return pandas objects.
set_config(transform_output="pandas")

X = pd.read_csv("./datasets/fifa_players.csv")

# Fit the AutoAD pipeline with a default pyod PCA detector.
pipeline_ad = AutoAD()
pipeline_ad.fit(clf_ad=PCA(), X=X)

# Run the fitted pipeline over the same data and preview the first five rows.
X_output = pipeline_ad.transform(X=X)

X_output.head(5)

Dataset URL: https://www.kaggle.com/datasets/maso0dahmed/football-players-data


Unnamed: 0,name,full_name,birth_date,age,height_cm,weight_kgs,positions,nationality,overall_rating,potential,...,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle,AD_score,MAD_Total,Tukey_Total
6618,A. Singh,Amrinder Singh,5/27/1993,25,185.42,81.2,GK,India,58,62,...,12,36,30,57,16,17,17,9.769237e+36,3,3
6894,K. Malinov,Kristiyan Malinov,3/30/1994,24,170.18,64.9,CM,Bulgaria,59,65,...,53,59,42,61,53,61,59,9.742714000000001e+36,3,3
4163,A. Thapa,Anirudh Thapa,1/15/1998,21,170.18,64.0,"CM,CDM",India,59,71,...,42,56,52,60,47,56,49,9.741943000000001e+36,3,3
17944,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,2/5/1985,34,187.96,83.0,"ST,LW",Portugal,94,94,...,95,82,85,95,28,31,23,7.609604e+36,4,4
17943,Neymar Jr,Neymar da Silva Santos Junior,2/5/1992,27,175.26,68.0,"LW,CAM",Brazil,92,92,...,89,88,81,94,27,24,33,7.565e+36,4,4


### NFL Football Player Stats - Example

In [22]:
import pandas as pd
from pyod.models.pca import PCA
from sklearn import set_config

from autoad.autoad import AutoAD

# NFL Football Player Stats - Dataset: download into ./datasets and unpack.
kaggle.api.dataset_download_files(
    'zynicide/nfl-football-player-stats',
    unzip=True,
    path='./datasets',
)

# Ask scikit-learn transformers to return pandas objects.
set_config(transform_output="pandas")

# The player profiles are read from the JSON file in ./datasets.
X = pd.read_json("./datasets/profiles_1512362725.022629.json")

# Fit the AutoAD pipeline with a default pyod PCA detector.
pipeline_ad = AutoAD()
pipeline_ad.fit(clf_ad=PCA(), X=X)

# Run the fitted pipeline over the same data and preview the first five rows.
X_output = pipeline_ad.transform(X=X)

X_output.head(5)

Dataset URL: https://www.kaggle.com/datasets/zynicide/nfl-football-player-stats


Unnamed: 0,player_id,name,position,height,weight,current_team,birth_date,birth_place,death_date,college,high_school,draft_team,draft_round,draft_position,draft_year,current_salary,hof_induction_year,AD_score,MAD_Total,Tukey_Total
24703,22945,Alejandro Villanueva,OL,6-9,277.0,Pittsburgh Steelers,1988-09-22,"Maridian, MS",,Army,"Shape, Belgium",,,,,1500000,,3.610718e+37,2,1
22607,2544,Antonio Brown,WR,5-10,180.0,Pittsburgh Steelers,1988-07-10,"Miami, FL",,Central Michigan,"Norland, FL",Pittsburgh Steelers,6.0,195.0,2010.0,910000,,3.597738e+37,2,0
21081,1465,Le'Veon Bell,RB,6-2,230.0,Pittsburgh Steelers,1992-02-18,"Reynoldsburg, OH",,Michigan St.,"Groveport Madison, OH",Pittsburgh Steelers,2.0,48.0,2013.0,12120000,,3.594448e+37,2,0
17100,18977,Ben Roethlisberger,QB,6-5,240.0,Pittsburgh Steelers,1982-03-02,"Lima, OH",,Miami (OH),"Findlay, OH",Pittsburgh Steelers,1.0,11.0,2004.0,12000000,,3.513509e+37,2,1
17856,11648,Landry Jones,QB,6-4,225.0,Pittsburgh Steelers,1989-04-04,"Artesia, NM",,Oklahoma,"Artesia, NM",Pittsburgh Steelers,4.0,115.0,2013.0,1900000,,3.508909e+37,2,1


### Titanic - Example

In [23]:
import pandas as pd
from pyod.models.pca import PCA
from sklearn import set_config

from autoad.autoad import AutoAD

# Titanic - Dataset: download the archive into ./datasets and unpack it there.
kaggle.api.dataset_download_files(
    'brendan45774/test-file',
    unzip=True,
    path='./datasets',
)

# Ask scikit-learn transformers to return pandas objects.
set_config(transform_output="pandas")

X = pd.read_csv("./datasets/tested.csv")

# Fit the AutoAD pipeline with a default pyod PCA detector.
pipeline_ad = AutoAD()
pipeline_ad.fit(clf_ad=PCA(), X=X)

# Run the fitted pipeline over the same data and preview up to 25 rows.
X_output = pipeline_ad.transform(X=X)

X_output.head(25)

Dataset URL: https://www.kaggle.com/datasets/brendan45774/test-file


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,AD_score,MAD_Total,Tukey_Total
152,1044,0,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,4.171676e+35,2,1
343,1235,1,1,"Cardeza, Mrs. James Warburton Martinez (Charlo...",female,58.0,0,1,PC 17755,512.3292,B51 B53 B55,C,2.55455e+35,3,2
365,1257,1,3,"Sage, Mrs. John (Annie Bullen)",female,,1,9,CA. 2343,69.55,,S,2.134474e+35,2,2
342,1234,0,3,"Sage, Mr. John George",male,,1,9,CA. 2343,69.55,,S,2.095256e+35,2,2
188,1080,1,3,"Sage, Miss. Ada",female,,8,2,CA. 2343,69.55,,S,2.0822729999999997e+35,2,2
96,988,1,1,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76.0,1,0,19877,78.85,C46,S,2.0748149999999997e+35,3,3
360,1252,0,3,"Sage, Master. William Henry",male,14.5,8,2,CA. 2343,69.55,,S,2.0452579999999998e+35,2,2
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C,2.024485e+35,3,2
69,961,1,1,"Fortune, Mrs. Mark (Mary McDougald)",female,60.0,1,4,19950,263.0,C23 C25 C27,S,1.978308e+35,3,2
390,1282,0,1,"Payne, Mr. Vivian Ponsonby",male,23.0,0,0,12749,93.5,B24,S,1.946221e+35,3,2


## Dataset transformation only (without anomaly detection)

In [None]:
# Reload the Titanic CSV and fit AutoAD without passing a detector
# (no `clf_ad` argument).
# This cell has no imports of its own: `pd` and `AutoAD` come from earlier cells.
X = pd.read_csv("./datasets/tested.csv")
pipeline_ad = AutoAD()
pipeline_ad.fit(X=X)

# Transform the same data with the fitted pipeline.
X_output = pipeline_ad.transform(X=X)

# Preview the first five rows of the result.
X_output.head(5)

Unnamed: 0,Tukey_Total,MAD_Total,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,categorical__Name_0,...,missingindicator_Pclass,missingindicator_Name,missingindicator_Sex,missingindicator_Age,missingindicator_SibSp,missingindicator_Parch,missingindicator_Ticket,missingindicator_Fare,missingindicator_Cabin,missingindicator_Embarked
0,1,2,892.0,0.0,3.0,34.5,0.0,0.0,7.8292,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,2,893.0,1.0,3.0,47.0,1.0,0.0,7.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,2,894.0,0.0,2.0,62.0,0.0,0.0,9.6875,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,2,895.0,0.0,3.0,27.0,0.0,0.0,8.6625,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,2,896.0,1.0,3.0,22.0,1.0,1.0,12.2875,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
