# V.1 DataAnalysis

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("dataset_train.csv")

### This should be the output of our first program
It reads the datasets and gives us count, mean, std, min, max, percentiles for each feature

In [3]:
df.describe()

Unnamed: 0,Index,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
count,1600.0,1566.0,1568.0,1567.0,1569.0,1561.0,1565.0,1565.0,1557.0,1566.0,1570.0,1560.0,1600.0,1600.0
mean,799.5,49634.570243,39.797131,1.14102,-0.387863,3.15391,-224.589915,495.74797,2.963095,1030.096946,5.950373,-0.053427,-243.374409,21.958012
std,462.02453,16679.806036,520.298268,5.219682,5.212794,4.155301,486.34484,106.285165,4.425775,44.125116,3.147854,0.971457,8.78364,97.631602
min,0.0,-24370.0,-966.740546,-10.295663,-10.162119,-8.727,-1086.496835,283.869609,-8.858993,906.62732,-4.697484,-3.313676,-261.04892,-181.47
25%,399.75,38511.5,-489.551387,-4.308182,-5.259095,3.099,-577.580096,397.511047,2.218653,1026.209993,3.646785,-0.671606,-250.6526,-41.87
50%,799.5,49013.5,260.289446,3.469012,-2.589342,4.624,-419.164294,463.918305,4.378176,1045.506996,5.874837,-0.044811,-244.867765,-2.515
75%,1199.25,60811.25,524.771949,5.419183,4.90468,5.667,254.994857,597.49223,5.825242,1058.43641,8.248173,0.589919,-232.552305,50.56
max,1599.0,104956.0,1016.21194,11.612895,9.667405,10.032,1092.388611,745.39622,11.889713,1098.958201,13.536762,3.056546,-225.42814,279.07


In [4]:
def format_decimal(number, decimal):
    return format(number, f'.{decimal}f')

In [5]:
"""
    We need a list of the features to describe. 
    All the non-numerical features can be left out. 
"""

features = list(df.columns)

features = [f for f in features if np.issubdtype(df[f].dtype, np.number)]

print(features)

['Index', 'Arithmancy', 'Astronomy', 'Herbology', 'Defense Against the Dark Arts', 'Divination', 'Muggle Studies', 'Ancient Runes', 'History of Magic', 'Transfiguration', 'Potions', 'Care of Magical Creatures', 'Charms', 'Flying']


In [6]:
def count(feature : str) -> int:
    """
       Count non-NA (NaN, NaT, None) cells for the requested feature.
    """
    return len(df[feature].dropna())


print("COUNT:\n")
for feature in features:
    print(feature, ":", format_decimal(count(feature), 6))

COUNT:

Index : 1600.000000
Arithmancy : 1566.000000
Astronomy : 1568.000000
Herbology : 1567.000000
Defense Against the Dark Arts : 1569.000000
Divination : 1561.000000
Muggle Studies : 1565.000000
Ancient Runes : 1565.000000
History of Magic : 1557.000000
Transfiguration : 1566.000000
Potions : 1570.000000
Care of Magical Creatures : 1560.000000
Charms : 1600.000000
Flying : 1600.000000


In [7]:
def mean(feature : str) -> float:
    """
        Returns the mean for the values of the requested feature.
    """
    return (sum(df[feature].dropna()) / count(feature))

print("MEAN :\n")
for feature in features:
    print(feature, ":", format_decimal(mean(feature),6))

MEAN :

Index : 799.500000
Arithmancy : 49634.570243
Astronomy : 39.797131
Herbology : 1.141020
Defense Against the Dark Arts : -0.387863
Divination : 3.153910
Muggle Studies : -224.589915
Ancient Runes : 495.747970
History of Magic : 2.963095
Transfiguration : 1030.096946
Potions : 5.950373
Care of Magical Creatures : -0.053427
Charms : -243.374409
Flying : 21.958012


In [8]:
def variance(feature : str) -> float:
    mean_val = mean(feature)
    numerator = 0
    for i in df[feature].dropna():
        numerator += (i - mean_val) ** 2
    return (numerator/(count(feature) - 1))

def std(feature : str) -> float:
    """
        Returns sample standard deviation for the requested feature.
    """
    return math.sqrt(variance(feature))

print("STD :\n")
for feature in features:
    print(feature, ":", format_decimal(std(feature),6))

STD :

Index : 462.024530
Arithmancy : 16679.806036
Astronomy : 520.298268
Herbology : 5.219682
Defense Against the Dark Arts : 5.212794
Divination : 4.155301
Muggle Studies : 486.344840
Ancient Runes : 106.285165
History of Magic : 4.425775
Transfiguration : 44.125116
Potions : 3.147854
Care of Magical Creatures : 0.971457
Charms : 8.783640
Flying : 97.631602


In [9]:
def ft_min(feature : str) -> float:
    min = np.nan
    for i in df[feature].dropna():
        min = min if i > min else i 
    return min

print("MIN :\n")
for feature in features:
    print(feature, ":", format_decimal(ft_min(feature),6))

MIN :

Index : 0.000000
Arithmancy : -24370.000000
Astronomy : -966.740546
Herbology : -10.295663
Defense Against the Dark Arts : -10.162119
Divination : -8.727000
Muggle Studies : -1086.496835
Ancient Runes : 283.869609
History of Magic : -8.858993
Transfiguration : 906.627320
Potions : -4.697484
Care of Magical Creatures : -3.313676
Charms : -261.048920
Flying : -181.470000


In [10]:
def ft_max(feature : str) -> float:
    max = np.nan
    for i in df[feature].dropna():
        max = max if i < max else i 
    return max

print("MAX :\n")
for feature in features:
    print(feature, ":", format_decimal(ft_max(feature),6))

MAX :

Index : 1599.000000
Arithmancy : 104956.000000
Astronomy : 1016.211940
Herbology : 11.612895
Defense Against the Dark Arts : 9.667405
Divination : 10.032000
Muggle Studies : 1092.388611
Ancient Runes : 745.396220
History of Magic : 11.889713
Transfiguration : 1098.958201
Potions : 13.536762
Care of Magical Creatures : 3.056546
Charms : -225.428140
Flying : 279.070000


In [11]:
def percentile(feature : str, percent : int) -> float:
    arr = np.array(df[feature].dropna())
    arr.sort()
    l = count(feature)/100
    return arr[int(percent * l)]

print("Percentiles (25, 50, 75) :\n")
for feature in features:
    print(feature, ":", format_decimal(percentile(feature, 25),6))
    print(feature, ":", format_decimal(percentile(feature, 50),6))
    print(feature, ":", format_decimal(percentile(feature, 75),6))

Percentiles (25, 50, 75) :

Index : 400.000000
Index : 800.000000
Index : 1200.000000
Arithmancy : 38510.000000
Arithmancy : 49018.000000
Arithmancy : 60828.000000
Astronomy : -489.493777
Astronomy : 261.644731
Astronomy : 525.909540
Herbology : -4.312118
Herbology : 3.469012
Herbology : 5.421046
Defense Against the Dark Arts : -5.259095
Defense Against the Dark Arts : -2.589342
Defense Against the Dark Arts : 4.904680
Divination : 3.099000
Divination : 4.624000
Divination : 5.667000
Muggle Studies : -577.580096
Muggle Studies : -419.164294
Muggle Studies : 254.994857
Ancient Runes : 397.511047
Ancient Runes : 463.918305
Ancient Runes : 597.492230
History of Magic : 2.218653
History of Magic : 4.378176
History of Magic : 5.825242
Transfiguration : 1026.171713
Transfiguration : 1045.533335
Transfiguration : 1058.445388
Potions : 3.644900
Potions : 5.877582
Potions : 8.248789
Care of Magical Creatures : -0.670996
Care of Magical Creatures : -0.043296
Care of Magical Creatures : 0.594446


In [63]:
from texttable import Texttable
from functools import partial
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

func_list = [count, mean, std, ft_min, partial(percentile, percent=25),\
             partial(percentile, percent=25), partial(percentile, percent=25),  ft_max]
columns = [["count", "mean", "std", "min", "25%", "50%", "75%", "max"]]
rows = []
rows.append(features)

for func in func_list:
    rows.append(list(map(func, features)))

# test = [list(x) for x  in zip(*rows)] #tranpose the 2d array

table = Texttable()
table.set_deco(Texttable.HEADER)
col_type = ["f"] * 14
col_width = [13] * 14
col_align = ["r"] * 14
table.set_cols_width(col_width)
table.set_cols_dtype(col_type)
table.set_cols_align(col_align)
table.set_precision(6)
table.add_rows(rows)
table.set_max_width(500)
print(table.draw())
# print(tabulate(rows, headers=features, maxcolwidths=[None, 5]))

    Index        Arithmancy       Astronomy       Herbology        Defense       Divination        Muggle       Ancient Runes    History of     Transfigurati      Potions         Care of         Charms          Flying    
                                                                 Against the                       Studies                          Magic            on                            Magical                                   
                                                                  Dark Arts                                                                                                       Creatures                                  
  1600.000000     1566.000000     1568.000000     1567.000000     1569.000000     1561.000000     1565.000000     1565.000000     1557.000000     1566.000000     1570.000000     1560.000000     1600.000000     1600.000000
   799.500000    49634.570243       39.797131        1.141020       -0.387863        3.153910     -224.589915   

In [96]:
import plotly.graph_objects as go

headerColor = 'grey'
rowEvenColor = 'lightgrey'
rowOddColor = 'white'

func_list = [count, mean, std, ft_min, partial(percentile, percent=25),\
             partial(percentile, percent=50), partial(percentile, percent=75),  ft_max]

vals = [] 

for func in func_list:
    vals.append(list(map(func, features)))

#tranpose the 2d array
vals = [list(x) for x  in zip(*vals)]

rounded_vals = []
for i in vals:
    tmp = []
    for j in i:
        tmp.append(format_decimal(j, 6))
    rounded_vals.append(tmp)
    
rounded_vals.insert(0,["count", "mean", "std", "min", "25%", "50%", "75%", "max"])

fig = go.Figure(data=[go.Table(
  header=dict(
    values=['',*features],
    line_color='darkslategray',
    fill_color=headerColor,
    align=['left','center'],
    font=dict(color='Black', size=12)
  ),
  cells=dict(
    values=rounded_vals,
    line_color='darkslategray',
    # 2-D list of colors for alternating rows
    fill_color = [[rowOddColor,rowEvenColor,rowOddColor, rowEvenColor,rowOddColor, rowEvenColor,rowOddColor,  rowEvenColor]],
    align = ['left', 'center'],
    font = dict(color = 'darkslategray', size = 11)
    ))
])

fig.show()