## Describe function

Import the necessary libraries

In [3]:
import pandas as pd
import numpy as np

#### Data importation and preparation

Import the data from the csv files

In [128]:
train_data = pd.read_csv("dataset_train.csv")
test_data = pd.read_csv("dataset_test.csv")
full_data = pd.concat([train_data, test_data], axis=0)
full_data.head(3)

Unnamed: 0,Index,Hogwarts House,First Name,Last Name,Birthday,Best Hand,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,0,Ravenclaw,Tamara,Hsu,2000-03-30,Left,58384.0,-487.886086,5.72718,4.878861,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,0.715939,-232.79405,-26.89
1,1,Slytherin,Erich,Paredes,1999-10-14,Right,67239.0,-552.060507,-5.987446,5.520605,-5.612,-487.340557,367.760303,4.10717,1058.944592,7.248742,0.091674,-252.18425,-113.45
2,2,Ravenclaw,Stephany,Braun,1999-11-03,Left,23702.0,-366.076117,7.725017,3.660761,6.14,664.893521,602.585284,3.555579,1088.088348,8.728531,-0.515327,-227.34265,30.42


We drop the index column

In [129]:
full_data = full_data.drop(['Index'], axis=1)

#### Describe function

We are going to manually code the following function

In [130]:
full_data.describe()

Unnamed: 0,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
count,1953.0,1955.0,1956.0,1961.0,1955.0,1955.0,1957.0,1946.0,1955.0,1960.0,1952.0,2000.0,2000.0
mean,49724.612903,41.451669,1.189644,-0.417844,3.205737,-223.708112,495.785943,2.936453,1030.253905,5.915848,-0.038082,-243.335749,22.2399
std,16444.807977,518.651589,5.187046,5.193183,4.103768,488.49943,105.348963,4.40351,44.360287,3.161791,0.981402,8.770542,97.12546
min,-24370.0,-966.740546,-10.295663,-10.162119,-8.727,-1086.496835,283.869609,-8.858993,906.62732,-4.697484,-3.313676,-261.04892,-181.47
25%,38819.0,-488.265275,-4.300239,-5.243928,3.1825,-577.927473,398.497892,2.219926,1026.510177,3.554956,-0.660419,-250.58645,-40.7125
50%,49114.0,272.071636,3.50684,-2.70741,4.634,-415.425616,467.730624,4.355191,1045.78509,5.857253,-0.025599,-244.831995,-2.515
75%,60698.0,521.974961,5.416,4.881403,5.655,252.532155,596.599814,5.780673,1058.649546,8.236317,0.621809,-232.59871,49.4575
max,104956.0,1016.21194,11.612895,9.667405,10.032,1092.388611,745.39622,11.889713,1099.966073,13.536762,3.205525,-225.42814,282.43


We keep only the columns which are interesting for us

In [209]:
class describing:
    
    def __init__(self):
        self = self
        
    def __counter(self, X, col):
        Y = pd.DataFrame(X.filter([str(col)]))
        Y.columns = ['col']
        Y = Y.dropna().reset_index(drop=True)
        Y['One'] = 1
        return sum(Y['One'])
    
    def __meaner(self,X,col):
        return sum(X[col].dropna())/self.__counter(X, col)
    
    def __count(self, X, col):
        return self.__counter(X,col)
    
    def __mean(self,X,col):
        return self.__meaner(X,col)
    
    def __std(self, X, col):
        New = pd.DataFrame(X[col].dropna().reset_index(drop=True))
        std = pd.DataFrame((New[col] - self.__meaner(New,col))**2)
        return np.sqrt(1/(self.__counter(std,col))*sum(std[col]))
    
    def __minimum(self, X, col):
        Y = pd.DataFrame(X[col])
        m = Y[col].iloc[0]
        for i in np.arange(1, len(Y[col])):
            if (m > Y[col].iloc[i]): m = Y[col].iloc[i]
        return m
    
    def __maximum(self, X, col):
        Y = pd.DataFrame(X[col])
        ma = Y[col].iloc[0]
        for i in np.arange(1, len(Y[col])):
            if (ma < Y[col].iloc[i]): ma = Y[col].iloc[i]
        return ma
    
    def __quartile(self,X,col,percentage):
        quart = pd.DataFrame(X[col].dropna().reset_index(drop=True))
        quart = quart.sort_values(by=[col], ascending=True, axis=0).reset_index(drop=True)
        return quart[col].iloc[int((len(quart[col])*percentage/100))]
    

    def describe(self,X):
        results = pd.DataFrame(index=['Count','Mean','std','min','25%','50%','75%','max'])
        mat = X._get_numeric_data()
        for name in list(mat.columns):
            
            metrics = []
    
            metrics.append(float(self.__count(X = mat, col = name)))
            metrics.append(self.__mean(X = mat, col = name))
            metrics.append(self.__std(X = mat, col = name))
            metrics.append(self.__minimum(X = mat, col = name))
            metrics.append(self.__quartile(X = mat, col = name, percentage=25))
            metrics.append(self.__quartile(X = mat, col = name, percentage=50))
            metrics.append(self.__quartile(X = mat, col = name, percentage=75))
            metrics.append(self.__maximum(X = mat, col = name))
        
            metrics = pd.DataFrame(metrics)
            metrics.columns = [name]
            metrics.index = ['Count','Mean','std','min','25%','50%','75%','max']
            results = pd.concat([results, metrics], axis = 1)
        return results

We use our describe function

In [211]:
des = describing()

In [212]:
des.describe(X = full_data)

Unnamed: 0,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
Count,1953.0,1955.0,1956.0,1961.0,1955.0,1955.0,1957.0,1946.0,1955.0,1960.0,1952.0,2000.0,2000.0
Mean,49724.612903,41.451669,1.189644,-0.417844,3.205737,-223.708112,495.785943,2.936453,1030.253905,5.915848,-0.038082,-243.335749,22.2399
std,16440.597298,518.518924,5.18572,5.191859,4.102718,488.374478,105.322043,4.402379,44.34894,3.160984,0.981151,8.768349,97.101175
min,-24370.0,-966.740546,-10.295663,-10.162119,-8.727,-1086.496835,283.869609,-8.858993,906.62732,-4.697484,-3.313676,-261.04892,-181.47
25%,38819.0,-488.390242,-4.298904,-5.243928,3.182,-578.064907,398.497892,2.218653,1026.488211,3.557049,-0.659897,-250.58636,-40.63
50%,49114.0,272.071636,3.508966,-2.70741,4.634,-415.425616,467.730624,4.35934,1045.78509,5.858556,-0.024521,-244.8318,-2.51
75%,60698.0,521.979825,5.417321,4.881403,5.657,253.349863,596.599814,5.781732,1058.654743,8.236909,0.622949,-232.59589,49.93
max,104956.0,1016.21194,11.612895,9.667405,10.032,1092.388611,745.39622,11.889713,1099.966073,13.536762,3.205525,-225.42814,282.43
