In [1]:
import pandas as pd
import numpy as np
from scipy import stats 
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv('housing.csv') #reading data set

In [3]:
dataset.head() #peeking to see what the data set is like

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
def convert_array_df(measure, col):
    '''
    This function converts the inputed the array to a rectangular data structure(dataframe), so we can appreciate 
    the results better. This function accepts two parameters
    1. Array of the measure
    2. Columns
    '''
    results = {
        'Features': col,
        'Measure' : measure
    }
    results = pd.DataFrame(results)
    return results

In [5]:
def only_num(data):
    '''
    This function aids the extraction of the numeric values and columns
    '''
    numerical_values = data.select_dtypes(exclude=['object'])
    numerical_columns = data.select_dtypes(exclude=['object']).columns
    return numerical_values, numerical_columns
    
    
def only_cat(data):
    '''
    This function aids the extraction of the categorical values and columns
    '''
    categorical_values = data.select_dtypes(include=['object'])
    categorical_columns = data.select_dtypes(include=['object']).columns
    return categorical_values, categorical_columns

## Measures of central tendency to be considered
- Mean
- Trimmed Mean
- Weighted Mean
- Median
- Weighted Median 
- Mode

### Mean

In [6]:
def get_mean(data):
    '''
    This function calculates the mean of the numeric variables in the data set. It accepts only one parameter
    1. Dataset
    '''
    num,col =  only_num(data)
    array = np.array(num)
    n_mean = np.nanmean(array, axis=0) 
    return convert_array_df(n_mean, col)

In [7]:
#calulcating the mean of the dataset
get_mean(dataset)

Unnamed: 0,Features,Measure
0,longitude,-119.569704
1,latitude,35.631861
2,housing_median_age,28.639486
3,total_rooms,2635.763081
4,total_bedrooms,537.870553
5,population,1425.476744
6,households,499.53968
7,median_income,3.870671
8,median_house_value,206855.816909


### Trimmed Mean

In [8]:
def get_trimmed_mean(data, truncation):
    '''
    This function calculates the trimmed mean and makes use of the scipy library, the truncation represents the 
    portion (in percentage) of the  extreme values that is desired to be removed. It accepts two parameters
    1. Dataset
    2. Truncation value
    '''
    num,col =  only_num(data)
    array = np.array(num)
    tmean = stats.trim_mean(array, truncation)
    return convert_array_df(tmean, col)

In [9]:
#calulcating the trimmed mean of the dataset
get_trimmed_mean(dataset,0.1)

Unnamed: 0,Features,Measure
0,longitude,-119.518129
1,latitude,35.508249
2,housing_median_age,28.494549
3,total_rooms,2294.557837
4,total_bedrooms,477.576248
5,population,1256.5129
6,households,441.201793
7,median_income,3.654012
8,median_house_value,192772.995397


### Weighted Mean

In [10]:
def get_weighted_mean(data, weight_name):
    '''
    This function calculates the weighted mean and makes use on masked arrays. it accepts two parameters
    1. Dataset
    2. Name of feature desired to be used as weight as a string
    '''
    num,col =  only_num(data)
    col = data.select_dtypes(exclude=['object']).columns 
    weight = data[weight_name]
    wt_avg = []
    for item in col:
        if item == weight_name:
            ma = np.ma.MaskedArray(data[item], mask=np.isnan((data[item])))
            w_avg = np.ma.average(ma, axis=0, weights=weight)
            ### This tends to calculate the weighted mean on the weights, so to correct this we divide by 2
            w_avg = w_avg/2
            wt_avg.append(w_avg)
        else:
            ma = np.ma.MaskedArray(data[item], mask=np.isnan((data[item])))
            w_avg = np.ma.average(ma, axis =0, weights=weight)
            wt_avg.append(w_avg)
            
    return convert_array_df(wt_avg,col)    

In [11]:
#calulcating the weighted mean of the dataset and using the feature 'population' as the weight
get_weighted_mean(dataset, 'population')

Unnamed: 0,Features,Measure
0,longitude,-119.410904
1,latitude,35.447274
2,housing_median_age,25.677624
3,total_rooms,4121.237998
4,total_bedrooms,831.999819
5,population,1162.555715
6,households,775.085802
7,median_income,3.877967
8,median_house_value,204596.156992


### Median

In [12]:
def get_median(data):
    '''
    This function calculates the median of the numeric variables in the data set. It accepts only one parameter
    1. Dataset
    '''
    num,col =  only_num(data)
    array = np.array(num)
    n_median = np.nanmedian(array, axis=0)
    return convert_array_df(n_median, col)

In [13]:
#calulcating the median of the dataset
get_median(dataset)

Unnamed: 0,Features,Measure
0,longitude,-118.49
1,latitude,34.26
2,housing_median_age,29.0
3,total_rooms,2127.0
4,total_bedrooms,435.0
5,population,1166.0
6,households,409.0
7,median_income,3.5348
8,median_house_value,179700.0


### Weighted Median

In [14]:
def get_weighted_median(data, weight_name):
    '''
    This function calculates weighted median, it makes use of wquantile library. it accepts only two parameter
    1. Dataset
    2. Name of feature desired to be used as weight as a string
    '''
    # ! pip install wquantiles
    import weighted
    
    num,col =  only_num(data)
    weight_name = weight_name.lower()
    weight = data[weight_name]
    wt_median = []
    
    for item in col:
        if item == weight_name:
            w_median = weighted.median(data[item], weights=weight)
            w_median = w_median/2
            wt_median.append(w_median)
        else:
            w_median = weighted.median(data[item], weights=weight)
            wt_median.append(w_median)
            
    return convert_array_df(wt_median, col)

In [15]:
#calulcating the weighted_median of the dataset
get_weighted_mean(dataset, 'population')

Unnamed: 0,Features,Measure
0,longitude,-119.410904
1,latitude,35.447274
2,housing_median_age,25.677624
3,total_rooms,4121.237998
4,total_bedrooms,831.999819
5,population,1162.555715
6,households,775.085802
7,median_income,3.877967
8,median_house_value,204596.156992


### Mode

In [16]:
def get_mode(data):
    '''
    This function calculates the mode of the categorical features, it makes use of wquantile library. it accepts only one parameter
    1. Dataset
    '''
    cat,col = only_cat(data)
    n_mode = stats.mode(cat)
    return n_mode

In [17]:
#calulcating the mode of the dataset
get_mode(dataset)

ModeResult(mode=array([['<1H OCEAN']], dtype=object), count=array([[9136]]))

## Measure of Dispersion
- Range
- Interquartile range
- Mean absolute deviation
- Variance
- Standard deviation 
- Median absolute deviation

### Range

In [18]:
def get_range(data):
    '''
    This function calculates the range of the numeric variables in the data set. It accepts only one parameter
    1. Dataset
    '''
    
    num, col = only_num(data)
    
    results = []
    for item in col:
        item_range = []
        for value in data[item]:
            item_range.append(value)
            limit1 = min(item_range)
            limit2 = max(item_range)
            range_ = limit2- limit1
        results.append(range_)
    return convert_array_df(results, col)

In [19]:
#calulcating the range of the dataset
get_range(dataset)

Unnamed: 0,Features,Measure
0,longitude,10.04
1,latitude,9.41
2,housing_median_age,51.0
3,total_rooms,39318.0
4,total_bedrooms,6444.0
5,population,35679.0
6,households,6081.0
7,median_income,14.5002
8,median_house_value,485002.0


### Interquartile range

In [20]:
def get_IQR(data):
    '''
    This function calculates the Interquartile range of the numeric variables in the data set. It accepts only one parameter
    1. Dataset
    '''
    num,col =  only_num(data)
    results = []
    for item in col:
        result = stats.iqr(data[item],nan_policy='omit') # The NaN defines how to handle when input contains nan.
        results.append(result)
    return convert_array_df(results, col)

In [21]:
#calulcating the IQR of the dataset
get_IQR(dataset)

Unnamed: 0,Features,Measure
0,longitude,3.79
1,latitude,3.78
2,housing_median_age,19.0
3,total_rooms,1700.25
4,total_bedrooms,351.0
5,population,938.0
6,households,325.0
7,median_income,2.17985
8,median_house_value,145125.0


### Mean absolute deviation

In [22]:
def mean_absolute_dev(data):
    '''
    This function calculates the Mean absolute deviation of the numeric variables in the data set. It accepts only one parameter
    1. Dataset
    '''
    num,col =  only_num(data)
    results = []
    for item in col:
        ### computing the formula 'mean(abs(data-mean(data)))'
        a = np.nanmean(np.array(data[item]))
        b = np.array(data[item])
        result = np.nanmean(np.absolute(b-a))
        results.append(result)
    return convert_array_df(results, col)
        

In [23]:
#calulcating the Mean absolute deviation of the dataset
mean_absolute_dev(dataset)

Unnamed: 0,Features,Measure
0,longitude,1.830206
1,latitude,1.975024
2,housing_median_age,10.551539
3,total_rooms,1344.462236
4,total_bedrooms,270.923606
5,population,714.237277
6,households,247.195367
7,median_income,1.401614
8,median_house_value,91170.439944


### Variance

In [24]:
def get_var(data):
    '''
    This function calculates the Variance of the numeric variables in the data set. It accepts only one parameter
    1. Dataset
    '''
    num,col =  only_num(data)
    array = np.array(num)
    var = np.nanvar(array, axis=0) 
    return convert_array_df(var, col)

In [25]:
#calulcating the Variance of the dataset
get_var(dataset)

Unnamed: 0,Features,Measure
0,longitude,4.013945
1,latitude,4.562072
2,housing_median_age,158.3886
3,total_rooms,4759215.0
4,total_bedrooms,177556.7
5,population,1282408.0
6,households,146169.0
7,median_income,3.609148
8,median_house_value,13315500000.0


### Standard deviation

In [26]:
def get_std(data):
    '''
    This function calculates the Standard deviation of the numeric variables in the data set. It accepts only one parameter
    1. Dataset
    '''
    num,col =  only_num(data)
    array = np.array(num)
    std = np.nanstd(array, axis=0) 
    return convert_array_df(std, col)

In [27]:
#calulcating the Standard deviation of the dataset
get_std(dataset)

Unnamed: 0,Features,Measure
0,longitude,2.003483
1,latitude,2.135901
2,housing_median_age,12.585253
3,total_rooms,2181.562402
4,total_bedrooms,421.374759
5,population,1132.434688
6,households,382.320491
7,median_income,1.899776
8,median_house_value,115392.820404


### Median absolute deviation (MAD)

In [28]:
def get_MAD(data):
    '''
    This function calculates the Median absolute deviation of the numeric variables in the data set. It accepts only one parameter
    1. Dataset
    '''
    num,col =  only_num(data)
    results = []
    for item in col:
        result = stats.median_absolute_deviation(data[item],nan_policy='omit') # The NaN defines how to handle when input contains nan.
        results.append(result)
    return convert_array_df(results, col)

In [29]:
#calulcating the Median absolute deviation of the dataset
get_MAD(dataset)

Unnamed: 0,Features,Measure
0,longitude,1.897728
1,latitude,1.823598
2,housing_median_age,14.826
3,total_rooms,1181.6322
4,total_bedrooms,240.1812
5,population,652.344
6,households,223.8726
7,median_income,1.577783
8,median_house_value,101409.84
