In [None]:
import numpy                 as np
import pandas                as pd
import matplotlib.pyplot     as plt
import seaborn               as sns
import glob
from functools import reduce

## Analytics Base Table

[reduce()](https://docs.python.org/3/library/functools.html#functools.reduce "Documentation") Apply function of two arguments cumulatively to the items of iterable, from left to right, so as to reduce the iterable to a single value

[lambda](https://book.pythontips.com/en/latest/lambdas.html "Documentation") Lambdas are one line functions. They are also known as anonymous functions in some other languages. You might want to use lambdas when you don’t want to use a function twice in a program. They are just like normal functions and even behave like them.

[pandas.merge()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html "Documentation") Merge DataFrame or named Series objects with a database-style join.

In [None]:
def merge_csv(path_address):
    """
    Read every CSV file in a directory and outer-merge them into a
    single dataframe.

    Arguments:
    `path_address`: path of the directory that holds the `.csv` files;
                    only files directly inside it are read

    Outputs:
    `df_merged`: dataframe obtained by outer-merging the files one after
                 another, in sorted filename order

    Raises:
    `FileNotFoundError`: when the directory contains no `.csv` files
    """
    # glob returns matches in arbitrary order; sorting keeps the merge
    # order (and therefore the resulting row/column order) reproducible.
    all_files = sorted(glob.glob(path_address + "/*.csv"))
    if not all_files:
        # reduce() on an empty list would otherwise fail with an opaque
        # "reduce() of empty iterable" TypeError.
        raise FileNotFoundError(
            "No .csv files found in {!r}".format(path_address))

    data_frames = [pd.read_csv(filename, index_col=None, header=0)
                   for filename in all_files]

    # With no `on=` argument pd.merge joins on the column names the two
    # frames have in common.
    df_merged = reduce(
        lambda left, right: pd.merge(left, right, how='outer'),
        data_frames)
    return df_merged

In [None]:
# Directory that holds the CSV files to merge (raw string, so backslashes
# would not be treated as escapes)
path = r'/BigThinkFiles'
# Merge every CSV found in that directory into one dataframe; the rest of
# the notebook works on `complaints_df`
complaints_df = merge_csv(path)
# Preview the first five rows
complaints_df.head()

In [None]:
# Check dataframe shape

### Changing column headers and data types

In [None]:
# Check datatypes within the dataframe

In [None]:
# Rename column headers in place. The {'':''} mappings look like placeholders
# to be filled in as {'old_name': 'new_name'} — TODO confirm the intended names.
complaints_df.rename(columns = {'':''}, inplace = True)
complaints_df.rename(columns = {'':''}, inplace = True)
# Store Year as a string so it no longer has a numeric dtype
complaints_df['Year'] = complaints_df['Year'].astype(str)

In [None]:
# Check datatypes within the dataframe after edits

In [None]:
# Check edited dataframe

## Data Quality Report
---
Continuous and categorical quality report

In [None]:
# seaborn is re-imported here so this cell can run on its own
import seaborn as sns
sns.set_theme(style="ticks")

# Pairwise plots of the dataframe's columns, coloured by the Borough column
complaints_df_plot = sns.pairplot(complaints_df, hue="Borough")

In [None]:
# generate a correlation matrix from the dataframe, hint: .corr()

[pandas.select_dtypes()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.select_dtypes.html "Documentation") Return a subset of the DataFrame’s columns based on the column dtypes.


[pandas.isnull()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isnull.html "Documentation") Return a boolean same-sized object indicating if the values are NA

In [None]:
def continous_quality(frame):
    """
    Build a data quality report for the float64 features of a dataframe.

    Arguments:
    `frame`: dataframe to profile; only its float64 columns are used

    Outputs:
    `continous_df`: dataframe with one row per float64 feature holding the
                    non-null count, percent missing, cardinality, minimum,
                    quartiles, mean, median, maximum and standard
                    deviation, sorted by percent missing (ascending)
    """
    numeric = frame.select_dtypes(include='float64')
    n_rows = len(numeric)
    missing_pct = numeric.isnull().sum() * 100 / n_rows

    # (report column, per-feature statistic) pairs in display order
    report_columns = [
        ('Count', numeric.count()),
        ('percent_missing', missing_pct),
        ('Card', numeric.nunique()),
        ('Min', numeric.min()),
        ('1st Quartile', numeric.quantile(.25)),
        ('Mean', numeric.mean()),
        ('Median', numeric.median()),
        ('3rd Quartile', numeric.quantile(.75)),
        ('Max', numeric.max()),
        ('Std Dev', numeric.std()),
    ]
    continous_df = pd.DataFrame(dict(report_columns))
    return continous_df.sort_values('percent_missing')

In [None]:
# run the continous_quality function

In [None]:
# drop features that have ~60% missing, hint: .drop(), axis=1
# display the first five rows of the new altered dataframe

[Counter()](https://docs.python.org/3/library/collections.html#collections.Counter "Documentation") A collection where elements are stored as dictionary keys and their counts are stored as dictionary values.

[most_common()](https://docs.python.org/3/library/collections.html#collections.Counter.most_common "Documentation") Return a list of the n most common elements and their counts from the most common to the least.

In [None]:
from collections import Counter
# Tally how often each value occurs in the UHF_42 column
data = Counter(complaints_df['UHF_42'])
# List every (value, count) pair, most frequent first
data.most_common()

[List Comprehension](https://www.w3schools.com/python/python_lists_comprehension.asp "Documentation") List comprehension offers a shorter syntax when you want to create a new list based on the values of an existing list.

[numpy.unique()](https://numpy.org/doc/stable/reference/generated/numpy.unique.html "Documentation") Returns the sorted unique elements of an array.

[Series.map()](https://pandas.pydata.org/docs/reference/api/pandas.Series.map.html "Documentation") Map values of Series according to an input mapping or function.


In [None]:
def categorical_quality(frame):
    """
    Creates a dataframe containing data quality metrics
    for each object and int64 feature

    Arguments:
    `frame`: dataframe containing categorical variables

    Outputs:
    `categorical_df`: Dataframe containing percent missing, cardinality,
                    mode of each feature, count of the Mode, percent of
                    the mode compared to all rows, 2nd Mode, percent
                    of the 2nd Mode. The two percent columns are strings
                    formatted with two decimals. Rows are sorted by
                    percent missing (ascending). A feature with fewer
                    than two distinct values gets `None` / 0 for the
                    mode entries it lacks.
    """
    # float64 is never part of the include list, so no exclude is needed
    cat_frame = frame.select_dtypes(include=['int64', 'object'])
    n_rows = len(cat_frame)
    percent_missing = cat_frame.isnull().sum() * 100 / n_rows

    # Count each feature once, reading from the frame that was passed in
    # rather than from a notebook-level global. Values are counted as they
    # appear in the column; nothing is dropped before counting.
    top_two = []
    for c in cat_frame.columns:
        ranked = Counter(cat_frame[c]).most_common(2)
        # Pad so that indexing the 1st/2nd mode never raises IndexError
        ranked += [(None, 0)] * (2 - len(ranked))
        top_two.append(ranked)

    categorical_df = pd.DataFrame({
        'percent_missing': percent_missing,
        'Card': cat_frame.nunique(),
        'Mode': [first[0] for first, _ in top_two],
        'Mode Freq.': [first[1] for first, _ in top_two],
        # Raw counts here; converted to percentages just below
        'Mode %': [first[1] for first, _ in top_two],
        '2nd Mode': [second[0] for _, second in top_two],
        '2nd Mode %': [second[1] for _, second in top_two],
    })
    for pct_col in ('Mode %', '2nd Mode %'):
        categorical_df[pct_col] = categorical_df[pct_col].div(n_rows) * 100
        categorical_df[pct_col] = categorical_df[pct_col].map('{:,.2f}'.format)
    categorical_df.sort_values('percent_missing', inplace=True)
    return categorical_df

In [None]:
# run the categorical_quality function

## Transformations
---

### Logarithmic Transformation

[numpy.log()](https://numpy.org/doc/stable/reference/generated/numpy.log.html "Documentation") Natural logarithm, element-wise.

[pandas.concat()](https://pandas.pydata.org/docs/reference/api/pandas.concat.html "Documentation") Concatenate pandas objects along a particular axis with optional set logic along the other axes.

In [None]:
def logTransformation(df):
    """ 
    Apply the natural logarithm to a selection of columns and return them
    next to the remaining label columns.

    NOTE(review): the dtype selectors passed to `select_dtypes` are empty
    strings, which look like placeholders to be filled in (the columns to
    transform vs. the columns to keep as labels) — confirm before running.

    Arguments:
    `df`: dataframe to transform; the function works on a copy
    
    Outputs:
    `result`: the label columns followed by the log-transformed columns,
              concatenated column-wise
    """
    df_copy = df.copy()
    # Columns to transform; np.log is applied element-wise to the selection
    logCounts = np.log(df_copy.select_dtypes(include=''))
    # Columns carried through without transformation
    labels = df_copy.select_dtypes(include=[''], exclude='')
    # Align the transformed rows to the label rows before joining side by side
    result = pd.concat([labels, logCounts.reindex(labels.index)], axis=1)
    return result

In [None]:
# run the logTransformation function and display the first five rows

### Square Root Transformation

In [None]:
def sqrtTransformation(df):
    """ 
    Take the square root of a selection of columns and return them next
    to the remaining label columns.

    NOTE(review): the dtype selectors passed to `select_dtypes` are empty
    strings, which look like placeholders to be filled in (the columns to
    transform vs. the columns to keep as labels) — confirm before running.

    Arguments:
    `df`: dataframe to transform; the function works on a copy
    
    Outputs:
    `result`: the label columns followed by the square-rooted columns,
              concatenated column-wise
    """
    df_copy = df.copy()
    # Columns to transform; raising to the power .5 is the square root
    sqrtCounts = df_copy.select_dtypes(include='')**.5
    # Columns carried through without transformation
    labels = df_copy.select_dtypes(include=[''], exclude='')
    # Align the transformed rows to the label rows before joining side by side
    result = pd.concat([labels, sqrtCounts.reindex(labels.index)], axis=1)
    return result

In [None]:
# run the sqrtTransformation function and display the first five rows

### Standardization

[zip()](https://www.programiz.com/python-programming/methods/built-in/zip "Documentation") The function takes iterables (can be zero or more), aggregates them in a tuple, and returns it

In [None]:
def standardization(df):
    """ 
    Standardize selected columns by subtracting the column mean and
    dividing by the column standard deviation.

    NOTE(review): the dtype name compared against is an empty string,
    which looks like a placeholder for the dtype to standardize —
    confirm and fill in before running.

    Arguments:
    `df`: dataframe to standardize; the function works on a copy
    
    Outputs:
    `result`: the copied dataframe with the matching columns standardized
    """
    df_copy = df.copy()
    # Pair every column name with its dtype rendered as a string
    dtypes = list(zip(df_copy.dtypes.index, map(str, df_copy.dtypes)))
    # Normalize numeric columns.
    for column, dtype in dtypes:
        # Only columns whose dtype string equals the chosen name are touched
        if dtype == '':
            # Center on zero first, then scale by the standard deviation
            df_copy[column] -= df_copy[column].mean()
            df_copy[column] /= df_copy[column].std()
    result = df_copy
    return result

In [None]:
# run the standardization function and display the first five rows

### Clamp Transformation
***
Based on the outliers in your data, you can use a bell
curve to inform your percentile cut-offs
***

<img src="https://upload.wikimedia.org/wikipedia/commons/5/5c/PR_and_NCE.gif" style="width:800px;height:400px"/>

In [None]:
# Based on the outliers in your data you can use a bell
# curve to inform your percentile cut-offs
def clampTransformation(df):
    """
    Replace extreme values in every float64 column with a random integer
    drawn from the middle of that column's distribution.

    A value is treated as extreme when it is below the column's 3rd
    percentile or above its 94th percentile. Each extreme value is
    replaced by `np.random.randint(q34, q68)`, where q34 and q68 are the
    column's 34th and 68th percentiles. Missing values and columns of any
    other dtype are left unchanged.

    Arguments:
    `df`: dataframe to transform; the function works on a copy

    Outputs:
    `result`: copy of `df` with the extreme float64 values replaced
    """
    result = df.copy()
    for c in df.columns:
        if df[c].dtype == 'float64':
            column = df[c]
            # The cut-offs depend only on the column, so compute them once
            # here instead of once per element inside the lambda.
            lower_cut = column.quantile(.03)
            upper_cut = column.quantile(.94)
            fill_low = column.quantile(0.34)
            fill_high = column.quantile(0.68)
            # randint is only called for extreme values, one draw per value
            result[c] = column.apply(
                lambda x: np.random.randint(fill_low, fill_high)
                if x > upper_cut or x < lower_cut else x)
    return result

In [None]:
# run the clampTransformation function and display the first five rows

### QQ-Plot and Histogram to check Normalization

In [None]:
from scipy import stats
import matplotlib.pyplot as plt

In [None]:
# Probability (QQ) plot of one column against a normal distribution.
# The '' column name and the "------" in the title look like placeholders
# to fill in with the feature being checked — TODO confirm.
stats.probplot(x=complaints_df[''], dist="norm", plot=plt)
plt.title("QQ Plot for ------")
plt.show()

In [None]:
# Column to inspect; '' looks like a placeholder for the feature name
vari_count = complaints_df['']
# Histogram of that column; plot.hist() returns the axes it drew on
D_close = vari_count.plot.hist()
D_close.set_title('Histogram of ------')
D_close.set_xlabel('count')
plt.plot()
# Vertical markers: mean (yellow), min and max (red), median (green)
plt.axvline(vari_count.mean(), color='y', linestyle='solid', linewidth=2)
plt.axvline(vari_count.min(), color='r', linestyle='solid', linewidth=2)
plt.axvline(vari_count.max(), color='r', linestyle='solid', linewidth=2)
plt.axvline(vari_count.median(), color='g', linestyle='solid', linewidth=2)
# Current y-axis limits, used to place the value labels vertically
min_ylim, max_ylim = plt.ylim()
# Label each marker with its value; min and max labels are placed at
# 1.2 * max_ylim, i.e. above the current upper y-limit
plt.text(vari_count.mean(), max_ylim*0.9, '{:.2f} (mean)'.format(vari_count.mean()))
plt.text(vari_count.max(), max_ylim*1.2, '{:.2f}'.format(vari_count.max()))
plt.text(vari_count.min(), max_ylim*1.2, '{:.2f}'.format(vari_count.min()))
plt.text(vari_count.median(), max_ylim*0.75, '{:.2f} (median)'.format(vari_count.median()))

### Box and whisker analysis
***
The box plot shape will show if a statistical data set is normally
distributed or skewed. When the median is in the middle of the box, and the 
whiskers are about the same on both sides of the box, then the distribution is symmetric.
***
[Box and Whisker](https://www.simplypsychology.org/boxplots.html#:~:text=The%20box%20plot%20shape%20will,then%20the%20distribution%20is%20symmetric. "Documentation")

In [None]:
# Try some transformations to see if you can get
# box plots that indicate a normal distribution
sns.set_theme(style="whitegrid")
# One box per Borough; y='' looks like a placeholder for the numeric
# column to plot — TODO confirm
ax = sns.boxplot(x ='Borough',y ='', data=complaints_df)

## Imputation
---

### Mean, Median, Mode, and Random Imputation

In [None]:
def meanImputation(data):
    """
    Fill the missing values of each numeric column with that column's mean.

    Arguments:
    `data`: dataframe that may contain missing values; it is not modified

    Outputs:
    `result`: new dataframe in which missing values of numeric columns
              are replaced by the column mean; other columns are returned
              as they were
    """
    # numeric_only=True keeps text columns from raising a TypeError in
    # DataFrame.mean
    column_means = data.mean(numeric_only=True)
    result = data.fillna(column_means)
    return result
    

In [None]:
def medianImputation(data):
    """
    Fill the missing values of each numeric column with that column's
    median.

    Arguments:
    `data`: dataframe that may contain missing values; it is not modified

    Outputs:
    `result`: new dataframe in which missing values of numeric columns
              are replaced by the column median; other columns are
              returned as they were
    """
    # numeric_only=True keeps text columns from raising a TypeError in
    # DataFrame.median
    column_medians = data.median(numeric_only=True)
    return data.fillna(column_medians)

In [None]:
# Mode imputation is typically for categorical variables
def modeImputation(data):
    """
    Fill the missing values of each column with that column's mode.

    Arguments:
    `data`: dataframe that may contain missing values; it is not modified

    Outputs:
    `result`: new dataframe in which missing values are replaced by the
              column's mode (the first one when a column has several)
    """
    modes = data.mode()
    if modes.empty:
        # Nothing to fill with (e.g. an empty dataframe)
        return data.copy()
    # DataFrame.mode() returns a dataframe with one row per mode. Passing
    # that dataframe to fillna aligns on the row index, so only row 0
    # would be filled. Taking the first row as a Series maps one fill
    # value to each column instead.
    column_modes = modes.iloc[0].dropna()
    return data.fillna(column_modes)

In [None]:
def randImputation(df):
    """
    Fill the missing values of selected columns with a random integer
    drawn between the column's 34th and 68th percentiles.

    One random value is drawn per column and used for every missing
    value in that column.

    NOTE(review): the dtype compared against is an empty string, which
    looks like a placeholder for the dtype of the columns to impute —
    confirm and fill in before running.

    Arguments:
    `df`: dataframe that may contain missing values; the function works
          on a copy

    Outputs:
    `result`: copy of `df` with the missing values of the matching
              columns filled
    """
    result = df.copy()
    for c in df.columns:
        if df[c].dtype == '':
            # Take the percentiles from the dataframe that was passed in,
            # not from a notebook-level global, so the function works on
            # any dataframe.
            fill_value = np.random.randint(df[c].quantile(0.34),
                                           df[c].quantile(0.68))
            result[c] = result[c].fillna(value=fill_value)
    return result

### Bonus: Interpolation


[Dataframe.interpolate()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.interpolate.html "Documentation") Fill NaN values using an interpolation method.

In [None]:
# Other methods to try: ‘zero’, ‘slinear’, ‘quadratic’, ‘cubic’, ‘spline’, ‘barycentric’, ‘polynomial’
# some of these methods may need the extra argument "order="
# Quality report of the dataframe after linear interpolation of missing values
continous_quality(complaints_df.interpolate(method='linear'))
#complaints_df.interpolate(method='linear')

In [None]:
# Forward-fill missing values; swap .ffill() for .bfill() to back-fill.
# The fillna(method=...) spelling is deprecated in recent pandas releases,
# so the dedicated method is used instead.
continous_quality(complaints_df.ffill())
#complaints_df.ffill().head(10)

In [None]:
# try wrapping complaints_df in the imputation and transformation functions to see how the graphs change
# One histogram per complaint-count column
complaints_df.hist(
    column=['asbestos_count',
            'dust_count',
            'gasses_count',
            'mold_count',
            'ventilation_count'],
    bins=10, figsize=(12, 8), alpha=0.6, grid=False, rwidth=0.8)
# A single show() renders the figure; the second call was redundant
plt.show()

### Additional Functions

[Dataframe.std()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.std.html "Documentation") Return sample standard deviation over requested axis.

[Dataframe.min()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.min.html "Documentation") Return the minimum of the values over the requested axis.

[Dataframe.mean()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.mean.html "Documentation") Return the mean of the values over the requested axis.

[Dataframe.div()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.div.html "Documentation") Get Floating division of dataframe and other, element-wise.