# Standard Functions

In [None]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Make notebooks expand 100%.
# Injects a CSS rule that forces elements with the `container` class to full width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [33]:
# Infers the data type of every column of a CSV file. Useful for optimisations.
# Only the first 100 rows are read, so the types come from that sample.
# Parameters
# file_path (String) : Location of the CSV file.
# Returns : A dictionary that maps each column name to the name of its dtype.
def data_types(file_path):
    sample = pd.read_csv(file_path, nrows=100)

    return {column: dtype.name for column, dtype in sample.dtypes.items()}

In [34]:
# Finds all columns of a data frame that contain at least one NA value.
# Parameters
# df (DataFrame) : The data frame we want to check.
# Returns : A Series with the number of NA values per column, limited to the
#           columns that have at least one NA value, sorted from most to fewest.
def empty_columns(df):
    na_counts = df.isnull().sum()

    # Keep only the columns that actually have missing values, largest count first.
    return na_counts[na_counts > 0].sort_values(ascending=False)

In [35]:
# Draws a heatmap of the pairwise correlations between the columns of a data frame.
# Parameters
# df (DataFrame) : The data frame we want to use.
# round_to (Integer) : Number of decimals the correlations are rounded to.
# annot_ (Boolean) : Whether the correlation value is written inside each cell.
# fig_x (Integer) : Width of the figure.
# fig_y (Integer) : Height of the figure.
# size_ (Integer) : Font size of the annotations.
# Returns : The object returned by sns.heatmap for the drawn heatmap.
def correlation_heatmap(df, round_to=2, annot_=True, fig_x=30, fig_y=15, size_=14):
    rounded_corr = df.corr().round(round_to)

    # Create the figure first so the heatmap is drawn on axes of the requested size.
    _, heatmap_ax = plt.subplots(figsize=(fig_x, fig_y))

    return sns.heatmap(rounded_corr, annot=annot_, annot_kws={"size": size_}, ax=heatmap_ax)

In [36]:
# Plots a pairplot of the columns that are most correlated with a given column.
# Parameters
# df (DataFrame) : The data frame we want to use.
# col (String) : The column the other columns are compared against.
# correlation_ (Float) : Minimum absolute correlation a column must have with
#                        col to be included in the plot.
# diag_kind_ (String) : Kind of plot for the diagonal of the pairplot.
# Returns : The result of plt.show() on success, or -1 (after printing a
#           message) when the plot could not be created.
def plot_most_correlated(df, col, correlation_=0.6, diag_kind_='kde'):
    try:
        # Compute the correlation matrix once and reuse it for both the index
        # and the column lookup.
        corrmat = df.corr()

        columns = corrmat.index[corrmat[col].abs() >= correlation_]

        sns.pairplot(df[columns], diag_kind=diag_kind_)
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        print("The column that you provided is not valid for this operation. Try another one.")
        return -1

    return plt.show()

In [37]:
# Plots a regplot (scatter plot with a regression line) of the given columns.
# Parameters
# df (DataFrame) : The data frame we want to use.
# column_x (String) : X column.
# column_y (String) : Y column.
# color_ (String) : Color of the regression line.
# alpha_ (Float) : Opacity of the regression line.
# lw_ (Integer) : Width of the regression line.
# Returns : The result of plt.show() on success, or -1 (after printing a
#           message) when the plot could not be created.
def plot_regplot(df, column_x, column_y, color_='r', alpha_=0.7, lw_=5):
    try:
        # Look the columns up first so an invalid column name fails before any
        # plotting is attempted.
        x_values = df[column_x]
        y_values = df[column_y]

        line_style = {"color": color_, "alpha": alpha_, "lw": lw_}

        sns.regplot(x=x_values, y=y_values, line_kws=line_style)
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        print("The combination of columns is not valid! Try others.")
        return -1

    return plt.show()

In [38]:
# Plots the number of occurrences of each value of the given column.
# Parameters
# df (DataFrame) : The data frame we want to use.
# column (String) : The column name.
# title (String) : Title of plot.
# x_label (String) : X label of plot.
# y_label (String) : Y label of plot.
# rotation_ (Integer) : Rotation of the x tick labels, in degrees.
# order_ (Boolean) : If True, the bars are ordered by descending frequency;
#                    otherwise seaborn's default order is used.
# Returns : The result of plt.show().
def count_plot(df, column, title='', x_label='', y_label='', rotation_=45, order_=True):
    fig, ax = plt.subplots(figsize=(12, 8))

    # order=None is seaborn's default, i.e. the same as not passing it at all.
    bar_order = df[column].value_counts().index if order_ else None

    sns.countplot(x=column, data=df, order=bar_order, ax=ax)

    ax.set_title(title, fontsize=24)

    ax.set_xlabel(x_label, fontsize=18)

    ax.set_ylabel(y_label, fontsize=18)

    # Only rotate the tick labels seaborn already produced. Overwriting them
    # with df[column].unique() would label the bars in order of appearance,
    # which does not match the order in which the bars are drawn.
    ax.tick_params(axis='x', labelrotation=rotation_)

    return plt.show()

In [39]:
# Plots the group sizes of a data frame grouped by the given columns,
# optionally as stacked bars.
# Parameters
# df (DataFrame) : The data frame we want to use.
# group_by (List) : The column names to group by.
# title (String) : Title of plot.
# x_label (String) : X label of plot.
# y_label (String) : Y label of plot.
# kind_ (String) : Kind of plot.
# stacked_ (Boolean) : Plot bars are stacked or not.
# grid_ (Boolean) : Grid is plotted or not.
# Returns : The result of plt.show() on success, or -1 (after printing a
#           message) when the plot could not be created.
def plot_stack(df, group_by, title='', x_label='', y_label='', kind_='bar', stacked_=True, grid_=True):
    try:
        group_sizes = df.groupby(group_by).size()

        # unstack() moves the innermost group level to the columns, so each of
        # its values becomes one segment of the plot.
        group_sizes.unstack().plot(
            kind=kind_,
            stacked=stacked_,
            figsize=(16, 12),
            grid=grid_,
            colormap='Spectral',
        )

        plt.ylabel(y_label, fontsize=16)

        plt.xlabel(x_label, fontsize=16)

        plt.xticks(rotation=45, fontsize=14)

        plt.title(title, fontsize=18)
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        print("AttributeError: Something went wrong! Try another column.")
        return -1

    return plt.show()

In [40]:
# Computes the natural log of a column and plots the distribution and the
# probability plot of both the raw values and the log values.
# Parameters
# df (DataFrame) : The data frame we want to use.
# column (String) : The column to transform and plot.
# title_1 (String) : Title for plot before transformation.
# title_2 (String) : Title for plot after transformation.
# Returns : The result of plt.show() on success, or -1 (after printing a
#           message) when the plots could not be created.
def plot_log(df, column, title_1='', title_2='Distibution with log transformations'):
    try:
        raw_values = df[column]

        log_values = np.log(raw_values)

        # 2x2 grid: distributions on the top row, probability plots below.
        fig, ((ax_raw_dist, ax_log_dist), (ax_raw_prob, ax_log_prob)) = plt.subplots(
            nrows=2, ncols=2, figsize=(14, 10)
        )

        plt.suptitle('Probability Plots', fontsize=18)

        sns.distplot(raw_values, color="#fa6868", ax=ax_raw_dist, fit=norm)

        ax_raw_dist.set_title(title_1, fontsize=14)

        sns.distplot(log_values, color="#34964e", ax=ax_log_dist, fit=norm)

        ax_log_dist.set_title(title_2, fontsize=14)

        stats.probplot(raw_values, plot=ax_raw_prob)

        stats.probplot(log_values, plot=ax_log_prob)
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        print("ValueError: could not convert the values! String or Infinite error! Make sure the values are valid!")
        return -1

    return plt.show()

In [None]:
# Detects outliers using the Z-score.
# A value is an outlier when the absolute value of (value - mean) / std is
# greater than the threshold.
# Parameters
# df (Sequence) : The values to check. The values are iterated one by one, so
#                 this should be a one-dimensional collection of numbers
#                 (e.g. a list or a single data frame column).
# unique (Boolean) : Return or not the unique values from the outliers set.
# threshold (Number) : Absolute Z-score above which a value is an outlier.
# Returns : A set of the outliers if unique is True, otherwise a sorted list
#           of the outliers (duplicates included).
def detect_zscore_outliers(df, unique=True, threshold=3):
    mean_1 = np.mean(df)

    std_1 = np.std(df)

    outliers = [y for y in df if np.abs((y - mean_1) / std_1) > threshold]

    if unique:
        # A set has no order, so sorting is only needed for the list result.
        return set(outliers)

    return sorted(outliers)