# Exploratory Analysis

In [None]:
import pandas as pd  # tested on version 1.1.5
from tqdm import tqdm
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole
# session. This is a global switch: it affects every DataFrame operation that
# runs after this line, not only the code in this file.
pd.options.mode.chained_assignment = None


class ExploratoryAnalysis:
    """
    ExploratoryAnalysis prints specific analyses which represent
    the answer to each question of the Data Engineering Bootcamp.

    The input file is read with pandas in chunks of ``chunk_size`` rows and
    only the columns needed by the current question are parsed.

    Parameters:
        filename : str
            The system path where the file to be analysed is located.

    Attributes:
        df : pandas.Series or pandas.DataFrame
            Working data of the analysis in progress. It is filled by the
            private readers and reset to an empty DataFrame once a public
            method has printed its answer, so the memory is released.
        filename : str
            Path of the CSV file.
        chunk_size : int
            Number of rows read from the file per chunk.

    Methods:
        chains_cnt (answer to question 1)
        prod_by_state (answer 2)
        top_chain (answer 3)
        prices_per_range (answer 4)
    """

    def __init__(self, filename):
        self.df = pd.DataFrame()
        self.filename = filename
        self.chunk_size = 10 ** 7

    # question 1 answer
    def chains_cnt(self):
        """
        Prints the total number of unique chains.

        Reads the 'cadenaComercial' column to obtain all the existing
        commercial chains without duplicates (missing values are ignored).

        Returns:
            None

        Typical usage example:
        analysis = ExploratoryAnalysis(file_name)
        analysis.chains_cnt()
        """
        self._read_columns('cadenaComercial')
        print(self.df.nunique())
        self._release()

    # question 2 answer
    def prod_by_state(self, top_products):
        """
        Prints the top n (by amount) products by state.

        Uses the state and product columns to group them together. Results
        in a dataframe with three columns: estado, producto and cantidad.

        Args:
            top_products: int.
                Number of products to be printed per state.

        Returns:
            None

        Typical usage example:
        analysis = ExploratoryAnalysis(file_name)
        analysis.prod_by_state(10)
        """
        self._read_columns(['producto', 'estado'])
        top = self._top_products_by_state(self.df, top_products)

        # NOTE(review): the last row of the result is intentionally left out,
        # exactly as before. Presumably the input contains a junk trailing
        # group (e.g. a repeated header row) - confirm against the data; on a
        # clean file this hides the last product of the last state.
        print(top.iloc[:-1, :])
        self._release()

    # question 3 answer
    def top_chain(self):
        """
        Prints the chain with the most products.

        Reads the 'cadenaComercial' column to obtain the chain with the most
        products including the number of products for the chain.

        Returns:
            None

        Typical usage example:
        analysis = ExploratoryAnalysis(file_name)
        analysis.top_chain()
        """
        self._read_columns('cadenaComercial')
        print(self._top_chain_summary(self.df))
        self._release()

    # question 4 answer
    def prices_per_range(self):
        """
        Prints the count of products for ten price ranges.

        Works with the price column only. Shows the max and min price per range
        along with the number of products in that price range. Every price is
        counted in exactly one range.

        Returns:
            None

        Typical usage example:
        analysis = ExploratoryAnalysis(file_name)
        analysis.prices_per_range()
        """
        self._read_float_columns('precio')
        for lower, upper, count in self._price_range_counts(self.df):
            print('Min:', lower, ' Max: ', upper)
            print(count)
        self._release()

    # --------------------------------------------------------------------
    # Private methods

    def _release(self):
        """
        Drops the working data so its memory can be reclaimed.

        The attribute is reset to an empty DataFrame (the same state
        __init__ leaves it in) instead of being deleted, so ``self.df``
        always exists.

        Returns:
            None
        """
        self.df = pd.DataFrame()

    def _read_chunks(self, col):
        """
        Yields the requested column(s) of the file, one chunk at a time.

        Only the requested columns are parsed and rows with missing values
        are removed from every chunk.

        Args:
            col: str or list.
                Column name (yields Series) or list of column names
                (yields DataFrames).

        Yields:
            pandas.Series or pandas.DataFrame
        """
        wanted = [col] if isinstance(col, str) else list(col)
        reader = pd.read_csv(self.filename, usecols=wanted,
                             chunksize=self.chunk_size)
        for chunk in tqdm(reader):
            # dropna() returns a new object, so no in-place edit of a slice
            # of the chunk (and no chained-assignment warning) is involved.
            yield chunk[col].dropna()

    def _read_columns(self, col):
        """
        Reads the file in chunks.

        Saves all the rows in the file to self.df for the given column or list
        of columns. Rows with missing values are dropped.

        Args:
            col: str or list.
                Columns to be read into the series or dataframe.
        Returns:
            None

        Typical usage example:

        self._read_columns('cadenaComercial')
        self._read_columns(['producto', 'estado'])
        """
        # Collect first and concatenate once: concatenating inside the loop
        # copies the accumulated data again for every chunk.
        pieces = list(self._read_chunks(col))
        if pieces:
            self.df = pd.concat(pieces)
        elif isinstance(col, str):
            self.df = pd.Series(dtype=object, name=col)
        else:
            self.df = pd.DataFrame(columns=list(col))

    def _read_float_columns(self, col):
        """
        Reads a numeric column of the file in chunks.

        Saves all the rows in the file to self.df as a float Series for the
        given column. Values that cannot be read as a finite number are
        dropped.

        Args:
            col: str.
                Name of the column to be read into the series.

        Returns:
            None

        Typical usage example:

        self._read_float_columns('precio')
        """
        # Every chunk is converted, including the first one, so the result
        # is numeric even when the whole file fits in a single chunk.
        pieces = [self._to_float(chunk) for chunk in self._read_chunks(col)]
        if pieces:
            self.df = pd.concat(pieces)
        else:
            self.df = pd.Series(dtype=float, name=col)

    @staticmethod
    def _to_float(values):
        """
        Converts a Series to float, discarding non-numeric entries.

        Args:
            values: pandas.Series.
                Raw values (numbers and/or strings).

        Returns:
            pandas.Series of dtype float holding only the finite numbers.
        """
        numeric = pd.to_numeric(values, errors='coerce')
        # The comparison is False for NaN (unparseable text) as well as for
        # +/-inf, so one mask removes both.
        finite = numeric.abs() < float('inf')
        return numeric[finite].astype(float)

    @staticmethod
    def _top_products_by_state(data, top_products):
        """
        Builds the table with the most frequent products of every state.

        Args:
            data: pandas.DataFrame.
                Frame with the 'estado' and 'producto' columns.
            top_products: int.
                Number of products kept per state.

        Returns:
            pandas.DataFrame with the columns estado, producto and cantidad,
            sorted by state and, inside each state, by descending cantidad.
        """
        # size() returns a Series indexed by (estado, producto); reset_index
        # turns it into a DataFrame and names the counter column.
        counts = data.groupby(by=['estado', 'producto']).size()
        counts = counts.reset_index(name='cantidad')
        counts = counts.sort_values(by=['estado', 'cantidad'],
                                    ascending=[True, False])
        top = counts.groupby('estado').head(top_products)
        return top.reset_index(drop=True)

    @staticmethod
    def _top_chain_summary(chains):
        """
        Finds the most frequent chain and how many products it has.

        Args:
            chains: pandas.Series.
                One chain name per product row.

        Returns:
            pandas.Series with the entries 'cadenaComercial' and
            'Numero de productos'; an empty Series when there is no data.
        """
        counts = chains.value_counts()
        if counts.empty:
            return pd.Series(dtype=object)
        # value_counts() sorts in descending order: position 0 is the top.
        return pd.Series({'cadenaComercial': counts.index[0],
                          'Numero de productos': int(counts.iloc[0])})

    @staticmethod
    def _price_range_counts(prices, n_ranges=10):
        """
        Splits the prices in equal-width ranges and counts them.

        The first range includes its lower bound; every other range is
        (lower, upper], so adjacent ranges never overlap. The last upper
        bound is the maximum price itself to be safe against rounding.

        Args:
            prices: pandas.Series.
                Float prices.
            n_ranges: int.
                Number of ranges (default 10).

        Returns:
            list of (min_price, max_price, count) tuples, one per range;
            an empty list when there are no prices.
        """
        if prices.empty:
            return []

        # min/max are computed once instead of on every iteration.
        lowest = prices.min()
        highest = prices.max()
        width = (highest - lowest) / n_ranges

        ranges = []
        for idx in range(n_ranges):
            lower = lowest + idx * width
            if idx == n_ranges - 1:
                upper = highest
            else:
                upper = lowest + (idx + 1) * width

            if idx == 0:
                above = prices >= lower
            else:
                above = prices > lower
            count = int((above & (prices <= upper)).sum())
            ranges.append((float(lower), float(upper), count))
        return ranges

# Print complete frames: lift pandas' limits on displayed rows and columns.
pd.set_option('display.max_rows', None, 'display.max_columns', None)

# NOTE: machine-specific absolute path; point it at the local copy of the data.
file_name = 'C:/Users/Geovanni/Documents/all_data.csv'
analysis = ExploratoryAnalysis(file_name)

# Answers to questions 1 to 4, in order. Each call reads the file again and
# prints its result.
analysis.chains_cnt()
analysis.prod_by_state(10)
analysis.top_chain()
analysis.prices_per_range()