In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from flask import jsonify, request
from datetime import datetime as dt

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
df_main = pd.read_csv("pagelite_main.csv", index_col=0)
df_login = pd.read_csv("pagelite_login.csv", index_col=0)
df_pricing = pd.read_csv("pagelite_pricing.csv", index_col=0)

In [3]:
class ChartData:
    
    def __init__(self,  page, y_name, x_name, data_points):
        """
        chart_type: enum | Type of the chart. Can be line, bar or circular
        y_name: str | Name of the column that was used for y values
        x_time: str | Name of the column that was used for x values, usually date column
        """
        self.page = page
        self.y_name = y_name
        self.x_name = x_name
        self.data_points = data_points
        
    def __repr__(self):
        return f"dfd"

In [28]:
class DataPoint:
    
    def __init__(self, y, x = None, label = None, group = None):
        """
            x: int/float, x value, can be None
            y: int/float, y value
            label: str, label of the point, can be None
            group: str, group (A/B), can be None 
        """
        self.x = x
        self.y = y
        self.label = label
        self.group = group
        
    def __repr__(self):
        return f"{self.y} | {self.x} | {self.label} | {self.group}"

In [29]:
class DataManipulation:
    
    def _subset(self, df, by_group):
        """
        A function used to subset a dataframe by a given group
        
        Attributes
        ------------
        df: dataframe | The dataframe that we want to subset
        by_group: int | The group that is used for subsetting (0,1,...,9)
        
        Returns
        ------------
        The given dataframe subsetted by the given group
        """
        self.df = df
        self.by_group = by_group
    
        self.df = self.df[self.df.group == self.by_group]
        
        return self.df

    def get_chart_data(self, df, column, group=None):
        
        #json = request.get_json()
        #page_name = json["page_name"]
        #group = json["group"]
        #column = json["column"]
        
        #if page_name:
        #    root_dir = _root_directory()
        #    page_dir = f"{root_dir}/static/{page_name}"
        #else:
        #    return None, 499

        #page_data = pd.read_csv(page_dir)
        
        """
        A function used to get the data needed for creating the chart
        
        Attributes
        ------------
        df: dataframe | The dataset for the given page
        column: str | The column used for y values
        group: int | The given group that is used for subsetting (0,1,...,9), can be None
        
        Returns
        ------------
        A dataframe containing the column values, date and group (if given)
        """
        self.df = df
        self.column = column
        self.group = group
        
        if self.group or self.group==0:
            self.df = self._subset(self.df, int(self.group))
            if self.column:
                self.df = self.df[["date", self.column, "group"]]
                return self.df
            else:
                return None, 498
        
        else:
            if self.column:
                self.df = self.df[["date", self.column]]
                return self.df
            else:
                return None, 497
        
    def get_data_with_frequency(self, df, freq, column, group=None):
        """
        A function used to aggregate the data by the given column and time period
        
        Attributes
        ------------
        df: dataframe | The dataset for the given page
        freq: str | The time period used for aggregating the data ("1H", "2H", ..., "1D", "1W", "1M")
        column: str | The column used for y values
        group: int | The given group that is used for subsetting (0,1,...,9), can be None
        
        Returns
        ------------
        The aggregated dataframe containing the column values, date and group (if given)
        """
        self.df = df
        self.freq = freq
        self.column = column
        self.group = group
            
        self.df = self.get_chart_data(self.df, self.column, self.group)
        dates =  pd.to_datetime(self.df.date, format='%Y-%m-%d %H:%M:%S')
        self.df.date = dates
        self.df["date_index"] = pd.DatetimeIndex(dates)
        self.df = self.df.set_index("date_index")
        self.df = self.df.sort_values("date")
        
        isStringType = self.df.dtypes[column] == "object"
        if isStringType:
            series = self.df.groupby(pd.Grouper(key = column)).count()["date"]
        else:
            series = self.df.groupby(pd.Grouper(key = 'date', freq = freq)).mean()[column]
        
        return series
    
    def create_data_points(self, df, freq, column, group=None):
        """
        A function used to create data points from the given dataframe
        
        Attributes
        ------------
        df: dataframe | The dataset for the given page
        freq: str | The time period used for aggregating the data ("1H", "2H", ..., "1D", "1W", "1M")
        column: str | The column used for y values
        group: int | The given group that is used for subsetting (0,1,...,9), can be None
        
        Returns
        ------------
        A list of DataPoint objects
        """
        self.df = df
        self.freq = freq
        self.column = column
        self.group = group
        
        series = self.get_data_with_frequency(self.df, self.freq, self.column, self.group)
        
        data_points = []
        
        for ind, element in enumerate(series):
            
            try:
                x = int(dt.timestamp(series.index[ind]))
            except:
                x = None
                
            y = float(series[ind])
            label = str(series.index[ind])
            group = self.group
            
            point = DataPoint(
                x = x,
                y = y,
                label = label,
                group = group
            )
            
            data_points.append(point)

        return data_points
    
    def create_chart(self, page, freq, column, group=None):
        """
        A function used to create a ChartData object from a given dataframe
        
        Attributes
        ------------
        df: dataframe | The dataset for the given page
        freq: str | The time period used for aggregating the data ("1H", "2H", ..., "1D", "1W", "1M")
        column: str | The column used for y values
        chart_type: enum | Type of the chart. Can be line, bar or circular
        group: int | The given group that is used for subsetting (0,1,...,9), can be None
        
        Returns
        ------------
        A ChartData object
        """
        
        self.df = pd.read_csv(f"{page}")
        self.freq = freq
        self.column = column
        self.group = group
        
        self.data_points = self.create_data_points(self.df, self.freq, self.column, self.group)
        dates_converted =  pd.to_datetime(self.df.date, format='%mmm')

        chart = ChartData(
            page = page,
            y_name = self.column,
            x_name = "date",
            data_points = self.data_points
        )
        
        return chart

In [30]:
dm = DataManipulation()
chart_data = dm.create_chart("pagelite_main.csv", "1M", "clicks", None)
chart_data.data_points

NameError: name 'y' is not defined

In [16]:
df_main.groupby(pd.Grouper(key = 'date', freq = "1D")).mean

date
2019-01-01    11.250000
2019-01-02          NaN
2019-01-03    17.000000
2019-01-04    11.250000
2019-01-05     7.500000
                ...    
2019-12-26    11.166667
2019-12-27     9.000000
2019-12-28     9.000000
2019-12-29    12.000000
2019-12-30     8.666667
Freq: D, Name: clicks, Length: 364, dtype: float64

In [53]:
d.get_data_with_frequency(df_main, "M", "lifetime")

date
2019-01-31    30.172308
2019-02-28    32.942278
2019-03-31    31.419823
2019-04-30    32.486061
2019-05-31    31.907865
2019-06-30    33.465882
2019-07-31    31.608224
2019-08-31    32.537308
2019-09-30    30.884211
2019-10-31    30.273191
2019-11-30    32.042162
2019-12-31    33.707234
Freq: M, Name: lifetime, dtype: float64

In [9]:
d.create_chart(df_main, "1D", "country", group=3)

ChartData

In [55]:
df = df_main
freq = "1D"
column = "lifetime"
dates =  pd.to_datetime(df.date, format='%Y-%m-%d %H:%M:%S')
df = (df.assign(date=dates)
           .groupby([column, pd.Grouper(key='date', freq=freq)])
           .mean()
           .reset_index())
df.head()

Unnamed: 0,lifetime,date,id,group,clicks
0,3.0,2019-07-25,1063.0,3.0,10.0
1,3.0,2019-11-04,392.0,8.0,7.0
2,6.0,2019-06-04,485.0,6.0,14.0
3,6.0,2019-07-31,246.0,3.0,9.0
4,6.0,2019-10-26,1127.0,5.0,7.0


In [56]:
df_main


Unnamed: 0,id,group,clicks,lifetime,country,browser,date,date_index
607,607,2,8,21.00,China,Chrome,2019-01-01 01:06:17,2019-01-01 01:06:17
353,353,0,21,45.00,China,Opera,2019-01-01 01:52:03,2019-01-01 01:52:03
1026,1026,7,9,39.00,Italy,Opera,2019-01-01 01:53:04,2019-01-01 01:53:04
1126,1126,6,7,45.00,China,Internet Explorer,2019-01-01 11:52:21,2019-01-01 11:52:21
518,518,8,17,30.00,Italy,Safari,2019-01-03 20:20:56,2019-01-03 20:20:56
...,...,...,...,...,...,...,...,...
444,444,6,13,21.00,Italy,Safari,2019-12-30 05:41:58,2019-12-30 05:41:58
1151,1151,1,5,88.92,China,Chrome,2019-12-30 06:21:45,2019-12-30 06:21:45
554,554,2,6,48.00,India,Chrome,2019-12-30 09:47:22,2019-12-30 09:47:22
192,192,8,8,27.00,India,Chrome,2019-12-30 09:58:03,2019-12-30 09:58:03


In [57]:
df_main.date = pd.to_datetime(df_main.date, format='%Y-%m-%d %H:%M:%S')

In [58]:
df_main = df_main.sort_values("date")

In [59]:
df2 = df_main
dates =  pd.to_datetime(df2.date, format='%Y-%m-%d %H:%M:%S')
df2["date_index"] = pd.DatetimeIndex(dates)
df2 = df2.set_index("date_index")
df2 = df2.sort_values("date")


In [60]:
column = "clicks"
isStringType = df2.dtypes[column] == "object"
if isStringType:
    df3 = df2.groupby(pd.Grouper(key = column)).count()["id"]
else:
    df3 = df2.groupby(pd.Grouper(key='date', freq='M')).mean()[column]


In [None]:
for i, e in enumerate(df3):
    print(df3.index[i], df3[i])

In [None]:
df2.groupby(pd.Grouper(key='date', freq='M')).mean()

In [None]:
df2.groupby(pd.Grouper(key = 'country')).sum()