In [None]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

In [None]:
# Remove pandas' display limits so frames and long cell values are shown in full
for optionName in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(optionName, None)

In [None]:
# Load the CSV export; the path is resolved relative to the current working directory
data = pd.read_csv('./chilliapi_data.csv')

# Preview the first rows of the loaded frame
data.head()

In [None]:
# Print a concise summary of the frame (column dtypes and non-null counts)
data.info()

In [None]:
# Summary statistics of the frame (pandas describes numeric columns by default)
data.describe()

# Exploring all data

# Convert DOB to age

In [None]:
from datetime import datetime, date

# Parse the raw DOB strings (day/month/year, 12-hour clock with AM/PM) into timestamps
data['DateOfBirth'] = pd.to_datetime(
    data['DateOfBirth'],
    format='%d/%m/%Y %I:%M:%S %p',
)

# Keep only the calendar date as text. Rows without a DOB receive today's date
# as a placeholder, which the age helper below turns into an age of 0.
data['DateOfBirth(Date)'] = (
    data['DateOfBirth']
    .dt.strftime('%d/%m/%Y')
    .replace(np.nan, date.today().strftime("%d/%m/%Y"))
    .astype(str)
)

def age(born, today=None):
    """Return the age in completed years for a date of birth.

    Parameters
    ----------
    born : str, datetime.date or datetime.datetime
        Date of birth. A string must be formatted as ``%d/%m/%Y``; date and
        datetime objects are used directly (a datetime is truncated to its date).
    today : datetime.date, optional
        Reference date the age is measured against. Defaults to
        ``date.today()``; pass a fixed date for reproducible results.

    Returns
    -------
    int
        Number of completed years between ``born`` and ``today``.

    Raises
    ------
    ValueError
        If ``born`` is a string that does not match ``%d/%m/%Y``.
    """
    # datetime is a subclass of date, so it has to be checked first
    if isinstance(born, datetime):
        born = born.date()
    elif not isinstance(born, date):
        born = datetime.strptime(born, "%d/%m/%Y").date()

    if today is None:
        today = date.today()

    # The tuple comparison is True (counted as 1) while this year's birthday is
    # still ahead, which takes one year off the plain year difference.
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

  
# Compute every row's age from its day/month/year date-of-birth string
data['Age'] = data['DateOfBirth(Date)'].map(age)

# ageData keeps the rows with a positive age; rows whose DOB was missing were
# given today's date as a placeholder and therefore have an age of 0
ageData = pd.DataFrame(data.loc[data['Age'].gt(0)])
ageData['Age'].describe()
# f, axes = plt.subplots(1, 2, figsize=(18, 4))
# sb.boxplot(data = data['Age'], orient = "h", ax = axes[0])
# sb.histplot(data = data['Age'], ax = axes[1])

# Isolated Data to analyse
Base Value ($), Status, Menu Name, FunctionDate

In [None]:
# Columns isolated for the focused analysis
cols = ['Base Value ($)', 'Status', 'Menu Name', 'FunctionDate']

# Take an explicit copy: columns are assigned to miniData in the next cell, and
# assigning into a plain selection of `data` triggers pandas'
# SettingWithCopyWarning (the write may or may not reach `data`).
miniData = data[cols].copy()
miniData.head()

In [None]:
# Parse the function timestamps (day/month/year, 12-hour clock with AM/PM)
miniData['FunctionDate'] = pd.to_datetime(
    miniData['FunctionDate'],
    format='%d/%m/%Y %I:%M:%S %p',
)

# Derive separate text columns for the calendar date and the time of day
for newCol, pattern in (('Date', '%d/%m/%Y'), ('Time', '%I:%M:%S %p')):
    miniData[newCol] = miniData['FunctionDate'].dt.strftime(pattern)

In [None]:
# Preview miniData again, now including the derived Date and Time columns
miniData.head()

# Removing columns with a large share of null values

In [None]:
def naValsInPercentage(data):
    """Report the percentage of null values in every column of a frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` (indexed by the column label) with the
        fields ``column_name`` and ``percent_missing``.
    """
    null_counts = data.isnull().sum()
    row_count = len(data)
    share_missing = null_counts * 100 / row_count
    report = pd.DataFrame(
        {
            'column_name': data.columns,
            'percent_missing': share_missing,
        }
    )
    return report
# Missing-value percentage for every column of the full data set
missingVals = naValsInPercentage(data)
missingVals

In [None]:
# Select the columns with more than 30% missing values (candidates for removal)
colsToDrop = missingVals.loc[missingVals['percent_missing'].gt(30)]
colsToDrop

### Some columns are still important to analyse
DateOfBirth, AddOnsPrice

In [None]:
# Take DateOfBirth and AddOnsPrice off the removal list so they are kept.
# errors='ignore' skips a label that is not in colsToDrop (i.e. the column was
# not above the missing-value threshold) instead of raising KeyError.
finalColsToDrop = colsToDrop.drop(['DateOfBirth', 'AddOnsPrice'], errors='ignore')

In [None]:
# CD = Cleaned Data
# Discard the columns that ended up on the removal list
cdPart1 = data.drop(finalColsToDrop['column_name'], axis=1)

## Drop rows without the main identifier
# "Menu Name" is used as the main identifier, so rows missing it are removed
rowsToDrop = cdPart1.loc[cdPart1["Menu Name"].isna()]
cdPart1 = cdPart1.drop(rowsToDrop.index, axis=0)

# Look at ten rows, selected by position
cdPart1.iloc[1000:1010]

### There are still insignificant columns

In [None]:
# suspects: columns that look insignificant for the analysis.
# 'Mobile' and 'First Name' are separate entries; without a comma between them
# Python joins adjacent string literals into the single name 'MobileFirst Name'.
# NOTE(review): 'Mobile' is also listed in colsToBool further down — confirm it
# should really be dropped here.
insigCols = [
    'Block',
    'Block (Customer)',
    'Building',
    'Co Last Name',
    'CustomerId (Customer)',
    'CustomerIdCopy',
    'Delivery Note',
    'Driver Name',
    'Fax',
    'tel',
    'Mobile',
    'First Name',
    'Id',
    'Invoice Id',
    'Job Number',
    'Last Name',
    'OrderSource',
    'Delivery Rate Internal',
    'Delivery Time',
    'Delivery Rate',
    'Staff No',
    'Staff Price',
    'Packed Time',
]
# Inspect the suspect columns before dropping them; selecting a label that is
# not a column of cdPart1 raises KeyError
cdPart1[insigCols].describe()

In [None]:
# Remove the suspect columns and preview what is left
cdPart1 = cdPart1.drop(insigCols, axis=1)
cdPart1.head()

## Fill NA

In [None]:
# Recompute the missing-value percentages on the reduced frame
reducedMissingVals = naValsInPercentage(cdPart1)
reducedMissingVals

In [None]:
# Start the next cleaning stage from an independent copy. pd.DataFrame(cdPart1)
# does not copy the data for DataFrame input by default, so .copy() is used to
# keep later edits to cdPart2 away from cdPart1.
cdPart2 = cdPart1.copy()

In [None]:
# Converts values to boolean
def convertToBool(data, colsToCovert):
    """Replace the given columns with boolean "value is present" flags.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; it is left unmodified.
    colsToCovert : iterable of str
        Labels of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which every listed column is True where the
        original value was non-null and False where it was null.

    Raises
    ------
    KeyError
        If a listed label is not a column of ``data``.
    """
    # Work on a real copy: pd.DataFrame(data) does not copy the data for
    # DataFrame input by default, so assigning columns to it could leak back
    # into the caller's frame.
    temp = data.copy()
    for col in colsToCovert:
        # notna() already yields a boolean Series, no extra cast is needed
        temp[col] = data[col].notna()
    return temp

# Presumably the columns meant to become presence flags.
# NOTE(review): colsToBool is defined but not used in the visible code — only
# 'Contact Number' is passed to convertToBool below; confirm whether the full
# list was intended.
colsToBool = ['Contact Number', 'Contact Person', 'Exported', 'Mobile']
cdPart2 = convertToBool(cdPart2, ['Contact Number'])
cdPart2['Contact Number'].head()