In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px 
import plotly.graph_objs as pgo
import sweetviz as sv


In [None]:
# Load the raw retail export; the first CSV column is used as the index here.
Org_df = pd.read_csv(
    "OnlineRetail.csv",
    encoding="unicode_escape",
    index_col=0,
)
# Show (rows, columns) of the loaded frame.
Org_df.shape

In [None]:
# Work on a copy with the index moved back into an ordinary column,
# leaving Org_df untouched as the raw reference.
df = Org_df.reset_index()
# Preview the first five rows.
df.head(n=5)

In [None]:
# Print column names, dtypes and non-null counts for the working frame.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

# Univariate Analysis

In [None]:
# How many rows belong to each invoice number, most frequent first.
df["InvoiceNo"].value_counts()

In [None]:
# Histogram of row counts per StockCode value.
px.histogram(data_frame=df, x="StockCode")

In [None]:
# How many rows carry each product description, most frequent first
# (missing descriptions are excluded by value_counts' default).
df['Description'].value_counts()

In [None]:
# Box plot of Quantity to show its spread and outliers.
px.box(data_frame=df, x="Quantity")


In [None]:
# Non-null count per column for rows whose Quantity is at least 74215.
# NOTE(review): 74215 is presumably an extreme value read off the box plot — confirm.
df.loc[df["Quantity"].ge(74215)].count()

In [None]:
# Parse the invoice timestamps and derive calendar parts from them.
# pd.to_datetime is used instead of astype('datetime64'): pandas 2.x rejects
# the unit-less 'datetime64' dtype, so the old cast fails on a fresh environment.
df['InvoiceDate'] = pd.to_datetime(df["InvoiceDate"])
# Splitting Date Column
df['Year'] = df['InvoiceDate'].dt.year
df['Month'] = df['InvoiceDate'].dt.month
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week yields the same ISO week number.
# Note: the resulting column uses the nullable UInt32 dtype rather than int64.
df['Week'] = df['InvoiceDate'].dt.isocalendar().week

In [None]:
# Global seaborn look: white grid, "deep" palette, slightly larger fonts, 10x5 figures.
sns.set(
    style='whitegrid',
    palette="deep",
    font_scale=1.1,
    rc={"figure.figsize": [10, 5]},
)
# Row count per Month, with one bar colour per Year; the trailing ";" hides the Axes repr.
sns.countplot(x="Month", hue="Year", data=df);

In [None]:
# Box plot of UnitPrice to show its spread and outliers.
px.box(data_frame=df, x='UnitPrice')

In [None]:
# Non-null count per column for rows whose UnitPrice is zero or negative.
df.loc[df["UnitPrice"].le(0)].count()

In [None]:
# Number of distinct customer IDs (missing values are not counted).
print("Unique:", df["CustomerID"].nunique())
# The five customers with the most rows.
df["CustomerID"].value_counts().head()

In [None]:
# Histogram of row counts per Country.
px.histogram(data_frame=df, x="Country")

## Data Preprocessing 

#### Data cleaning

In [None]:
# Check the missing values: count of nulls in each column before cleaning.
df.isna().sum()

In [None]:
# Check duplicate values: number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Discard every row that has no CustomerID; all other columns may still contain nulls.
df = df.dropna(axis=0, subset=["CustomerID"])

In [None]:
# Keep only rows with a strictly positive Quantity and a strictly positive UnitPrice.
# .copy() turns the filtered selection into an independent frame, so adding or
# overwriting columns on it afterwards does not raise SettingWithCopyWarning.
df = df[(df["Quantity"] > 0) & (df["UnitPrice"] > 0)].copy()

#### Data Transformation

In [None]:
# Add a month-start timestamp for every invoice and store CustomerID as a string label.
# to_period("M").dt.to_timestamp() truncates each date to the first day of its month
# directly, instead of formatting every value to text and parsing it back.
# assign() builds a new frame, so nothing is written into a filtered slice of an
# earlier frame (the pattern that triggers SettingWithCopyWarning).
df = df.assign(
    Year_Month=df["InvoiceDate"].dt.to_period("M").dt.to_timestamp(),
    # Casting to int first drops any fractional part before the string conversion;
    # presumably the IDs were read as floats such as 17850.0 — TODO confirm.
    CustomerID=df["CustomerID"].astype("int").astype("str"),
)

In [None]:
# New column: line total = units bought x price per unit.
df["Amount"] = df["Quantity"].mul(df["UnitPrice"])
# Preview the first five rows with the new column.
df.head(n=5)

In [None]:
# Build the per-customer, per-month spend table and store it as a CSV file.
from pathlib import Path

# Sum Amount for each (CustomerID, Year_Month) pair, largest totals first,
# then turn the group keys back into ordinary columns.
df_amount_month = (
    df.groupby(["CustomerID", "Year_Month"])
    .agg({"Amount": "sum"})
    .sort_values(by="Amount", ascending=False)
    .reset_index()
)

filepath = Path('amount_df_month.csv')
# Make sure the destination folder exists before writing.
filepath.parent.mkdir(parents=True, exist_ok=True)
# NOTE(review): the default index is written too, so the CSV gets an unnamed
# leading column of row numbers — confirm downstream readers expect that.
df_amount_month.to_csv(filepath)

In [None]:
# Total spend per customer, largest totals first; CustomerID stays as the index.
df_amount = (
    df.groupby(["CustomerID"])
    .agg({"Amount": "sum"})
    .sort_values(by="Amount", ascending=False)
)

filepath = Path('amount_df.csv')
# Make sure the destination folder exists before writing.
filepath.parent.mkdir(parents=True, exist_ok=True)
# The CustomerID index is written as the first CSV column.
df_amount.to_csv(filepath)