## Importing the required libraries

In [None]:
import os
import warnings
warnings.filterwarnings('ignore')
import math
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight') 
%matplotlib inline

from collections import Counter
import datetime

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm

import statsmodels.api as sm
from numpy.random import normal, seed
from scipy.stats import norm
from statsmodels.tsa.arima_model import ARMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error

In [None]:
# Load the dataset from 'train.csv' (path is relative to the working
# directory) and display the resulting DataFrame.
df = pd.read_csv('train.csv')
df

## Inspecting each column to learn more about the values it contains

In [None]:
# Number of rows for each distinct value of the 'Ship Mode' column.
df['Ship Mode'].value_counts()

In [None]:
# Count the rows per shipping mode in one pass and plot them as a bar chart,
# annotating every bar with that mode's share of all counted rows.
xaxis = ['Standard Class', 'Second Class', 'First Class', 'Same Day']
mode_counts = df['Ship Mode'].value_counts()
# A mode that never occurs is reported as 0 instead of raising a KeyError.
yaxis = [int(mode_counts.get(mode, 0)) for mode in xaxis]
# Keep the per-mode counters available under their original names.
shipmode1, shipmode2, shipmode3, shipmode4 = yaxis
total = sum(yaxis)


plt.figure(figsize=(10,10))

# Pass the data by keyword: seaborn >= 0.12 no longer accepts x and y as
# positional arguments.
ax = sns.barplot(x=xaxis, y=yaxis)

plt.title('Distribution of Shipping Options.')
plt.ylabel('Count', fontsize = 20)
plt.xlabel('Shipping Type', fontsize = 20)


# Place the percentage label centred over each bar, 20 points above its top.
for p in ax.patches:
    # Guard against an empty dataset so the division cannot fail.
    share = p.get_height() * 100 / total if total else 0.0
    ax.annotate('{:.2f}%'.format(share),
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=15, color='black',
                xytext=(0, 20), textcoords='offset points')

In [None]:
# Number of rows for each distinct value of the 'City' column.
df['City'].value_counts()

In [None]:
# Tally how often each city appears, then show the ten most frequent
# cities as (city, count) pairs.
citycount = Counter(df['City'])
citycount.most_common(10)

In [None]:
# Bar chart of the ten most frequent cities. The ranking is computed once
# and split into labels (x) and counts (y).
top_cities = citycount.most_common(10)
xaxiscity = [city for city, _ in top_cities]
yaxiscity = [count for _, count in top_cities]
# NOTE: percentages below are relative to the top-10 total, not to all rows.
top10_total = sum(yaxiscity)

plt.figure(figsize=(18, 14))
# Pass the data by keyword: seaborn >= 0.12 no longer accepts x and y as
# positional arguments.
ax = sns.barplot(x=xaxiscity, y=yaxiscity)
plt.title('Top 10 Most Common Cities that order', fontsize = 25)
plt.ylabel('Frequency', fontsize = 20)
plt.xticks(rotation=45, fontsize = 15)

# Place the percentage label centred over each bar, 20 points above its top.
for p in ax.patches:
    # Guard against an empty ranking so the division cannot fail.
    share = p.get_height() * 100 / top10_total if top10_total else 0.0
    ax.annotate('{:.2f}%'.format(share),
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=13, color='black',
                xytext=(0, 20), textcoords='offset points')

In [None]:
# Number of rows for each distinct value of the 'Segment' column.
df['Segment'].value_counts()

In [None]:
# Number of rows for each distinct value of the 'State' column.
df['State'].value_counts()

In [None]:
# Number of rows for each distinct value of the 'Region' column.
df['Region'].value_counts()

In [None]:
# Number of rows for each distinct value of the 'Category' column.
df['Category'].value_counts()

In [None]:
# Number of rows for each distinct value of the 'Sub-Category' column.
df['Sub-Category'].value_counts()

## Checking for Null Values

In [None]:
# Count the missing (null) values in every column.
df.isnull().sum()

In [None]:
# Show the rows whose 'Postal Code' is missing.
df[df['Postal Code'].isnull()]

In [None]:
# Show every row whose city is 'Burlington' (this filter is by city name
# only, so it matches any state that has a Burlington).
df[df['City'] == 'Burlington']
# Observation: only the city of Burlington in the state of Vermont is missing its postal code.

In [None]:
# Replace every missing postal code with 5401.
# NOTE(review): presumably Burlington, VT's ZIP code 05401 with the leading
# zero dropped because the column is numeric -- confirm. This fills ALL
# missing values, so it is only correct while Burlington is the sole gap.
df['Postal Code'] = df['Postal Code'].fillna(5401)

In [None]:
# Count the missing (null) values per column again to verify the result.
df.isnull().sum()

## Feature Engineering

First, we need to convert the `Order Date` and `Ship Date` columns from the object (string) dtype to datetime.

In [None]:
# Data types of the first five columns.
df.dtypes.head(5)

In [None]:
# Parse both date columns from day/month/year strings into datetime values.
date_format = '%d/%m/%Y'
for date_col in ('Order Date', 'Ship Date'):
    df[date_col] = pd.to_datetime(df[date_col], format=date_format)

In [None]:
# Print the first two values of each date column.
for date_col in ('Order Date', 'Ship Date'):
    print(df[date_col].head(2))

In [None]:
# Per-row difference between the ship date and the order date, stored as a
# new 'ShippingTime' column and displayed.
shipping_delta = df['Ship Date'] - df['Order Date']
df['ShippingTime'] = shipping_delta
df['ShippingTime']

## More Visualizations

#### Looking at the top customers based on how much they have spent.

In [None]:
# Ten customers with the highest total sales, as a DataFrame with the columns
# 'Customer Name' and 'Sales' (rounded to 2 decimals) and a 0..n-1 index.
# Only 'Sales' is aggregated: summing every column would also try to add up
# the datetime columns, which raises a TypeError on pandas >= 2.0.
topcust = (
    df.groupby("Customer Name")["Sales"]
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .round(2)
    .reset_index()
)

In [None]:
# Bar chart of the customers in `topcust`, each bar labelled with its sales.
customer_names = topcust["Customer Name"]
customer_sales = topcust["Sales"]

plt.figure(figsize=(18, 10))
plt.bar(customer_names, customer_sales,
        color='#ECFC08', edgecolor='black', linewidth=1)
plt.title("Most Valuable Customers (2015-2019)", fontsize=25)
plt.xlabel("Customers", fontsize=20)
plt.ylabel("Revenue", fontsize=20)
plt.xticks(fontsize=15, rotation=45)
plt.yticks(fontsize=12)

# Write each amount vertically, 8000 data units below the top of its bar.
# The x position is the row index of `topcust`.
for position, amount in customer_sales.items():
    label = '$' + str(amount)
    plt.text(position, amount - 8000, label, fontsize=20, rotation=90,
             color='k', horizontalalignment='center')