# Orders Exercise

In [None]:
#Import the required libraries
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
from sklearn.metrics import confusion_matrix
from scipy import stats
import warnings
#Notebook-wide settings: silence every warning and raise the pandas display limits
warnings.filterwarnings(action="ignore")
for option, limit in {'display.max_columns': 9999, 'display.max_rows': 300}.items():
    pd.set_option(option, limit)

In [None]:
#Read the order data from the csv file into the working dataframe.
ORDERS_CSV = 'order_data.csv'
df = pd.read_csv(ORDERS_CSV)

In [None]:
#Show the dtype pandas assigned to each column of df when the csv was read
df.dtypes

In [None]:
#Count the missing (NaN/None) entries in every column
df.isna().sum()

In [None]:
#Preview the first five rows to see how the dataframe is laid out
df.head(n=5)

In [None]:
#Parse the date column and drop the time portion in a single vectorised step.
#.dt.normalize() sets every timestamp to midnight and keeps the datetime64 dtype,
#so there is no round trip through Python date objects and no second parse.
#NOTE(review): assumes the timestamps are timezone-naive - confirm against the csv.
df['date'] = pd.to_datetime(df['date']).dt.normalize()

In [None]:
#Re-display the column dtypes to confirm that 'date' is now a datetime column
df.dtypes

In [None]:
#Look at the first 15 rows of the dataframe
df.head(n=15)

In [None]:
#Look at the last 15 rows of the dataframe
df.tail(n=15)

## A) Assemble a dataframe with one row per customer and the following columns:
    * customer_id
    * gender
    * most_recent_order_date
    * order_count (number of orders placed by this customer)
   Sort the dataframe by customer_id ascending and display the first 10 rows

In [None]:
#Pull every order for two customer ids so the grouped result can be checked against them later
df[df['customer_id'].isin([8658, 5989])]

In [None]:
#Build one row per customer with the latest order date and the number of orders.
#'max' is used for the date (instead of 'last') so the result is the most recent
#date regardless of the order in which the rows appear in df.
#sort=True keeps the groups in ascending customer_id order.
df_new = (
    df.groupby(['customer_id', 'gender'], sort=True)
      .agg({'date': 'max', 'value': 'count'})
      .reset_index() #Move the group keys back into regular columns
      .rename(columns={'date': 'most_recent_order_date', 'value': 'order_count'}) #Column names requested in part A
)
df_new.head(10) #Display the first 10 rows

In [None]:
#Look up the same two customer ids in the grouped dataframe to confirm the aggregation
df_new[df_new['customer_id'].isin([8658, 5989])]

## B) Plot the count of orders per week for the store

In [None]:
#Keep only the order date and order value in a separate dataframe
sales = df.loc[:, ['date', 'value']]

In [None]:
#Index the dataframe by date and put the rows in chronological order
sales = sales.set_index('date').sort_index()

In [None]:
#Preview the first 10 rows of the date-indexed dataframe
sales.head(n=10)

In [None]:
#Set the dataframe to show sales by week
sales = sales.resample('W').sum()

In [None]:
#Examine the dataframe after setting the index and grouping sales by week
sales.head()

In [None]:
#Used a function to show the value columns with commas
def func(x, pos):  
   s = '{:0,d}'.format(int(x))
   return s

y_format = tkr.FuncFormatter(func)

plt.figure(figsize=(12,9))
plt.grid(b=None)
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)
plt.ylabel('Sales($)', fontsize=25)
plt.title('Weekly Order Count',fontsize=20)
ax = plt.subplot(111)
ax.plot(sales['value'],linewidth=5,color='red');
ax.yaxis.set_major_formatter(y_format)
plt.show()

## C) Compute the mean order value for gender 0 and for gender 1. Do you think the difference is significant?

In [None]:
#Mean order value for each gender, rounded to two decimal places and shown as a table
df.groupby('gender')['value'].mean().round(2).to_frame()

In [None]:
#Two-sample t-test on the order values of gender 0 vs gender 1 to check for significance
#NOTE(review): ttest_ind is called with its defaults here - confirm that the default
#variance assumption is appropriate for these two groups.
values_gender_0 = df.loc[df['gender'] == 0, 'value']
values_gender_1 = df.loc[df['gender'] == 1, 'value']
stats.ttest_ind(values_gender_0, values_gender_1)

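The p-value is 0.048.  Using an alpha (significance level) of 0.05, the p-value falls just below the threshold, so the difference in the mean order values for the two genders is statistically significant, but only marginally.  Because the result is so close to the cutoff, I would treat it as weak evidence and would not consider the difference practically meaningful without further investigation.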

## D) Assuming a single gender prediction was made for each customer, generate a confusion matrix for predicted gender. What does the confusion matrix tell you about the quality of the predictions?

In [None]:
#Build the confusion matrix once (rows = actual gender, columns = predicted gender)
#and unpack its four cells from that same matrix instead of computing it a second time.
#NOTE(review): the matrix is built from every row of df, i.e. one entry per order; the
#question assumes one prediction per customer - confirm whether df should be reduced
#to one row per customer_id first.
cm = confusion_matrix(df['gender'], df['predicted_gender'])
tn, fp, fn, tp = cm.ravel()

In [None]:
#Print the four confusion-matrix cells in the order tn, fp, fn, tp
for label, count in (('True 0: ', tn), ('False 1: ', fp), ('False 0: ', fn), ('True 1: ', tp)):
    print(label, count)

In [None]:
#Classification metrics, each expressed as a percentage rounded to two decimal places
total = tp + fn + tn + fp #Every prediction in the confusion matrix
accuracy = round((tp + tn) / total * 100, 2)
misclassification = round(100 - accuracy, 2) #Complement of the (rounded) accuracy
sensitivity = round(tp / (tp + fn) * 100, 2) #Share of actual class 1 predicted as 1
specificity = round(tn / (tn + fp) * 100, 2) #Share of actual class 0 predicted as 0
precision = round(tp / (tp + fp) * 100, 2) #Share of predicted class 1 that is actually 1
print('Accuracy Rate:', accuracy, '%')
print('Misclassification Rate:', misclassification, '%')
print('Sensitivity Rate:', sensitivity, '%')
print('Specificity Rate:', specificity, '%')
print('Precision Rate:', precision, '%')

In [None]:
#Label the confusion matrix: rows are the actual gender, columns the predicted gender
actual_labels = ['Actual 0', 'Actual 1']
predicted_labels = ['Predicted 0', 'Predicted 1']
cm_df = pd.DataFrame(data=cm, index=actual_labels, columns=predicted_labels)
cm_df

In [None]:
#Count how many rows fall in each gender class to see how balanced the classes are
df['gender'].value_counts().to_frame()

The accuracy score is about 64%.  Depending on your tolerance this may be a good score.  If possible, I might prefer to see a score closer to 80%.  Looking at the confusion matrix table, it appears that the model does a better job of predicting gender class 1 than it does predicting class 0.  The model correctly predicts class 0 about 50% of the time while it correctly predicts class 1 about 78% of the time.  I would want to further investigate why the model is better at predicting one class over the other especially because the classes are roughly balanced (there is very little bias to one class over the other).