In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# set_palette installs tab10 as the active colour cycle; color_palette() only
# returns the colours, so calling it here without using the result did nothing.
sns.set_palette("tab10")
from scipy import stats
from sklearn.model_selection import train_test_split
import os

Dataset from: https://www.kaggle.com/datasets/thedevastator/analyzing-customer-spending-habits-to-improve-sa?resource=download

In [2]:
def acquire_data(filename='SalesForCourse_quizz_table.csv'):
    '''
    Load the sales dataset from a local CSV file.

    Parameters
    ----------
    filename : str, optional
        Path of the CSV file to read. Defaults to
        'SalesForCourse_quizz_table.csv' in the current working directory.

    Returns
    -------
    pandas.DataFrame
        The file contents, read with ``index_col=False`` so that no data
        column is used as the row index.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist. (The earlier version printed a
        message and then failed with UnboundLocalError because it tried to
        return a variable that had never been assigned.)
    '''
    if not os.path.isfile(filename):
        raise FileNotFoundError(f'Data Not Found: {filename}')
    return pd.read_csv(filename, index_col=False)

In [3]:
df = acquire_data()
df

Unnamed: 0,index,Date,Year,Month,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Quantity,Unit Cost,Unit Price,Cost,Revenue,Column1
0,0,02/19/16,2016.0,February,29.0,F,United States,Washington,Accessories,Tires and Tubes,1.0,80.00,109.000000,80.0,109.000000,
1,1,02/20/16,2016.0,February,29.0,F,United States,Washington,Clothing,Gloves,2.0,24.50,28.500000,49.0,57.000000,
2,2,02/27/16,2016.0,February,29.0,F,United States,Washington,Accessories,Tires and Tubes,3.0,3.67,5.000000,11.0,15.000000,
3,3,03/12/16,2016.0,March,29.0,F,United States,Washington,Accessories,Tires and Tubes,2.0,87.50,116.500000,175.0,233.000000,
4,4,03/12/16,2016.0,March,29.0,F,United States,Washington,Accessories,Tires and Tubes,3.0,35.00,41.666667,105.0,125.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34862,34862,02/07/16,2016.0,February,38.0,M,France,Hauts de Seine,Bikes,Mountain Bikes,2.0,1160.00,985.500000,2320.0,1971.000000,
34863,34863,03/13/15,2015.0,March,38.0,M,France,Hauts de Seine,Bikes,Mountain Bikes,1.0,2049.00,1583.000000,2049.0,1583.000000,
34864,34864,04/05/15,2015.0,April,38.0,M,France,Hauts de Seine,Bikes,Mountain Bikes,3.0,683.00,560.666667,2049.0,1682.000000,
34865,34865,08/30/15,2015.0,August,38.0,M,France,Hauts de Seine,Bikes,Mountain Bikes,1.0,2320.00,1568.000000,2320.0,1568.000000,


In [4]:
df.describe()

Unnamed: 0,index,Year,Customer Age,Quantity,Unit Cost,Unit Price,Cost,Revenue,Column1
count,34867.0,34866.0,34866.0,34866.0,34866.0,34866.0,34866.0,34867.0,2574.0
mean,17433.0,2015.569237,36.382895,2.002524,349.880567,389.232485,576.004532,640.870093,688.054913
std,10065.380254,0.49519,11.112902,0.813936,490.015846,525.319091,690.500395,736.640033,774.200897
min,0.0,2015.0,17.0,1.0,0.67,0.666667,2.0,2.0,2.0
25%,8716.5,2015.0,28.0,1.0,45.0,53.666667,85.0,102.0,104.25
50%,17433.0,2016.0,35.0,2.0,150.0,179.0,261.0,319.0,390.5
75%,26149.5,2016.0,44.0,3.0,455.0,521.0,769.0,902.0,975.75
max,34866.0,2016.0,87.0,3.0,3240.0,5082.0,3600.0,5082.0,3681.0


In [5]:
df.nunique()

index               34867
Date                  576
Year                    2
Month                  12
Customer Age           70
Customer Gender         2
Country                 4
State                  45
Product Category        3
Sub Category           17
Quantity                3
Unit Cost             882
Unit Price           5175
Cost                  417
Revenue              3023
Column1              1281
dtype: int64

In [6]:
df[df['Date'].isnull()]

Unnamed: 0,index,Date,Year,Month,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Quantity,Unit Cost,Unit Price,Cost,Revenue,Column1
34866,34866,,,,,,,,,,,,,,641.532095,


In [7]:
# In this file the only row without a Date is the trailing one shown above,
# which holds nothing but a Revenue value, so it is removed.
df=df.dropna(subset=['Date'])


In [8]:
# discard the 'Column1' column
df = df.drop(columns=['Column1'])


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34866 entries, 0 to 34865
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             34866 non-null  int64  
 1   Date              34866 non-null  object 
 2   Year              34866 non-null  float64
 3   Month             34866 non-null  object 
 4   Customer Age      34866 non-null  float64
 5   Customer Gender   34866 non-null  object 
 6   Country           34866 non-null  object 
 7   State             34866 non-null  object 
 8   Product Category  34866 non-null  object 
 9   Sub Category      34866 non-null  object 
 10  Quantity          34866 non-null  float64
 11  Unit Cost         34866 non-null  float64
 12  Unit Price        34866 non-null  float64
 13  Cost              34866 non-null  float64
 14  Revenue           34866 non-null  float64
dtypes: float64(7), int64(1), object(7)
memory usage: 4.3+ MB


In [10]:
# parse the Date strings into datetime values
df = df.assign(Date=pd.to_datetime(df["Date"]))

In [11]:
# add a single 'YYYY-MM' label column derived from Date
df = df.assign(Year_Month=df['Date'].dt.strftime('%Y-%m'))

In [12]:
# which countries am i working with?
print(df['Country'].unique())


['United States' 'France' 'United Kingdom' 'Germany']


In [13]:
# Margin = Revenue minus Cost for each row
df['Margin'] = df['Revenue'].sub(df['Cost'])

In [14]:
# Unit_Margin = Unit Price minus Unit Cost for each row
df['Unit_Margin'] = df['Unit Price'].sub(df['Unit Cost'])


In [15]:
df

Unnamed: 0,index,Date,Year,Month,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Quantity,Unit Cost,Unit Price,Cost,Revenue,Year_Month,Margin,Unit_Margin
0,0,2016-02-19,2016.0,February,29.0,F,United States,Washington,Accessories,Tires and Tubes,1.0,80.00,109.000000,80.0,109.0,2016-02,29.0,29.000000
1,1,2016-02-20,2016.0,February,29.0,F,United States,Washington,Clothing,Gloves,2.0,24.50,28.500000,49.0,57.0,2016-02,8.0,4.000000
2,2,2016-02-27,2016.0,February,29.0,F,United States,Washington,Accessories,Tires and Tubes,3.0,3.67,5.000000,11.0,15.0,2016-02,4.0,1.330000
3,3,2016-03-12,2016.0,March,29.0,F,United States,Washington,Accessories,Tires and Tubes,2.0,87.50,116.500000,175.0,233.0,2016-03,58.0,29.000000
4,4,2016-03-12,2016.0,March,29.0,F,United States,Washington,Accessories,Tires and Tubes,3.0,35.00,41.666667,105.0,125.0,2016-03,20.0,6.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34861,34861,2015-03-22,2015.0,March,38.0,M,France,Charente-Maritime,Bikes,Mountain Bikes,1.0,2049.00,1487.000000,2049.0,1487.0,2015-03,-562.0,-562.000000
34862,34862,2016-02-07,2016.0,February,38.0,M,France,Hauts de Seine,Bikes,Mountain Bikes,2.0,1160.00,985.500000,2320.0,1971.0,2016-02,-349.0,-174.500000
34863,34863,2015-03-13,2015.0,March,38.0,M,France,Hauts de Seine,Bikes,Mountain Bikes,1.0,2049.00,1583.000000,2049.0,1583.0,2015-03,-466.0,-466.000000
34864,34864,2015-04-05,2015.0,April,38.0,M,France,Hauts de Seine,Bikes,Mountain Bikes,3.0,683.00,560.666667,2049.0,1682.0,2015-04,-367.0,-122.333333


In [16]:
# the 'index' column only repeats the row labels, so drop it
df = df.drop(columns=['index'])


In [17]:
df

Unnamed: 0,Date,Year,Month,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Quantity,Unit Cost,Unit Price,Cost,Revenue,Year_Month,Margin,Unit_Margin
0,2016-02-19,2016.0,February,29.0,F,United States,Washington,Accessories,Tires and Tubes,1.0,80.00,109.000000,80.0,109.0,2016-02,29.0,29.000000
1,2016-02-20,2016.0,February,29.0,F,United States,Washington,Clothing,Gloves,2.0,24.50,28.500000,49.0,57.0,2016-02,8.0,4.000000
2,2016-02-27,2016.0,February,29.0,F,United States,Washington,Accessories,Tires and Tubes,3.0,3.67,5.000000,11.0,15.0,2016-02,4.0,1.330000
3,2016-03-12,2016.0,March,29.0,F,United States,Washington,Accessories,Tires and Tubes,2.0,87.50,116.500000,175.0,233.0,2016-03,58.0,29.000000
4,2016-03-12,2016.0,March,29.0,F,United States,Washington,Accessories,Tires and Tubes,3.0,35.00,41.666667,105.0,125.0,2016-03,20.0,6.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34861,2015-03-22,2015.0,March,38.0,M,France,Charente-Maritime,Bikes,Mountain Bikes,1.0,2049.00,1487.000000,2049.0,1487.0,2015-03,-562.0,-562.000000
34862,2016-02-07,2016.0,February,38.0,M,France,Hauts de Seine,Bikes,Mountain Bikes,2.0,1160.00,985.500000,2320.0,1971.0,2016-02,-349.0,-174.500000
34863,2015-03-13,2015.0,March,38.0,M,France,Hauts de Seine,Bikes,Mountain Bikes,1.0,2049.00,1583.000000,2049.0,1583.0,2015-03,-466.0,-466.000000
34864,2015-04-05,2015.0,April,38.0,M,France,Hauts de Seine,Bikes,Mountain Bikes,3.0,683.00,560.666667,2049.0,1682.0,2015-04,-367.0,-122.333333


In [18]:
# put the rows in chronological order
df = df.sort_values(by='Date')

In [19]:
# renumber the rows 0..n-1 in their current order, discarding the old labels
df = df.reset_index(drop=True)

In [20]:
df

Unnamed: 0,Date,Year,Month,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Quantity,Unit Cost,Unit Price,Cost,Revenue,Year_Month,Margin,Unit_Margin
0,2015-01-01,2015.0,January,37.0,F,Germany,Hamburg,Bikes,Road Bikes,2.0,1091.00,1272.500000,2182.0,2545.0,2015-01,363.0,181.500000
1,2015-01-01,2015.0,January,19.0,F,United States,Washington,Bikes,Road Bikes,1.0,1000.00,938.000000,1000.0,938.0,2015-01,-62.0,-62.000000
2,2015-01-01,2015.0,January,29.0,F,United States,Florida,Bikes,Road Bikes,3.0,261.00,230.666667,783.0,692.0,2015-01,-91.0,-30.333333
3,2015-01-01,2015.0,January,29.0,M,United States,Oregon,Bikes,Road Bikes,3.0,261.00,233.000000,783.0,699.0,2015-01,-84.0,-28.000000
4,2015-01-01,2015.0,January,36.0,F,United States,California,Bikes,Mountain Bikes,2.0,1035.50,896.000000,2071.0,1792.0,2015-01,-279.0,-139.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34861,2016-07-31,2016.0,July,51.0,M,Germany,Saarland,Clothing,Socks,3.0,57.00,79.000000,171.0,237.0,2016-07,66.0,22.000000
34862,2016-07-31,2016.0,July,42.0,F,United States,California,Accessories,Tires and Tubes,3.0,13.33,15.000000,40.0,45.0,2016-07,5.0,1.670000
34863,2016-07-31,2016.0,July,40.0,F,United States,Washington,Accessories,Tires and Tubes,1.0,60.00,64.000000,60.0,64.0,2016-07,4.0,4.000000
34864,2016-07-31,2016.0,July,36.0,F,United States,Washington,Accessories,Tires and Tubes,2.0,4.50,5.500000,9.0,11.0,2016-07,2.0,1.000000


In [21]:
# fraction of the rows assigned to the training set; the remainder is the test set
train_size = 0.7


In [22]:
# row position at which the frame is cut into train (before) and test (from here on)
train_index = round(train_size * len(df))

In [23]:
train_index


24406

In [24]:
df

Unnamed: 0,Date,Year,Month,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Quantity,Unit Cost,Unit Price,Cost,Revenue,Year_Month,Margin,Unit_Margin
0,2015-01-01,2015.0,January,37.0,F,Germany,Hamburg,Bikes,Road Bikes,2.0,1091.00,1272.500000,2182.0,2545.0,2015-01,363.0,181.500000
1,2015-01-01,2015.0,January,19.0,F,United States,Washington,Bikes,Road Bikes,1.0,1000.00,938.000000,1000.0,938.0,2015-01,-62.0,-62.000000
2,2015-01-01,2015.0,January,29.0,F,United States,Florida,Bikes,Road Bikes,3.0,261.00,230.666667,783.0,692.0,2015-01,-91.0,-30.333333
3,2015-01-01,2015.0,January,29.0,M,United States,Oregon,Bikes,Road Bikes,3.0,261.00,233.000000,783.0,699.0,2015-01,-84.0,-28.000000
4,2015-01-01,2015.0,January,36.0,F,United States,California,Bikes,Mountain Bikes,2.0,1035.50,896.000000,2071.0,1792.0,2015-01,-279.0,-139.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34861,2016-07-31,2016.0,July,51.0,M,Germany,Saarland,Clothing,Socks,3.0,57.00,79.000000,171.0,237.0,2016-07,66.0,22.000000
34862,2016-07-31,2016.0,July,42.0,F,United States,California,Accessories,Tires and Tubes,3.0,13.33,15.000000,40.0,45.0,2016-07,5.0,1.670000
34863,2016-07-31,2016.0,July,40.0,F,United States,Washington,Accessories,Tires and Tubes,1.0,60.00,64.000000,60.0,64.0,2016-07,4.0,4.000000
34864,2016-07-31,2016.0,July,36.0,F,United States,Washington,Accessories,Tires and Tubes,2.0,4.50,5.500000,9.0,11.0,2016-07,2.0,1.000000


In [25]:
# positional cut: the first train_index rows go to train, everything after to test
train = df.reset_index(drop=True).iloc[:train_index]
test = df.reset_index(drop=True).iloc[train_index:]

In [26]:
train.head()

Unnamed: 0,Date,Year,Month,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Quantity,Unit Cost,Unit Price,Cost,Revenue,Year_Month,Margin,Unit_Margin
0,2015-01-01,2015.0,January,37.0,F,Germany,Hamburg,Bikes,Road Bikes,2.0,1091.0,1272.5,2182.0,2545.0,2015-01,363.0,181.5
1,2015-01-01,2015.0,January,19.0,F,United States,Washington,Bikes,Road Bikes,1.0,1000.0,938.0,1000.0,938.0,2015-01,-62.0,-62.0
2,2015-01-01,2015.0,January,29.0,F,United States,Florida,Bikes,Road Bikes,3.0,261.0,230.666667,783.0,692.0,2015-01,-91.0,-30.333333
3,2015-01-01,2015.0,January,29.0,M,United States,Oregon,Bikes,Road Bikes,3.0,261.0,233.0,783.0,699.0,2015-01,-84.0,-28.0
4,2015-01-01,2015.0,January,36.0,F,United States,California,Bikes,Mountain Bikes,2.0,1035.5,896.0,2071.0,1792.0,2015-01,-279.0,-139.5


In [27]:
train.tail()

Unnamed: 0,Date,Year,Month,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Quantity,Unit Cost,Unit Price,Cost,Revenue,Year_Month,Margin,Unit_Margin
24401,2016-04-09,2016.0,April,28.0,F,France,Essonne,Accessories,Bottles and Cages,3.0,45.0,73.666667,135.0,221.0,2016-04,86.0,28.666667
24402,2016-04-09,2016.0,April,45.0,F,United States,Washington,Accessories,Tires and Tubes,3.0,26.67,31.0,80.0,93.0,2016-04,13.0,4.33
24403,2016-04-09,2016.0,April,28.0,F,France,Essonne,Accessories,Bottles and Cages,2.0,58.5,79.0,117.0,158.0,2016-04,41.0,20.5
24404,2016-04-09,2016.0,April,28.0,F,France,Essonne,Clothing,Jerseys,3.0,234.0,306.333333,702.0,919.0,2016-04,217.0,72.333333
24405,2016-04-09,2016.0,April,42.0,M,Germany,Nordrhein-Westfalen,Bikes,Mountain Bikes,1.0,2320.0,3263.0,2320.0,3263.0,2016-04,943.0,943.0


In [28]:
test.head()

Unnamed: 0,Date,Year,Month,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Quantity,Unit Cost,Unit Price,Cost,Revenue,Year_Month,Margin,Unit_Margin
24406,2016-04-09,2016.0,April,44.0,F,United States,California,Clothing,Jerseys,1.0,594.0,684.0,594.0,684.0,2016-04,90.0,90.0
24407,2016-04-09,2016.0,April,29.0,M,United Kingdom,England,Bikes,Mountain Bikes,3.0,765.0,809.333333,2295.0,2428.0,2016-04,133.0,44.333333
24408,2016-04-09,2016.0,April,36.0,F,United States,Washington,Clothing,Caps,2.0,9.0,11.0,18.0,22.0,2016-04,4.0,2.0
24409,2016-04-09,2016.0,April,17.0,F,United States,California,Clothing,Caps,3.0,45.0,51.0,135.0,153.0,2016-04,18.0,6.0
24410,2016-04-09,2016.0,April,31.0,M,United States,Washington,Clothing,Caps,1.0,216.0,282.0,216.0,282.0,2016-04,66.0,66.0


In [29]:
test.tail()

Unnamed: 0,Date,Year,Month,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Quantity,Unit Cost,Unit Price,Cost,Revenue,Year_Month,Margin,Unit_Margin
34861,2016-07-31,2016.0,July,51.0,M,Germany,Saarland,Clothing,Socks,3.0,57.0,79.0,171.0,237.0,2016-07,66.0,22.0
34862,2016-07-31,2016.0,July,42.0,F,United States,California,Accessories,Tires and Tubes,3.0,13.33,15.0,40.0,45.0,2016-07,5.0,1.67
34863,2016-07-31,2016.0,July,40.0,F,United States,Washington,Accessories,Tires and Tubes,1.0,60.0,64.0,60.0,64.0,2016-07,4.0,4.0
34864,2016-07-31,2016.0,July,36.0,F,United States,Washington,Accessories,Tires and Tubes,2.0,4.5,5.5,9.0,11.0,2016-07,2.0,1.0
34865,2016-07-31,2016.0,July,23.0,M,Germany,Saarland,Accessories,Tires and Tubes,2.0,32.0,47.0,64.0,94.0,2016-07,30.0,15.0


In [30]:
# make Date the row index of both halves
train, test = (part.set_index('Date') for part in (train, test))

In [31]:
train.tail()

Unnamed: 0_level_0,Year,Month,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Quantity,Unit Cost,Unit Price,Cost,Revenue,Year_Month,Margin,Unit_Margin
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2016-04-09,2016.0,April,28.0,F,France,Essonne,Accessories,Bottles and Cages,3.0,45.0,73.666667,135.0,221.0,2016-04,86.0,28.666667
2016-04-09,2016.0,April,45.0,F,United States,Washington,Accessories,Tires and Tubes,3.0,26.67,31.0,80.0,93.0,2016-04,13.0,4.33
2016-04-09,2016.0,April,28.0,F,France,Essonne,Accessories,Bottles and Cages,2.0,58.5,79.0,117.0,158.0,2016-04,41.0,20.5
2016-04-09,2016.0,April,28.0,F,France,Essonne,Clothing,Jerseys,3.0,234.0,306.333333,702.0,919.0,2016-04,217.0,72.333333
2016-04-09,2016.0,April,42.0,M,Germany,Nordrhein-Westfalen,Bikes,Mountain Bikes,1.0,2320.0,3263.0,2320.0,3263.0,2016-04,943.0,943.0


In [32]:
test.head()

Unnamed: 0_level_0,Year,Month,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Quantity,Unit Cost,Unit Price,Cost,Revenue,Year_Month,Margin,Unit_Margin
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2016-04-09,2016.0,April,44.0,F,United States,California,Clothing,Jerseys,1.0,594.0,684.0,594.0,684.0,2016-04,90.0,90.0
2016-04-09,2016.0,April,29.0,M,United Kingdom,England,Bikes,Mountain Bikes,3.0,765.0,809.333333,2295.0,2428.0,2016-04,133.0,44.333333
2016-04-09,2016.0,April,36.0,F,United States,Washington,Clothing,Caps,2.0,9.0,11.0,18.0,22.0,2016-04,4.0,2.0
2016-04-09,2016.0,April,17.0,F,United States,California,Clothing,Caps,3.0,45.0,51.0,135.0,153.0,2016-04,18.0,6.0
2016-04-09,2016.0,April,31.0,M,United States,Washington,Clothing,Caps,1.0,216.0,282.0,216.0,282.0,2016-04,66.0,66.0


In [33]:
def wrangle():
    '''
    Load the raw sales CSV via acquire_data() and return a cleaned,
    date-ordered DataFrame.

    Processing, in order:
      1. remove rows whose 'Date' is null
      2. remove the 'Column1' column
      3. parse 'Date' into datetime values
      4. add 'Year_Month' ('YYYY-MM' text taken from 'Date')
      5. add 'Margin' (Revenue - Cost) and 'Unit_Margin' (Unit Price - Unit Cost)
      6. remove the 'index' column
      7. sort by 'Date' and renumber the rows from 0

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, sorted by 'Date', with a fresh 0..n-1 row index.
    '''
    raw = acquire_data()

    # keep only dated rows and shed the 'Column1' column
    dated = raw.dropna(subset=['Date']).drop(columns=['Column1'])

    # Date has to be parsed first because Year_Month is derived from it
    enriched = dated.assign(Date=pd.to_datetime(dated['Date']))
    enriched = enriched.assign(
        Year_Month=enriched['Date'].dt.strftime('%Y-%m'),
        Margin=enriched['Revenue'] - enriched['Cost'],
        Unit_Margin=enriched['Unit Price'] - enriched['Unit Cost'],
    )

    return (
        enriched
        .drop(columns=['index'])
        .sort_values('Date')
        .reset_index(drop=True)
    )
    

In [34]:
wrangle()

Unnamed: 0,Date,Year,Month,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Quantity,Unit Cost,Unit Price,Cost,Revenue,Year_Month,Margin,Unit_Margin
0,2015-01-01,2015.0,January,37.0,F,Germany,Hamburg,Bikes,Road Bikes,2.0,1091.00,1272.500000,2182.0,2545.0,2015-01,363.0,181.500000
1,2015-01-01,2015.0,January,19.0,F,United States,Washington,Bikes,Road Bikes,1.0,1000.00,938.000000,1000.0,938.0,2015-01,-62.0,-62.000000
2,2015-01-01,2015.0,January,29.0,F,United States,Florida,Bikes,Road Bikes,3.0,261.00,230.666667,783.0,692.0,2015-01,-91.0,-30.333333
3,2015-01-01,2015.0,January,29.0,M,United States,Oregon,Bikes,Road Bikes,3.0,261.00,233.000000,783.0,699.0,2015-01,-84.0,-28.000000
4,2015-01-01,2015.0,January,36.0,F,United States,California,Bikes,Mountain Bikes,2.0,1035.50,896.000000,2071.0,1792.0,2015-01,-279.0,-139.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34861,2016-07-31,2016.0,July,51.0,M,Germany,Saarland,Clothing,Socks,3.0,57.00,79.000000,171.0,237.0,2016-07,66.0,22.000000
34862,2016-07-31,2016.0,July,42.0,F,United States,California,Accessories,Tires and Tubes,3.0,13.33,15.000000,40.0,45.0,2016-07,5.0,1.670000
34863,2016-07-31,2016.0,July,40.0,F,United States,Washington,Accessories,Tires and Tubes,1.0,60.00,64.000000,60.0,64.0,2016-07,4.0,4.000000
34864,2016-07-31,2016.0,July,36.0,F,United States,Washington,Accessories,Tires and Tubes,2.0,4.50,5.500000,9.0,11.0,2016-07,2.0,1.000000


In [35]:
def train_test_split(train_size=0.7, df=None):
    '''
    Split the sales data positionally into an earlier (train) and a later
    (test) part, each indexed by 'Date'.

    NOTE: this definition shadows ``sklearn.model_selection.train_test_split``
    imported at the top of the notebook; after this cell runs, the name refers
    to this function, not to scikit-learn's.

    Parameters
    ----------
    train_size : float, optional
        Fraction of the rows placed in the training set; must satisfy
        0 < train_size < 1. Defaults to 0.7.
    df : pandas.DataFrame, optional
        Frame to split. It must still have 'Date' as a regular column and is
        expected to be in the order the split should follow. When omitted,
        the result of ``wrangle()`` is used.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        The first ``round(train_size * len(df))`` rows and the remaining rows,
        each with 'Date' set as the index.

    Raises
    ------
    ValueError
        If ``train_size`` is not strictly between 0 and 1.
    '''
    if not 0 < train_size < 1:
        raise ValueError(
            'train_size must be strictly between 0 and 1, got {!r}'.format(train_size)
        )

    if df is None:
        df = wrangle()

    train_index = round(train_size * df.shape[0])

    # The cut is by row position, not by calendar date, so rows that share the
    # boundary date can end up on both sides of the split.
    train = df.iloc[:train_index].set_index('Date')
    test = df.iloc[train_index:].set_index('Date')
    return train, test

In [36]:
train, test = train_test_split()

In [37]:
train

Unnamed: 0_level_0,Year,Month,Customer Age,Customer Gender,Country,State,Product Category,Sub Category,Quantity,Unit Cost,Unit Price,Cost,Revenue,Year_Month,Margin,Unit_Margin
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2015-01-01,2015.0,January,37.0,F,Germany,Hamburg,Bikes,Road Bikes,2.0,1091.00,1272.500000,2182.0,2545.0,2015-01,363.0,181.500000
2015-01-01,2015.0,January,19.0,F,United States,Washington,Bikes,Road Bikes,1.0,1000.00,938.000000,1000.0,938.0,2015-01,-62.0,-62.000000
2015-01-01,2015.0,January,29.0,F,United States,Florida,Bikes,Road Bikes,3.0,261.00,230.666667,783.0,692.0,2015-01,-91.0,-30.333333
2015-01-01,2015.0,January,29.0,M,United States,Oregon,Bikes,Road Bikes,3.0,261.00,233.000000,783.0,699.0,2015-01,-84.0,-28.000000
2015-01-01,2015.0,January,36.0,F,United States,California,Bikes,Mountain Bikes,2.0,1035.50,896.000000,2071.0,1792.0,2015-01,-279.0,-139.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-04-09,2016.0,April,28.0,F,France,Essonne,Accessories,Bottles and Cages,3.0,45.00,73.666667,135.0,221.0,2016-04,86.0,28.666667
2016-04-09,2016.0,April,45.0,F,United States,Washington,Accessories,Tires and Tubes,3.0,26.67,31.000000,80.0,93.0,2016-04,13.0,4.330000
2016-04-09,2016.0,April,28.0,F,France,Essonne,Accessories,Bottles and Cages,2.0,58.50,79.000000,117.0,158.0,2016-04,41.0,20.500000
2016-04-09,2016.0,April,28.0,F,France,Essonne,Clothing,Jerseys,3.0,234.00,306.333333,702.0,919.0,2016-04,217.0,72.333333
