In [1]:
import pandas as pd

### Quick preview of the 5 'restaurant' CSV files (most files are 'normalised', i.e. each table holds only the
### information relevant to its own category)

In [2]:
# Peek at the first five rows of the customers master table.
pd.read_csv('Restaurant - Customers.csv').head(5)

Unnamed: 0,ID,First Name,Last Name,Gender,Company,Occupation
0,1,Joseph,Perkins,Male,Dynazzy,Community Outreach Specialist
1,2,Jennifer,Alvarez,Female,DabZ,Senior Quality Engineer
2,3,Roger,Black,Male,Tagfeed,Account Executive
3,4,Steven,Evans,Male,Fatz,Registered Nurse
4,5,Judy,Morrison,Female,Demivee,Legal Assistant


In [3]:
# Peek at the first five rows of the foods lookup table.
pd.read_csv('Restaurant - Foods.csv').head(5)

Unnamed: 0,Food ID,Food Item,Price
0,1,Sushi,3.99
1,2,Burrito,9.99
2,3,Taco,2.99
3,4,Quesadilla,4.25
4,5,Pizza,2.49


In [4]:
# Peek at the first five week 1 sales rows (one Customer ID / Food ID pair per row).
pd.read_csv('Restaurant - Week 1 Sales.csv').head(5)

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9


In [5]:
# Peek at the first five week 1 satisfaction ratings.
pd.read_csv('Restaurant - Week 1 Satisfaction.csv').head(5)

Unnamed: 0,Satisfaction Rating
0,2
1,7
2,3
3,7
4,10


In [6]:
# Peek at the first five week 2 sales rows (same layout as week 1).
pd.read_csv('Restaurant - Week 2 Sales.csv').head(5)

Unnamed: 0,Customer ID,Food ID
0,688,10
1,813,7
2,495,10
3,189,5
4,267,3


### Load week1 and week2 customer data on what food had been purchased

In [7]:
# Read each week's sales log into its own DataFrame.
week1, week2 = (
    pd.read_csv(path)
    for path in ('Restaurant - Week 1 Sales.csv', 'Restaurant - Week 2 Sales.csv')
)

### Quick overview (all data is non-null)

In [8]:
# Summarise week 1 sales: row count, column dtypes and non-null counts.
week1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 2 columns):
Customer ID    250 non-null int64
Food ID        250 non-null int64
dtypes: int64(2)
memory usage: 4.0 KB


In [9]:
# Summarise week 2 sales: row count, column dtypes and non-null counts.
week2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 2 columns):
Customer ID    250 non-null int64
Food ID        250 non-null int64
dtypes: int64(2)
memory usage: 4.0 KB


### Unique Customer IDs to gauge how much of the business was repeat business (minimal: 221 and 224 unique IDs out of 250 rows per week)
### Unique Food IDs to gauge what sort of data diversity we are dealing with (only 10 unique foods)

In [11]:
# Number of distinct customers who bought something in week 1.
week1['Customer ID'].nunique()

221

In [12]:
# Number of distinct customers who bought something in week 2.
week2['Customer ID'].nunique()

224

In [13]:
# Number of distinct food items sold in week 1.
week1['Food ID'].nunique()

10

In [14]:
# Number of distinct food items sold in week 2.
week2['Food ID'].nunique()

10

### Customers master table: 1,000 unique customer IDs

In [15]:
# Load the customers master table (one row per customer ID).
customers = pd.read_csv(filepath_or_buffer='Restaurant - Customers.csv')

In [16]:
# Number of distinct customer IDs in the master table.
customers['ID'].nunique()

1000

### Foods, as predicted above, 10 food IDs, small table

In [17]:
# Load the foods lookup table (Food ID -> item name and price).
foods = pd.read_csv(filepath_or_buffer='Restaurant - Foods.csv')

In [18]:
# Number of distinct Food IDs in the lookup table.
foods['Food ID'].nunique()

10

### Concatenate 2 weeks' worth of customer data on food purchase, ignore original index

In [26]:
# Stack both weeks row-wise and renumber the result with a fresh 0..n-1 index.
weeks = pd.concat(
    objs=[week1, week2],
    ignore_index=True,
)

### 500 rows as expected

In [27]:
# Confirm the stacked frame's row count, dtypes and non-null counts.
weeks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
Customer ID    500 non-null int64
Food ID        500 non-null int64
dtypes: int64(2)
memory usage: 7.9 KB


### Concatenate 2 weeks' worth of customer data while maintaining info on file origin

In [35]:
# Stack both weeks but keep each frame's original index, adding an outer index level
# ('week 1' / 'week 2') that records which file every row came from.
week1_week2 = pd.concat({'week 1': week1, 'week 2': week2})

In [36]:
# Show the first rows; the leading index level carries the week label.
week1_week2.head()

Unnamed: 0,Unnamed: 1,Customer ID,Food ID
week 1,0,537,9
week 1,1,97,4
week 1,2,658,1
week 1,3,202,2
week 1,4,155,9


In [None]:
### The outer index level makes it easy to select only week 1's rows, for example

In [39]:
# Select every row under the 'week 1' outer index label.
# `.loc` is used because the `.ix` indexer was removed in pandas 1.0.
week1_week2.loc['week 1'].head()

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9


In [40]:
# Select one row by its full (week label, original row index) key.
# `.loc` is used because the `.ix` indexer was removed in pandas 1.0.
week1_week2.loc[('week 2', 200)]

Customer ID    751
Food ID          2
Name: (week 2, 200), dtype: int64

In [None]:
### I can get a tuple by requesting several columns for a particular indexed row. I could eventually use this as a
### dictionary key, for example, to count how frequently a particular customer buys a particular type of food.

In [43]:
# Read two cells of the same row; together they display as a (Food ID, Customer ID) tuple.
# `.loc` is used because the `.ix` indexer was removed in pandas 1.0.
week1_week2.loc[('week 2', 200), 'Food ID'], week1_week2.loc[('week 2', 200), 'Customer ID']

(2, 751)

In [45]:
# Use the (Food ID, Customer ID) pair of one row as a dictionary key, with a starting count of 0.
# `.loc` is used because the `.ix` indexer was removed in pandas 1.0.
dict_ = {(week1_week2.loc[('week 2', 200), 'Food ID'], week1_week2.loc[('week 2', 200), 'Customer ID']): 0}

In [46]:
# Display the dictionary keyed on the (Food ID, Customer ID) tuple.
dict_

{(2, 751): 0}

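### Using 'append' to stack the two frames, as `concat` did above (the result is only displayed, not saved to a variable). Note: `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0; `pd.concat` is the supported equivalent.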

In [110]:
# Stack week 2 under week 1 with a fresh index, without saving the result.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` produces the same frame.
pd.concat([week1, week2], ignore_index=True).head()


Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9


# Join files

## Inner Join, customers that purchased food products both in week 1 and week 2

In [145]:
# Inner join on Customer ID: keeps only customers present in both weeks, with one output
# row per (week 1 purchase, week 2 purchase) pair. Suffixes tell the two Food ID columns apart.
pd.merge(
    week1,
    week2,
    how='inner',
    on='Customer ID',
    suffixes=('_1', '_2'),
).head()

Unnamed: 0,Customer ID,Food ID_1,Food ID_2
0,537,9,5
1,155,9,3
2,155,1,3
3,503,5,8
4,503,5,9


## Inner Join on multiple columns: customers that purchased the same food in both week 1 and week 2

In [147]:
# Inner join on both key columns: a row survives only when the same customer bought
# the same food in both weeks.
pd.merge(week1, week2, how='inner', on=['Customer ID', 'Food ID'])

Unnamed: 0,Customer ID,Food ID
0,304,3
1,540,3
2,937,10
3,233,3
4,21,4
5,21,4
6,922,1
7,578,5
8,578,5


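### Why are some Customer IDs appearing more than once above? A merge emits one row per matching pair, so a customer with two identical purchases in one week and one in the other yields two rows.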

In [148]:
# Customer 21 came in more than once in week 1: two occurrences.
week1.loc[week1['Customer ID'] == 21]

Unnamed: 0,Customer ID,Food ID
101,21,4
212,21,4


In [151]:
# Week 2 purchases by customer 21.
week2.loc[week2['Customer ID'] == 21]

Unnamed: 0,Customer ID,Food ID
30,21,4


In [149]:
# Week 1 purchases by customer 578.
week1.loc[week1['Customer ID'] == 578]

Unnamed: 0,Customer ID,Food ID
224,578,5


In [150]:
# Customer 578 came in more than once in week 2: two occurrences.
week2.loc[week2['Customer ID'] == 578]

Unnamed: 0,Customer ID,Food ID
29,578,5
189,578,5


## Outer Joins 

### Full Outer Join

In [157]:
# Full outer join: keep every row from both weeks. NaN in a Food ID column means the
# customer had no matching purchase in that week; `_merge` names the side(s) each row came from.
pd.merge(
    week1,
    week2,
    how='outer',
    on='Customer ID',
    suffixes=('_week1', '_week2'),
    indicator=True,
).head()

Unnamed: 0,Customer ID,Food ID_week1,Food ID_week2,_merge
0,537.0,9.0,5.0,both
1,97.0,4.0,,left_only
2,658.0,1.0,,left_only
3,202.0,2.0,,left_only
4,155.0,9.0,3.0,both


In [160]:
# Same full outer join with the frames swapped: week 2 is now the left side,
# so 'left_only' means "week 2 only" and 'right_only' means "week 1 only".
pd.merge(
    week2,
    week1,
    how='outer',
    on='Customer ID',
    suffixes=('_week2', '_week1'),
    indicator=True,
).head()

Unnamed: 0,Customer ID,Food ID_week2,Food ID_week1,_merge
0,3.0,,2.0,right_only
1,8.0,6.0,,left_only
2,10.0,,2.0,right_only
3,13.0,2.0,,left_only
4,20.0,,1.0,right_only


In [162]:
# Save the full outer join (week 2 on the left, week 1 on the right) for further inspection.
merged = pd.merge(
    week2,
    week1,
    how='outer',
    on='Customer ID',
    suffixes=('_week2', '_week1'),
    indicator=True,
)

### Take a look at the '_merge' distribution

In [164]:
# Counts merged ROWS per origin, not unique customers: a customer with several purchases
# contributes several rows. left_only = rows found only in week 2, right_only = rows found
# only in week 1, both = matched purchase pairs.
merged['_merge'].value_counts()

left_only     197
right_only    195
both           62
Name: _merge, dtype: int64

### Outer join filtered to customers who appear in only one of the two weeks

In [169]:
# Keep only the rows that found no partner in the other week. These are purchase rows,
# so a customer who bought several times in their only week shows up several times.
mask = merged['_merge'] != 'both'
merged.loc[mask].head(10)

Unnamed: 0,Customer ID,Food ID_week2,Food ID_week1,_merge
0,688.0,10.0,,left_only
1,813.0,7.0,,left_only
2,495.0,10.0,,left_only
3,495.0,6.0,,left_only
4,495.0,2.0,,left_only
5,495.0,2.0,,left_only
7,267.0,3.0,,left_only
12,443.0,5.0,,left_only
13,729.0,9.0,,left_only
15,847.0,6.0,,left_only


## Left Join (closest to Excel's VLOOKUP)

In [None]:
### We focus on one particular DataFrame and check whether its key values also appear in another.
### The other dataset only matters where its rows can be matched to the one we are focusing on.

In [170]:
# Reminder of the left-hand table: one (Customer ID, Food ID) pair per sale.
week1.head()

Unnamed: 0,Customer ID,Food ID
0,537,9
1,97,4
2,658,1
3,202,2
4,155,9


In [171]:
# Reminder of the lookup table: Food ID with its item name and price.
foods.head()

Unnamed: 0,Food ID,Food Item,Price
0,1,Sushi,3.99
1,2,Burrito,9.99
2,3,Taco,2.99
3,4,Quesadilla,4.25
4,5,Pizza,2.49


In [176]:
# Left join: keep every week 1 sale and look up its food name and price by Food ID.
# sort=True orders the result by the join key.
pd.merge(
    week1,
    foods,
    how='left',
    on='Food ID',
    sort=True,
).head(20)

Unnamed: 0,Customer ID,Food ID,Food Item,Price
0,658,1,Sushi,3.99
1,600,1,Sushi,3.99
2,155,1,Sushi,3.99
3,341,1,Sushi,3.99
4,20,1,Sushi,3.99
5,77,1,Sushi,3.99
6,100,1,Sushi,3.99
7,953,1,Sushi,3.99
8,504,1,Sushi,3.99
9,323,1,Sushi,3.99


In [175]:
# Left join the other way round: keep every food and attach the week 1 customers who bought it.
pd.merge(foods, week1, how='left', on='Food ID').head(20)

Unnamed: 0,Food ID,Food Item,Price,Customer ID
0,1,Sushi,3.99,658
1,1,Sushi,3.99,600
2,1,Sushi,3.99,155
3,1,Sushi,3.99,341
4,1,Sushi,3.99,20
5,1,Sushi,3.99,77
6,1,Sushi,3.99,100
7,1,Sushi,3.99,953
8,1,Sushi,3.99,504
9,1,Sushi,3.99,323
