# GetGround Data Task

## Goal: 
- Insert this data into a SQL database on your local machine;
- Run some SQL queries and analysis;
- Document, explain and visualize your response to the questions asked.

## Workflow:

- 1. Understand the data
- 2. Load raw data (CSVs) into Postgres
- 3. Install dbt and connect to Postgres (Docker?)
- 4. Create models
- 5. Analyze the data
- 6. Report

## Imports and settings

In [1]:
import pandas as pd
import numpy as np
import datetime

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
# Read the partners CSV into a DataFrame and print its column/dtype summary.
partners_csv = '/home/developer/Desktop/testes/tech-tests/GetGround_Analytics_Engineer_Data/partners.csv'
partners = pd.read_csv(partners_csv)
partners.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522 entries, 0 to 521
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  522 non-null    int64  
 1   created_at          522 non-null    float64
 2   updated_at          522 non-null    float64
 3   partner_type        522 non-null    object 
 4   lead_sales_contact  522 non-null    object 
dtypes: float64(2), int64(1), object(2)
memory usage: 20.5+ KB


In [3]:
partners.head()

Unnamed: 0,id,created_at,updated_at,partner_type,lead_sales_contact
0,2,1.598856e+18,1.607047e+18,Agent,Potato
1,3,1.598859e+18,1.618882e+18,Agent,Lion
2,4,1.598859e+18,1.616642e+18,Agent,Potato
3,5,1.59886e+18,1.607331e+18,Agent,Lion
4,6,1.598866e+18,1.609743e+18,Agent,Potato


In [4]:
# Read the referrals CSV into a DataFrame and print its column/dtype summary.
referrals_csv = '/home/developer/Desktop/testes/tech-tests/GetGround_Analytics_Engineer_Data/referrals.csv'
referrals = pd.read_csv(referrals_csv)
referrals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1470 non-null   int64  
 1   created_at     1470 non-null   float64
 2   updated_at     1470 non-null   float64
 3   company_id     1470 non-null   int64  
 4   partner_id     1470 non-null   int64  
 5   consultant_id  1470 non-null   int64  
 6   status         1470 non-null   object 
 7   is_outbound    1470 non-null   int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 92.0+ KB


In [6]:
referrals.head()

Unnamed: 0,id,created_at,updated_at,company_id,partner_id,consultant_id,status,is_outbound
0,1,1.598956e+18,1.598956e+18,385,4,4,successful,0
1,2,1.599105e+18,1.599105e+18,390,7,8,successful,0
2,3,1.599105e+18,1.599105e+18,387,7,8,successful,0
3,4,1.599105e+18,1.599105e+18,385,7,8,successful,0
4,5,1.599106e+18,1.599106e+18,331,8,9,successful,0


In [8]:
referrals.status.unique()

array(['successful', 'disinterested', 'pending'], dtype=object)

In [9]:
referrals.columns

Index(['id', 'created_at', 'updated_at', 'company_id', 'partner_id',
       'consultant_id', 'status', 'is_outbound'],
      dtype='object')

In [10]:
# Read the sales_people CSV into a DataFrame and print its column/dtype summary.
sales_people_csv = '/home/developer/Desktop/testes/tech-tests/GetGround_Analytics_Engineer_Data/sales_people.csv'
sales_people = pd.read_csv(sales_people_csv)
sales_people.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     12 non-null     object
 1   country  12 non-null     object
dtypes: object(2)
memory usage: 320.0+ bytes


In [11]:
sales_people.columns

Index(['name', 'country'], dtype='object')

In [49]:
sales_people.head()

Unnamed: 0,name,country
0,Orange,Singapore
1,Apple,Singapore
2,Lion,HongKong
3,Tree,HongKong
4,Root,HongKong


In [51]:
sales_people.name.unique()

array(['Orange', 'Apple', 'Lion', 'Tree', 'Root', 'Sky', 'Cloud', 'Horiz',
       'Leaf', 'Daisy', 'Tulip', 'Fig'], dtype=object)

In [52]:
sales_people.country.unique()

array(['Singapore', 'HongKong', 'UK'], dtype=object)

In [13]:
# partners.lead_sales_contact values correspond to sales_people.name

In [17]:
# Distinct values of sales_people.name, in ascending (alphabetical) order.
list1 = sorted(sales_people['name'].unique())
list1

['Apple',
 'Cloud',
 'Daisy',
 'Fig',
 'Horiz',
 'Leaf',
 'Lion',
 'Orange',
 'Root',
 'Sky',
 'Tree',
 'Tulip']

In [18]:
# Distinct values of partners.lead_sales_contact, in ascending order.
# Note: the values include a '0' entry.
list2 = sorted(partners['lead_sales_contact'].unique())
list2

['0',
 'Apple',
 'Cloud',
 'Daisy',
 'Fig',
 'Horiz',
 'Leaf',
 'Lion',
 'Potato',
 'Root',
 'Sky',
 'Tree',
 'Tulip']

In [22]:
# Entries that appear in list1 but not in list2.
diff = set(list1).difference(list2)
diff

{'Orange'}

In [23]:
# Entries that appear in list2 but not in list1.
diff = set(list2).difference(list1)
diff

{'0', 'Potato'}

In [24]:
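# 'Orange' is in sales_people.name but not in partners.lead_sales_contact;
# '0' and 'Potato' are in partners.lead_sales_contact but not in sales_people.name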

In [25]:
partners.isnull().sum()

id                    0
created_at            0
updated_at            0
partner_type          0
lead_sales_contact    0
dtype: int64

In [29]:
# Show partner rows whose created_at equals 0.
partners.loc[partners['created_at'].eq(0)]

Unnamed: 0,id,created_at,updated_at,partner_type,lead_sales_contact


In [30]:
# Show partner rows whose updated_at equals 0.
partners.loc[partners['updated_at'].eq(0)]

Unnamed: 0,id,created_at,updated_at,partner_type,lead_sales_contact


In [31]:
# Show referral rows whose created_at equals 0.
referrals.loc[referrals['created_at'].eq(0)]

Unnamed: 0,id,created_at,updated_at,company_id,partner_id,consultant_id,status,is_outbound


In [32]:
# Show referral rows whose updated_at equals 0.
referrals.loc[referrals['updated_at'].eq(0)]

Unnamed: 0,id,created_at,updated_at,company_id,partner_id,consultant_id,status,is_outbound


In [33]:
referrals.shape

(1470, 8)

In [34]:
partners.shape

(522, 5)

In [35]:
# Number of distinct referral ids (compare against the row count to spot duplicates).
referrals['id'].nunique(dropna=False)

1470

In [36]:
# Number of distinct partner ids (compare against the row count to spot duplicates).
partners['id'].nunique(dropna=False)

522

In [39]:
partners.dtypes

id                      int64
created_at            float64
updated_at            float64
partner_type           object
lead_sales_contact     object
dtype: object

In [42]:
# Sanity-check the timestamps: cast both timestamp columns to datetime64[ns]
# in a new DataFrame and inspect the earliest created_at value.
datetime_dtypes = dict.fromkeys(['created_at', 'updated_at'], 'datetime64[ns]')
partner_corrected = partners.astype(datetime_dtypes)
partner_corrected['created_at'].min()

Timestamp('2020-08-31 06:47:46.322480128')

In [44]:
# Add a column holding the character length of each created_at value
# once it is rendered as a string, then preview the frame.
created_at_text = partners['created_at'].astype(str)
partners['created_at_len'] = created_at_text.apply(len)
partners.head()

Unnamed: 0,id,created_at,updated_at,partner_type,lead_sales_contact,created_at_len
0,2,1.598856e+18,1.607047e+18,Agent,Potato,20
1,3,1.598859e+18,1.618882e+18,Agent,Lion,20
2,4,1.598859e+18,1.616642e+18,Agent,Potato,20
3,5,1.59886e+18,1.607331e+18,Agent,Lion,20
4,6,1.598866e+18,1.609743e+18,Agent,Potato,20


In [45]:
partners.created_at_len.unique()

array([20, 19, 17, 18])

In [46]:
partners.id.unique()

array([  2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
        15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
       106, 107, 108, 109, 111, 112, 113, 114, 115, 116, 117, 118, 119,
       120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
       133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145,
       146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
       159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171,
       172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 18

In [47]:
referrals.id.unique()

array([   1,    2,    3, ..., 1468, 1469, 1470])

In [48]:
referrals.partner_id.unique()

array([  4,   7,   8,   6,   2,  10,   9,  13,  14,  15,  17,  18,  19,
        20,  25,  26,  27,  28,  29,  30,  31,  32,  33,  21,  34,  35,
        22,  36,  38,  39,  42,  23,  41,  43,  16,  47,  48,  49,  50,
        52,  51,  12,  54,  44,  11,  55,  57,  58,  24,  60,  62,  63,
        64,  65,  72,  82,  83, 100,  66, 128, 129, 136,  69, 108,  67,
       147, 157, 137,   3, 117, 131, 169, 191,  81, 201, 149, 198, 155,
       114, 253, 184, 290, 161,  97, 302, 213, 334, 278, 247, 105, 142,
       356, 242, 360, 227, 140, 366,  40, 337, 252,   5,  80,  91, 116,
       386, 376, 330, 255, 458, 519])