In [34]:
import numpy as np
import pandas as pd
import re

In [35]:
# Location of the raw survey export, relative to this notebook.
RAW_CSV_PATH = '../datasets/raw/CoffeeAndCode_2018.csv'

# Load the raw data and preview the first rows.
df = pd.read_csv(RAW_CSV_PATH)
df.head()

Unnamed: 0,CodingHours,CoffeeCupsPerDay,CoffeeTime,CodingWithoutCoffee,CoffeeType,CoffeeSolveBugs,Gender,Country,AgeRange
0,8,2,Before coding,Yes,Caffè latte,Sometimes,Female,Lebanon,18 to 29
1,3,2,Before coding,Yes,Americano,Yes,Female,Lebanon,30 to 39
2,5,3,While coding,No,Nescafe,Yes,Female,Lebanon,18 to 29
3,8,2,Before coding,No,Nescafe,Yes,Male,Lebanon,
4,10,3,While coding,Sometimes,Turkish,No,Male,Lebanon,18 to 29


In [36]:
def convert_camel_case_to_snake_case(name):
    """Convert a CamelCase / mixedCase identifier to lower snake_case.

    Args:
        name: The identifier to convert (e.g. ``'CoffeeCupsPerDay'``).

    Returns:
        The lower-cased identifier with underscores inserted at word
        boundaries (e.g. ``'coffee_cups_per_day'``).
    """
    # Pass 1: put an underscore before any capitalised word (an upper-case
    # letter followed by lower-case letters) that is preceded by a character.
    with_word_breaks = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    # Pass 2: split the remaining lower-case/digit -> upper-case transitions
    # that pass 1 skipped because its matches cannot overlap.
    fully_split = re.sub('([a-z0-9])([A-Z])', r'\1_\2', with_word_breaks)
    return fully_split.lower()

# Rewrite every column label in snake_case, keeping the original column order.
df.columns = list(map(convert_camel_case_to_snake_case, df.columns))
# The helper is only needed once; remove it from the notebook namespace.
del convert_camel_case_to_snake_case

In [37]:
# Preview the first rows again to confirm the column labels are now snake_case.
df.head()

Unnamed: 0,coding_hours,coffee_cups_per_day,coffee_time,coding_without_coffee,coffee_type,coffee_solve_bugs,gender,country,age_range
0,8,2,Before coding,Yes,Caffè latte,Sometimes,Female,Lebanon,18 to 29
1,3,2,Before coding,Yes,Americano,Yes,Female,Lebanon,30 to 39
2,5,3,While coding,No,Nescafe,Yes,Female,Lebanon,18 to 29
3,8,2,Before coding,No,Nescafe,Yes,Male,Lebanon,
4,10,3,While coding,Sometimes,Turkish,No,Male,Lebanon,18 to 29


In [38]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(100, 9)

In [39]:
# Total memory footprint in bytes before any dtype optimisation.
# deep=True makes pandas inspect object columns instead of estimating them.
# Kept so the final cell can report the size after optimisation as a percentage of this.
memory_before = df.memory_usage(deep=True).sum()

In [40]:
# Column dtypes, non-null counts and deep memory usage before optimisation.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   coding_hours           100 non-null    int64 
 1   coffee_cups_per_day    100 non-null    int64 
 2   coffee_time            100 non-null    object
 3   coding_without_coffee  100 non-null    object
 4   coffee_type            99 non-null     object
 5   coffee_solve_bugs      100 non-null    object
 6   gender                 100 non-null    object
 7   country                100 non-null    object
 8   age_range              98 non-null     object
dtypes: int64(2), object(7)
memory usage: 46.0 KB


In [41]:
# Impute the columns that have missing entries with their most frequent
# value (mode()[0] is the first mode when several values tie).
most_common_fill = {
    column: df[column].mode()[0]
    for column in ('coffee_type', 'age_range')
}
df = df.fillna(most_common_fill)

In [42]:
# Report how many distinct values each column holds, one block per column,
# with a row of asterisks separating the blocks.
separator = '*' * 50
for column_name in df.columns:
    unique_count = len(df[column_name].unique())
    print(f'Coluna: {column_name}\nValores únicos: {unique_count}', end=f"\n{separator}\n")

Coluna: coding_hours
Valores únicos: 10
**************************************************
Coluna: coffee_cups_per_day
Valores únicos: 8
**************************************************
Coluna: coffee_time
Valores únicos: 7
**************************************************
Coluna: coding_without_coffee
Valores únicos: 3
**************************************************
Coluna: coffee_type
Valores únicos: 8
**************************************************
Coluna: coffee_solve_bugs
Valores únicos: 3
**************************************************
Coluna: gender
Valores únicos: 2
**************************************************
Coluna: country
Valores únicos: 1
**************************************************
Coluna: age_range
Valores únicos: 5
**************************************************


In [43]:
# Summary statistics; with mixed dtypes describe() reports only the numeric columns by default.
df.describe()

Unnamed: 0,coding_hours,coffee_cups_per_day
count,100.0,100.0
mean,6.41,2.89
std,2.644205,1.613673
min,1.0,1.0
25%,4.0,2.0
50%,7.0,2.5
75%,8.0,4.0
max,10.0,8.0


In [44]:
# Summarise the non-numeric columns (count / unique / top / freq).
# Excluding np.number rather than the literal 'int64' keeps this cell correct
# when it is re-run after the integer columns have been downcast (e.g. to
# int8); otherwise they would no longer be excluded from this summary.
df.describe(exclude=np.number)

Unnamed: 0,coffee_time,coding_without_coffee,coffee_type,coffee_solve_bugs,gender,country,age_range
count,100,100,100,100,100,100,100
unique,7,3,8,3,2,1,5
top,While coding,Sometimes,Nescafe,Sometimes,Male,Lebanon,18 to 29
freq,61,51,33,43,74,100,62


In [45]:
# Shrink the integer columns to the smallest signed integer dtype that can
# hold every value. For the current data this is int8, but unlike a forced
# astype('int8') it cannot silently wrap values outside [-128, 127] if the
# input file ever changes.
for count_column in ('coding_hours', 'coffee_cups_per_day'):
    df[count_column] = pd.to_numeric(df[count_column], downcast='integer')

In [46]:
# Show the distinct values of each text column to judge which ones are
# suitable for the category dtype.
text_columns = [
    'coffee_time',
    'coding_without_coffee',
    'coffee_type',
    'coffee_solve_bugs',
    'gender',
    'country',
    'age_range',
]
for column_name in text_columns:
    print(df[column_name].unique())

['Before coding' 'While coding' 'Before and while coding' 'In the morning'
 'All the time' 'After coding' 'No specific time']
['Yes' 'No' 'Sometimes']
['Caffè latte' 'Americano' 'Nescafe' 'Turkish' 'American Coffee'
 'Espresso (Short Black)' 'Cappuccino' 'Double Espresso (Doppio)']
['Sometimes' 'Yes' 'No']
['Female' 'Male']
['Lebanon']
['18 to 29' '30 to 39' '40 to 49' 'Under 18' '50 to 59']


In [47]:
# Store the low-cardinality text columns as categoricals and drop 'country'.
categorical_columns = [
    'coffee_time',
    'coding_without_coffee',
    'coffee_type',
    'coffee_solve_bugs',
    'gender',
]
df = (
    df
    .astype(dict.fromkeys(categorical_columns, 'category'))
    .drop(columns='country')
)

In [48]:
def rename_age_range(age_range):
    """Map a survey age-range label to its compact ``'low ~ high'`` form.

    Args:
        age_range: The original label, e.g. ``'18 to 29'`` or ``'Under 18'``.

    Returns:
        The compact label for the five known ranges; any other value is
        returned unchanged.
    """
    compact_labels = {
        'Under 18': '0 ~ 18',
        '18 to 29': '18 ~ 29',
        '30 to 39': '30 ~ 39',
        '40 to 49': '40 ~ 49',
        '50 to 59': '50 ~ 59',
    }
    # Fall back to the input itself when the label is not one of the known ranges.
    return compact_labels.get(age_range, age_range)

# Relabel every age range, then store the column as a categorical.
df['age_range'] = (
    df['age_range']
    .apply(rename_age_range)
    .astype('category')
)
# The helper is only needed once; remove it from the notebook namespace.
del rename_age_range

In [49]:
# Total memory footprint in bytes after the dtype conversions, measured the
# same way as memory_before so the two can be compared.
memory_after = df.memory_usage(deep=True).sum()

In [50]:
# Column dtypes, non-null counts and deep memory usage after optimisation.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   coding_hours           100 non-null    int8    
 1   coffee_cups_per_day    100 non-null    int8    
 2   coffee_time            100 non-null    category
 3   coding_without_coffee  100 non-null    category
 4   coffee_type            100 non-null    category
 5   coffee_solve_bugs      100 non-null    category
 6   gender                 100 non-null    category
 7   age_range              100 non-null    category
dtypes: category(6), int8(2)
memory usage: 3.8 KB


In [51]:
# Size of the optimised frame as a percentage of the original footprint.
remaining_pct = memory_after / memory_before * 100
print(f'{remaining_pct: .2f}% da memória original')

 8.36% da memória original


In [52]:
# Persist the processed frame as a pickle; unlike CSV, this keeps the
# int8/category dtypes set in this notebook.
# NOTE(review): presumably '../datasets/processed/' must already exist — confirm.
# Pickle files should only ever be loaded from trusted sources.
df.to_pickle('../datasets/processed/CoffeeAndCode_2018.pkl')