## How do I change the data type of a pandas Series?

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Each dataset is loaded right after its source URL is defined.
# The path variables stay at module level so that later cells can re-read the data.

# Alcohol consumption per country (comma-separated).
drinks_path = 'http://bit.ly/drinksbycountry'
drinks = pd.read_csv(drinks_path)

# Kaggle Titanic training set (comma-separated).
train_path = 'http://bit.ly/kaggletrain'
train = pd.read_csv(train_path)

# Chipotle orders (read with read_table, i.e. tab-separated).
orders_path = 'http://bit.ly/chiporders'
orders = pd.read_table(orders_path)

In [4]:
# Preview the first three rows of the drinks data.
drinks.head(n=3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa


In [5]:
# List the dtype pandas assigned to every column of drinks.
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

### How to select columns of a specific data type within a DataFrame

In [6]:
# Read the drinks data again from drinks_path, then list each column's dtype.
# NOTE(review): presumably re-read so this section starts from the raw data - confirm.
drinks = pd.read_csv(drinks_path)
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [7]:
#IMPORTANT: PANDAS
#read #numeric #non-numeric #select #pandas #numpy

# Keep only the numeric columns (np.number matches the int and float dtypes)
# and list their dtypes. numpy is imported in the notebook's import cell
# rather than here, so that all dependencies are visible in one place.
drinks.select_dtypes(include=[np.number]).dtypes

beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
dtype: object

### How to change number formats with astype (or with dtype while reading a CSV)

In [8]:
#IMPORTANT: PANDAS
#astype #change #prepare #datatype #type #pandas

# Cast the integer beer_servings column to float and store it back in drinks.
drinks['beer_servings'] = drinks['beer_servings'].astype('float')

In [9]:
#IMPORTANT: PANDAS
#dtype #change #read #prepare #datatype #type #pandas

# Ask read_csv to parse beer_servings as float straight away,
# so no separate conversion is needed after loading.
drinks = pd.read_csv(
    drinks_path,
    dtype={'beer_servings': 'float'},
)
drinks.dtypes

country                          object
beer_servings                   float64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

### How to change the data type of more than one column at once

In [10]:
#IMPORTANT: PANDAS
#astype #change #read #prepare #datatype #type #pandas

# astype accepts a {column: dtype} mapping, so several columns can be
# converted in a single call; both serving counts become float here.
drinks = drinks.astype(dict.fromkeys(['beer_servings', 'spirit_servings'], 'float'))
drinks.dtypes

country                          object
beer_servings                   float64
spirit_servings                 float64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

### How to handle special formats like $

In [11]:
# Read the orders data again, reusing the orders_path defined with the other
# data sources instead of repeating the URL literal.
orders = pd.read_table(orders_path)

In [12]:
# item_price values carry a leading '$', which is why the column
# cannot be used as a number without cleaning it first.
orders.head(n=2)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,$2.39
1,1,1,Izze,[Clementine],$3.39


In [13]:
# List the dtype of every column of orders (item_price shows up as object, not a number).
orders.dtypes

order_id               int64
quantity               int64
item_name             object
choice_description    object
item_price            object
dtype: object

In [14]:
#IMPORTANT: PANDAS
#str.replace #str #change #prepare #datatype #type #pandas

# Strip the currency sign, then cast the remaining text to float.
# regex=False makes '$' a literal character: as a regular expression it is
# the end-of-string anchor, and the default of `regex` differs between
# pandas versions, so it is spelled out explicitly.
orders.item_price.str.replace('$', '', regex=False).astype(float).head(4)

0    2.39
1    3.39
2    3.39
3    2.39
Name: item_price, dtype: float64

In [15]:
#IMPORTANT: PANDAS
#str #contains #str.contains #change #ML #read #prepare #datatype #type #pandas

# Turn "name contains 'Chicken'" into a 0/1 indicator for machine learning.
# na=False counts a missing item_name as "no match", so the boolean result
# never holds NaN, which astype(int) could not convert.
orders.item_name.str.contains('Chicken', na=False).astype(int).head()

0    0
1    0
2    0
3    0
4    1
Name: item_name, dtype: int64

### How to convert strings to numbers with astype and pd.to_numeric

In [16]:
# Small example frame whose numbers are stored as text;
# col_three also contains a '-' placeholder instead of a number.
df = pd.DataFrame(
    {
        'col_one': ['1.1', '2.2', '3.3'],
        'col_two': ['4.4', '5.5', '6.6'],
        'col_three': ['7.7', '8.8', '-'],
    }
)
df.dtypes

col_one      object
col_two      object
col_three    object
dtype: object

In [17]:
# Cast the two purely numeric text columns to float; col_three is left out
# here because its '-' entry is not a valid float.
df.astype({column: 'float' for column in ('col_one', 'col_two')}).dtypes

col_one      float64
col_two      float64
col_three     object
dtype: object

In [18]:
#IMPORTANT: PANDAS
#to_numeric #change #datatype #type #prepare #bestpractice #pandas

# Convert col_three to numbers; errors='coerce' turns any value that
# cannot be parsed (the '-') into NaN instead of raising.
pd.to_numeric(df['col_three'], errors='coerce')

0    7.7
1    8.8
2    NaN
Name: col_three, dtype: float64

In [19]:
#IMPORTANT: PANDAS
#to_numeric #change #datatype #type #prepare #bestpractice #pandas

# Same conversion as before, with the NaN produced by 'coerce'
# replaced by 0 afterwards.
pd.to_numeric(df['col_three'], errors='coerce').fillna(value=0)

0    7.7
1    8.8
2    0.0
Name: col_three, dtype: float64

In [20]:
#IMPORTANT: PANDAS
#to_numeric #change #datatype #type #prepare #bestpractice #pandas

# Run to_numeric over every column of the frame, coercing unparseable
# values to NaN, then replace those NaN with 0.
df = (
    df
    .apply(lambda column: pd.to_numeric(column, errors='coerce'))
    .fillna(0)
)
df.dtypes

col_one      float64
col_two      float64
col_three    float64
dtype: object

### How to convert continuous data into categorical data with pd.cut

In [21]:
# Preview the first ten values of the continuous Age column.
train['Age'].head(n=10)

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [22]:
#IMPORTANT: PANDAS
#cut #change #datatype #ML #continoustocategorical #bins #type #prepare #bestpractice #pandas

# Bucket the ages into labelled groups. With pd.cut's default settings the
# intervals are closed on the right:
# (0, 18] -> child, (18, 25] -> young adult, (25, 99] -> adult.
pd.cut(
    x=train['Age'],
    bins=[0, 18, 25, 99],
    labels=['child', 'young adult', 'adult'],
).head(10)

0    young adult
1          adult
2          adult
3          adult
4          adult
5            NaN
6          adult
7          child
8          adult
9          child
Name: Age, dtype: category
Categories (3, object): ['child' < 'young adult' < 'adult']