## Pandas Data Types

In [1]:
import pandas as pd

In [2]:
# Load the weather data; no dtypes are specified, so pandas infers them from the file contents.
weather = pd.read_csv('weather.csv')

In [4]:
# Preview the frame with head() rather than dumping the whole DataFrame,
# so the output stays short even if the file grows.
weather.head()

Unnamed: 0,date,temperature_high,temperature_low,rained,snowed,overcast,comments
0,2021-01-01,4.0,1,1,False,cloudy,happy new year
1,2021-01-02,11.0,2,0,False,sunny,second day
2,2021-01-03,3.0,2,0,False,foggy,third day
3,2021-01-04,6.0,2,0,False,sunny,first business day
4,2021-01-05,4.0,unknown,0,False,cloudy,second business day


In [6]:
# Summarise the frame: column names, non-null counts, inferred dtypes and memory use.
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              5 non-null      object 
 1   temperature_high  5 non-null      float64
 2   temperature_low   5 non-null      object 
 3   rained            5 non-null      int64  
 4   snowed            5 non-null      bool   
 5   overcast          5 non-null      object 
 6   comments          5 non-null      object 
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 377.0+ bytes


In [7]:
# dtypes lists just the dtype pandas inferred for each column.
weather.dtypes

date                 object
temperature_high    float64
temperature_low      object
rained                int64
snowed                 bool
overcast             object
comments             object
dtype: object

### There is an issue with the date column: it is listed as an object and not as a date/time field. We will convert it with `pd.to_datetime` further below.

#### The temp high is a float and should be an integer (the code below uses `int8`).

####  The temp low needs to be an int as well, but it's an object

#### due to the 'unknown' value in the 5th record.

#### There is an issue with the rained column: it holds 0/1 integers but appears to have been intended as a bool column.

#### The overcast column should be a category column, not just an object column.

#### Lastly, the comments column should be a string type.

In [9]:
# Cast the high temperature from float64 to int8. The values shown above are whole
# numbers well inside int8's -128..127 range, so nothing is lost by this cast.
weather['temperature_high'] = weather['temperature_high'].astype('int8')

In [11]:
# Check the result: temperature_high should now be reported as int8.
weather.dtypes

date                object
temperature_high      int8
temperature_low     object
rained               int64
snowed                bool
overcast            object
comments            object
dtype: object

In [13]:
# rained holds 0/1 integer flags, so casting to bool maps 1 -> True and 0 -> False.
weather['rained'] = weather['rained'].astype('bool')

In [15]:
# Check the result: rained should now be reported as bool.
weather.dtypes

date                object
temperature_high      int8
temperature_low     object
rained                bool
snowed                bool
overcast            object
comments            object
dtype: object

In [16]:
# Several columns can be converted in one call by passing a column -> dtype mapping.
weather = weather.astype({
    'overcast': 'category',
    'comments': 'string',
})

In [18]:
# Check the result: overcast should now be category and comments a string dtype.
weather.dtypes

date                        object
temperature_high              int8
temperature_low             object
rained                        bool
snowed                        bool
overcast                  category
comments            string[python]
dtype: object

In [19]:
# A direct integer cast does not work here: the 5th row holds the text 'unknown',
# which cannot be parsed as an integer. The error is caught and printed so that the
# failure is still demonstrated without aborting a Restart & Run All of the notebook.
# See the pd.to_numeric cell that follows for the working approach.
try:
    weather['temperature_low'] = weather['temperature_low'].astype('int8')
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: invalid literal for int() with base 10: 'unknown'

In [20]:
# pd.to_numeric parses the column as numbers. With errors='coerce', any value that
# cannot be parsed (here 'unknown') becomes NaN instead of raising an error.
# Because NaN is a float, the resulting column is float64 rather than an integer dtype.
weather['temperature_low']=pd.to_numeric(weather['temperature_low'],errors='coerce')

In [22]:
# Inspect the converted column: the 'unknown' entry in the last row is now NaN
# and the dtype is float64.
weather['temperature_low']

0    1.0
1    2.0
2    2.0
3    2.0
4    NaN
Name: temperature_low, dtype: float64

In [23]:
# Alternative that also converts the column:
#   weather['date'] = weather['date'].astype('datetime64[ns]')
# pd.to_datetime, the dedicated date parser, is used here instead.
weather['date'] = pd.to_datetime(weather['date'])

In [24]:
# Inspect the converted column; its dtype should now be datetime64[ns].
weather['date']

0   2021-01-01
1   2021-01-02
2   2021-01-03
3   2021-01-04
4   2021-01-05
Name: date, dtype: datetime64[ns]

In [25]:
# Final overview of the weather frame after all dtype conversions.
# temperature_low has one null: the coerced 'unknown' entry.
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              5 non-null      datetime64[ns]
 1   temperature_high  5 non-null      int8          
 2   temperature_low   4 non-null      float64       
 3   rained            5 non-null      bool          
 4   snowed            5 non-null      bool          
 5   overcast          5 non-null      category      
 6   comments          5 non-null      string        
dtypes: bool(2), category(1), datetime64[ns](1), float64(1), int8(1), string(1)
memory usage: 404.0 bytes


## Done with the weather.csv dataset, so now working on the heroes data again.

In [44]:
# Load the raw hero tables: powers from the CSV, and one frame per publisher
# sheet of the Excel workbook. No dtypes are requested, so pandas infers them.
hero_powers = pd.read_csv('superhero_powers.csv')
hero_dc, hero_marvel = (
    pd.read_excel('superhero_info.xlsx', sheet_name=publisher_sheet)
    for publisher_sheet in ('DC Comics', 'Marvel Comics')
)

In [28]:
# Overview of the powers table; with this many columns, info() prints a compact
# summary (column range and dtype counts) rather than one line per column.
hero_powers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667 entries, 0 to 666
Columns: 168 entries, hero_names to Omniscient
dtypes: bool(167), object(1)
memory usage: 114.1+ KB


In [29]:
# Preview only the first rows instead of displaying the whole table.
hero_powers.head()

Unnamed: 0,hero_names,Agility,Accelerated Healing,Lantern Power Ring,Dimensional Awareness,Cold Resistance,Durability,Stealth,Energy Absorption,Flight,...,Web Creation,Reality Warping,Odin Force,Symbiote Costume,Speed Force,Phoenix Force,Molecular Dissipation,Vision - Cryo,Omnipresent,Omniscient
0,3-D Man,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A-Bomb,False,True,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Abe Sapien,True,True,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Abin Sur,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Abomination,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [30]:
# Cast the hero name column (the only object column) to pandas' dedicated string dtype.
hero_powers['hero_names'] = hero_powers['hero_names'].astype('string')

In [31]:
# Inspect the converted column; its dtype should now be string.
hero_powers['hero_names']

0              3-D Man
1               A-Bomb
2           Abe Sapien
3             Abin Sur
4          Abomination
            ...       
662    Yellowjacket II
663               Ymir
664               Yoda
665            Zatanna
666               Zoom
Name: hero_names, Length: 667, dtype: string

In [32]:
# dtypes can also be set while loading, which avoids a separate astype step afterwards.
hero_powers = pd.read_csv(
    'superhero_powers.csv',
    dtype={'hero_names': 'string'},
)

In [34]:
# Confirm hero_names was read as a string dtype directly at load time.
hero_powers.dtypes

hero_names               string[python]
Agility                            bool
Accelerated Healing                bool
Lantern Power Ring                 bool
Dimensional Awareness              bool
                              ...      
Phoenix Force                      bool
Molecular Dissipation              bool
Vision - Cryo                      bool
Omnipresent                        bool
Omniscient                         bool
Length: 168, dtype: object

In [45]:
# dtypes inferred by read_excel for the DC Comics sheet (text columns come in as object).
hero_dc.dtypes

name           object
Gender         object
Eye color      object
Race           object
Hair color     object
Height          int64
Publisher      object
Alignment      object
Weight        float64
dtype: object

In [46]:
# dtypes inferred by read_excel for the Marvel Comics sheet (text columns come in as object).
hero_marvel.dtypes

name          object
Gender        object
Eye color     object
Race          object
Hair color    object
Height         int64
Publisher     object
Alignment     object
Weight         int64
dtype: object

In [47]:
# Column -> dtype mapping applied when the hero sheets are read from Excel:
# free-text columns become 'string', low-cardinality labels become 'category'.
# NOTE: the name `hero_dype` (sic) is kept as-is because the loading cell refers to it.
hero_dype = {
    **dict.fromkeys(['name', 'Eye color', 'Race', 'Hair color', 'Publisher'], 'string'),
    **dict.fromkeys(['Gender', 'Alignment'], 'category'),
}

In [48]:
# Reload both publisher sheets, this time applying the hero_dype mapping while reading.
hero_dc, hero_marvel = (
    pd.read_excel('superhero_info.xlsx', dtype=hero_dype, sheet_name=publisher_sheet)
    for publisher_sheet in ('DC Comics', 'Marvel Comics')
)

In [49]:
# Confirm the requested dtypes were applied to the DC Comics sheet.
# NOTE(review): Weight is float64 here but int64 in the Marvel sheet — TODO confirm
# whether that difference is intended (the dtype mapping does not cover Weight).
hero_dc.dtypes

name          string[python]
Gender              category
Eye color     string[python]
Race          string[python]
Hair color    string[python]
Height                 int64
Publisher     string[python]
Alignment           category
Weight               float64
dtype: object

In [50]:
# Confirm the requested dtypes were applied to the Marvel Comics sheet.
hero_marvel.dtypes

name          string[python]
Gender              category
Eye color     string[python]
Race          string[python]
Hair color    string[python]
Height                 int64
Publisher     string[python]
Alignment           category
Weight                 int64
dtype: object