In [1]:
import pandas as pd
# Load the scientists dataset into a DataFrame.
# NOTE(review): this is an absolute local path, so the notebook only runs on a
# machine where this exact file exists.
scientists = pd.read_csv(filepath_or_buffer='C:/Data/scientists.csv')

In [2]:
# The dates in the Born column were read from the CSV as plain strings, which
# pandas reports as the generic `object` dtype.
print(scientists.dtypes['Born'])

object


In [3]:
# Died is stored as strings too; both columns need converting to a proper date type.
print(scientists.dtypes['Died'])

object


In [4]:
# Parse the Born strings (year-month-day) into a datetime Series.
# The source column is not modified; the parsed values live in a new Series.
bornDateTime = pd.to_datetime(
    scientists['Born'],
    format='%Y-%m-%d',
)
print(bornDateTime)

0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]


In [5]:
# Parse the Died strings (year-month-day) into a datetime Series, same as for Born.
diedDatetime = pd.to_datetime(
    scientists['Died'],
    format='%Y-%m-%d',
)
print(diedDatetime)

0   1958-04-16
1   1937-10-16
2   1910-08-13
3   1934-07-04
4   1964-04-14
5   1858-06-16
6   1954-06-07
7   1855-02-23
Name: Died, dtype: datetime64[ns]


In [6]:
# Creating new columns in a DataFrame: assigning to a column label that does
# not exist yet appends that column.
scientists['BornDatetime'] = bornDateTime
scientists['DiedDatetime'] = diedDatetime
print(scientists.head(2))

                Name        Born        Died  Age    Occupation BornDatetime  \
0  Rosaline Franklin  1920-07-25  1958-04-16   37       Chemist   1920-07-25   
1     William Gosset  1876-06-13  1937-10-16   61  Statistician   1876-06-13   

  DiedDatetime  
0   1958-04-16  
1   1937-10-16  


In [7]:
# (rows, columns) of the frame after the two datetime columns were added.
frame_shape = scientists.shape
print(frame_shape)

(8, 7)


In [8]:
# Confirm the new column holds real datetimes rather than strings.
print(scientists.dtypes['BornDatetime'])

datetime64[ns]


In [9]:
# Same check for the parsed Died column.
print(scientists.dtypes['DiedDatetime'])

datetime64[ns]


In [10]:
# Changing columns: show the Age values as loaded, before they are modified.
print(scientists.loc[:, 'Age'])

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [11]:
# Shuffle the values of the Age column reproducibly.
# random.shuffle swaps elements in place, so it is applied to a plain list copy
# of the column rather than to scientists['Age'] itself: mutating the Series
# returned by column selection raises SettingWithCopyWarning and is not
# guaranteed to write through to the DataFrame (it never does under
# Copy-on-Write). The seed and swap sequence are unchanged, so the resulting
# order is the same as shuffling the Series directly.
import random
random.seed(42)
shuffled_ages = scientists['Age'].tolist()
random.shuffle(shuffled_ages)
scientists['Age'] = shuffled_ages

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  x[i], x[j] = x[j], x[i]


In [12]:
# Compare with the previously printed Age values: same numbers, new order.
print(scientists.loc[:, 'Age'])

0    66
1    56
2    41
3    77
4    90
5    45
6    37
7    61
Name: Age, dtype: int64


In [13]:
# Series.sample returns a NEW, reordered Series; it does not shuffle in place.
# The index is reset before assigning back because column assignment aligns on
# index labels: without the reset every value would land in its original row.
age_count = len(scientists['Age'])
resampled_ages = scientists['Age'].sample(age_count, random_state=24)
scientists['Age'] = resampled_ages.reset_index(drop=True)
print(scientists['Age'])

0    61
1    45
2    37
3    90
4    56
5    66
6    77
7    41
Name: Age, dtype: int64


In [14]:
# Derive a lifespan column from the two datetime columns created earlier.
# Subtracting one datetime Series from another yields timedelta values.
scientists['Age2'] = scientists['DiedDatetime'].sub(scientists['BornDatetime'])

In [15]:
# Inspect the freshly computed lifespans.
print(scientists.loc[:, 'Age2'])

0   13779 days
1   22404 days
2   32964 days
3   24345 days
4   20777 days
5   16529 days
6   15324 days
7   28422 days
Name: Age2, dtype: timedelta64[ns]


In [16]:
# Convert the lifespan to whole years.
# astype('timedelta64[Y]') is rejected by pandas >= 2.0 (only the s/ms/us/ns
# resolutions are supported), so the day count is floor-divided by the mean
# Gregorian year length instead, which is the year length numpy's 'Y' unit uses.
# The lifespan is recomputed from the datetime columns so this cell can be
# re-run safely: it no longer depends on Age2 still holding timedeltas.
DAYS_PER_YEAR = 365.2425
lifespan = scientists['DiedDatetime'] - scientists['BornDatetime']
scientists['Age2'] = lifespan.dt.days // DAYS_PER_YEAR
print(scientists['Age2']) # In years

0    37.0
1    61.0
2    90.0
3    66.0
4    56.0
5    45.0
6    41.0
7    77.0
Name: Age2, dtype: float64


In [17]:
# List every column label currently in the frame, including the derived ones.
column_labels = scientists.columns
print(column_labels)

Index(['Name', 'Born', 'Died', 'Age', 'Occupation', 'BornDatetime',
       'DiedDatetime', 'Age2'],
      dtype='object')


In [18]:
# Dropping columns: without inplace=True, drop() returns a new DataFrame and
# leaves `scientists` itself untouched.
scientistsDroppedColumns = scientists.drop(columns=['Age'])

# Confirm that 'Age' is no longer among the column labels.
print(scientistsDroppedColumns.columns)

Index(['Name', 'Born', 'Died', 'Occupation', 'BornDatetime', 'DiedDatetime',
       'Age2'],
      dtype='object')
