In [2]:
import numpy as np

feature = np.array([[-500.5],
                   [-100.1],
                   [10],
                   [99.9],
                   [800.8]])

In [3]:
from sklearn import preprocessing

scale = preprocessing.MinMaxScaler(feature_range=(0,1))

s_feature = scale.fit_transform(feature)

s_feature

array([[0.        ],
       [0.30769231],
       [0.39230001],
       [0.46138477],
       [1.        ]])

In [4]:
# mean = 0, std deviation = 1

scale_std = preprocessing.StandardScaler()

std_feature = scale_std.fit_transform(feature)

std_feature

array([[-1.33132083],
       [-0.38369077],
       [-0.12311617],
       [ 0.08965092],
       [ 1.74847687]])

In [5]:
std_feature.mean(), std_feature.std()

(4.4408920985006264e-17, 1.0)

In [25]:
# Discretization => discrete buckets or bins

age = np.array([[30],
               [29],
               [50],  
               [70],  
               [30],  
               [15],  
               [100]])

In [18]:
from sklearn.preprocessing import Binarizer

binary = Binarizer(18)

binary.fit_transform(age)

array([[1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1]])

In [27]:
np.digitize(age, bins=[30,58,85], right=True) # bin 0 (<30), bin 1(30 to <58), bin 2 (58 to <85), bin 3 (>85)

array([[0],
       [0],
       [1],
       [2],
       [0],
       [0],
       [3]])

In [14]:
np.digitize(age, bins=[18])

array([[1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1]])

In [31]:
nom = np.array([['Blue'],
               ['Red'],
               ['Yellow'],
               ['Red'],
               ['Blue']])

In [32]:
from sklearn.preprocessing import LabelBinarizer

label = LabelBinarizer()

label.fit_transform(nom)

array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0]])

In [33]:
two = [("Red", "Blue"),
      ("Green", "Red"),
      ("Yellow", "Blue"),
      ("Green", "Grey")]

In [34]:
from sklearn.preprocessing import MultiLabelBinarizer

multi = MultiLabelBinarizer()

multi.fit_transform(two)

array([[1, 0, 0, 1, 0],
       [0, 1, 0, 1, 0],
       [1, 0, 0, 0, 1],
       [0, 1, 1, 0, 0]])

In [35]:
multi.classes_

array(['Blue', 'Green', 'Grey', 'Red', 'Yellow'], dtype=object)

In [38]:
import pandas as pd

df = pd.DataFrame({"Feel":["SAgree", "Agree", "Agree", "DAgree", "NoIdea", "SAgree", "Agree"]})

mapper = {"SAgree":1,
         "Agree": 2,
         "NoIdea":3,
         "DAgree":4}

df["Feel"].replace(mapper)

0    1
1    2
2    2
3    4
4    3
5    1
6    2
Name: Feel, dtype: int64

In [40]:
df1 = pd.DataFrame({"Feel":["SAgree", "Agree", "Agree", "DAgree", "NoIdea", "SAgree", "Agree", "Agreeing a bit"]})

mapper1 = {"SAgree":1,
         "Agree": 2,
         "Agreeing a bit": 2.1,
         "NoIdea":3,
         "DAgree":4}

df1["Feel"].replace(mapper1)

0    1.0
1    2.0
2    2.0
3    4.0
4    3.0
5    1.0
6    2.0
7    2.1
Name: Feel, dtype: float64

In [1]:
data = [{"Blue": 4, "Red": 2},
       {"Blue": 3, "Red": 4},
       {"Red": 2, "Yellow": 4},
       {"Blue": 3, "Yellow": 3}]

from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)
f = dv.fit_transform(data)

f

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 2., 4.],
       [3., 0., 3.]])

In [2]:
dv.get_feature_names()

['Blue', 'Red', 'Yellow']

In [4]:
import pandas as pd

df = pd.DataFrame(f, columns=dv.get_feature_names())
df

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,2.0,4.0
3,3.0,0.0,3.0


In [5]:
import numpy as np

dt_strings = np.array(['10-08-2018 12:30 PM',
                      '11-08-2018 2:40 AM',
                      '08-08-2018 10:10 PM'])

In [6]:
import pandas as pd

[pd.to_datetime(date, format='%m-%d-%Y %I:%M %p') for date in dt_strings] #%S

[Timestamp('2018-10-08 12:30:00'),
 Timestamp('2018-11-08 02:40:00'),
 Timestamp('2018-08-08 22:10:00')]

In [11]:
dt_london = pd.Timestamp('2018-10-08 12:30:00', tz="Europe/London")

In [16]:
from pytz import all_timezones

all_timezones

['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara',
 'Africa/Asmera',
 'Africa/Bamako',
 'Africa/Bangui',
 'Africa/Banjul',
 'Africa/Bissau',
 'Africa/Blantyre',
 'Africa/Brazzaville',
 'Africa/Bujumbura',
 'Africa/Cairo',
 'Africa/Casablanca',
 'Africa/Ceuta',
 'Africa/Conakry',
 'Africa/Dakar',
 'Africa/Dar_es_Salaam',
 'Africa/Djibouti',
 'Africa/Douala',
 'Africa/El_Aaiun',
 'Africa/Freetown',
 'Africa/Gaborone',
 'Africa/Harare',
 'Africa/Johannesburg',
 'Africa/Juba',
 'Africa/Kampala',
 'Africa/Khartoum',
 'Africa/Kigali',
 'Africa/Kinshasa',
 'Africa/Lagos',
 'Africa/Libreville',
 'Africa/Lome',
 'Africa/Luanda',
 'Africa/Lubumbashi',
 'Africa/Lusaka',
 'Africa/Malabo',
 'Africa/Maputo',
 'Africa/Maseru',
 'Africa/Mbabane',
 'Africa/Mogadishu',
 'Africa/Monrovia',
 'Africa/Nairobi',
 'Africa/Ndjamena',
 'Africa/Niamey',
 'Africa/Nouakchott',
 'Africa/Ouagadougou',
 'Africa/Porto-Novo',
 'Africa/Sao_Tome',
 'Africa/Timbuktu',
 'Africa/

In [14]:
dt_inpol = dt_london.tz_convert('America/Indianapolis')
dt_inpol

Timestamp('2018-10-08 07:30:00-0400', tz='America/Indianapolis')

In [18]:
dt_calcutta = dt_inpol.tz_convert('Asia/Calcutta')
dt_calcutta

Timestamp('2018-10-08 17:00:00+0530', tz='Asia/Calcutta')

In [20]:
df_dates = pd.DataFrame()

df_dates['date'] = pd.date_range('1/1/2017', periods=1000, freq='H') #M, W, D

df_dates

Unnamed: 0,date
0,2017-01-01 00:00:00
1,2017-01-01 01:00:00
2,2017-01-01 02:00:00
3,2017-01-01 03:00:00
4,2017-01-01 04:00:00
5,2017-01-01 05:00:00
6,2017-01-01 06:00:00
7,2017-01-01 07:00:00
8,2017-01-01 08:00:00
9,2017-01-01 09:00:00


In [21]:
df_dates[(df_dates['date'] > '2017-02-11 11:00:00')]

Unnamed: 0,date
996,2017-02-11 12:00:00
997,2017-02-11 13:00:00
998,2017-02-11 14:00:00
999,2017-02-11 15:00:00


In [28]:
df_dates = df_dates.set_index(df_dates['date'])

df_dates.loc['2017-02-11 14:00:00':'2017-02-11 15:00:00']

Unnamed: 0_level_0,date
date,Unnamed: 1_level_1
2017-02-11 14:00:00,2017-02-11 14:00:00
2017-02-11 15:00:00,2017-02-11 15:00:00


In [29]:
df_dates.head(3)

Unnamed: 0_level_0,date
date,Unnamed: 1_level_1
2017-01-01 00:00:00,2017-01-01 00:00:00
2017-01-01 01:00:00,2017-01-01 01:00:00
2017-01-01 02:00:00,2017-01-01 02:00:00


In [33]:
df_dates['year'] = df_dates['date'].dt.year
df_dates['month'] = df_dates['date'].dt.month #day, hour, minute, weekday, weekday_name...

df_dates

Unnamed: 0_level_0,date,year,month
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01 00:00:00,2017-01-01 00:00:00,2017,1
2017-01-01 01:00:00,2017-01-01 01:00:00,2017,1
2017-01-01 02:00:00,2017-01-01 02:00:00,2017,1
2017-01-01 03:00:00,2017-01-01 03:00:00,2017,1
2017-01-01 04:00:00,2017-01-01 04:00:00,2017,1
2017-01-01 05:00:00,2017-01-01 05:00:00,2017,1
2017-01-01 06:00:00,2017-01-01 06:00:00,2017,1
2017-01-01 07:00:00,2017-01-01 07:00:00,2017,1
2017-01-01 08:00:00,2017-01-01 08:00:00,2017,1
2017-01-01 09:00:00,2017-01-01 09:00:00,2017,1


In [36]:
df_many = pd.DataFrame()

df_many['First'] = [pd.Timestamp('01-10-2018'), pd.Timestamp('04-01-2018')]
df_many['Second'] = [pd.Timestamp('01-11-2018'), pd.Timestamp('05-03-2018')]

df_many

Unnamed: 0,First,Second
0,2018-01-10,2018-01-11
1,2018-04-01,2018-05-03


In [37]:
df_many['Second'] - df_many['First']

0    1 days
1   32 days
dtype: timedelta64[ns]

In [38]:
df_dates = pd.date_range("01/01/2017", periods=7, freq="M")

df_dates

DatetimeIndex(['2017-01-31', '2017-02-28', '2017-03-31', '2017-04-30',
               '2017-05-31', '2017-06-30', '2017-07-31'],
              dtype='datetime64[ns]', freq='M')

In [39]:
df = pd.DataFrame(index=df_dates)

In [40]:
df

2017-01-31
2017-02-28
2017-03-31
2017-04-30
2017-05-31
2017-06-30
2017-07-31


In [41]:
df["height"] = [23,23.1,23.4,23.5,23.5,23.6,23.7]

df

Unnamed: 0,height
2017-01-31,23.0
2017-02-28,23.1
2017-03-31,23.4
2017-04-30,23.5
2017-05-31,23.5
2017-06-30,23.6
2017-07-31,23.7


In [42]:
df.rolling(window=2).mean()

Unnamed: 0,height
2017-01-31,
2017-02-28,23.05
2017-03-31,23.25
2017-04-30,23.45
2017-05-31,23.5
2017-06-30,23.55
2017-07-31,23.65


In [43]:
df.rolling(window=3).mean()

Unnamed: 0,height
2017-01-31,
2017-02-28,
2017-03-31,23.166667
2017-04-30,23.333333
2017-05-31,23.466667
2017-06-30,23.533333
2017-07-31,23.6


In [44]:
time_index_for_calculation = pd.date_range("01/01/2017", periods=5, freq="W")

df_weeks = pd.DataFrame(index=time_index_for_calculation)

df_weeks

2017-01-01
2017-01-08
2017-01-15
2017-01-22
2017-01-29


In [45]:
df_weeks["Price"] = [24.5, 25.6,np.nan, np.nan, 29.5]

df_weeks

Unnamed: 0,Price
2017-01-01,24.5
2017-01-08,25.6
2017-01-15,
2017-01-22,
2017-01-29,29.5


In [47]:
df_interpolation = df_weeks.interpolate()
df_interpolation

Unnamed: 0,Price
2017-01-01,24.5
2017-01-08,25.6
2017-01-15,26.9
2017-01-22,28.2
2017-01-29,29.5


In [48]:
df_weeks

Unnamed: 0,Price
2017-01-01,24.5
2017-01-08,25.6
2017-01-15,
2017-01-22,
2017-01-29,29.5


In [49]:
df_backfilling = df_weeks.bfill()
df_backfilling

Unnamed: 0,Price
2017-01-01,24.5
2017-01-08,25.6
2017-01-15,29.5
2017-01-22,29.5
2017-01-29,29.5


In [50]:
df_forwardfilling = df_weeks.ffill()
df_forwardfilling

Unnamed: 0,Price
2017-01-01,24.5
2017-01-08,25.6
2017-01-15,25.6
2017-01-22,25.6
2017-01-29,29.5


In [52]:
df_weeks.interpolate?