In [None]:
import numpy as np
import pandas as pd
import sklearn

In [None]:
# Download the price history for Amazon (ticker AMZN) from Stooq.

def fetch_initial_data(company='AMZN'):
  """Fetch the price history for a ticker from the 'stooq' data source.

  Args:
    company: Ticker symbol passed to ``DataReader`` as ``name``;
      defaults to ``'AMZN'``.

  Returns:
    Whatever ``pandas_datareader.DataReader`` returns for that ticker.
  """
  # Imported locally, so the package is only required once the function
  # is actually called.
  from pandas_datareader import DataReader
  return DataReader(name=company, data_source='stooq')

# Fetch the AMZN data and preview the first five rows.
df_raw = fetch_initial_data(company='AMZN')
df_raw.head(5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-11-01,133.96,137.35,133.71,137.0,61529409
2023-10-31,132.75,133.57,131.71,133.09,51589380
2023-10-30,129.72,133.0,128.56,132.71,72485542
2023-10-27,126.2,130.02,125.52,127.74,125309313
2023-10-26,120.63,121.6393,118.35,119.57,100419516


In [None]:
# Keep only the first 5 rows of the raw data as an independent copy.
# Slicing first and copying afterwards means the full history is never
# duplicated, and `df` owns its data before new columns are added to it.
df = df_raw.iloc[:5].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5 entries, 2023-11-01 to 2023-10-26
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    5 non-null      float64
 1   High    5 non-null      float64
 2   Low     5 non-null      float64
 3   Close   5 non-null      float64
 4   Volume  5 non-null      int64  
dtypes: float64(4), int64(1)
memory usage: 240.0 bytes


In [None]:
# Derive calendar features (day, month, year) from the DatetimeIndex.
df = df.assign(
    day=df.index.day,
    month=df.index.month,
    year=df.index.year,
)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,day,month,year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-11-01,133.96,137.35,133.71,137.0,61529409,1,11,2023
2023-10-31,132.75,133.57,131.71,133.09,51589380,31,10,2023
2023-10-30,129.72,133.0,128.56,132.71,72485542,30,10,2023
2023-10-27,126.2,130.02,125.52,127.74,125309313,27,10,2023
2023-10-26,120.63,121.6393,118.35,119.57,100419516,26,10,2023


In [None]:
# Start a fresh example: a single-column frame of height values.
df = pd.DataFrame({
    'height': [175.0, 178.5, 185.0, 191.0, 184.5, 183.0, 168.0],
})
df

Unnamed: 0,height
0,175.0
1,178.5
2,185.0
3,191.0
4,184.5
5,183.0
6,168.0


In [None]:
# Bin the heights into 3 equal-width intervals.
df['height_cat'] = pd.cut(df['height'], bins=3)
df

Unnamed: 0,height,height_cat
0,175.0,"(167.977, 175.667]"
1,178.5,"(175.667, 183.333]"
2,185.0,"(183.333, 191.0]"
3,191.0,"(183.333, 191.0]"
4,184.5,"(183.333, 191.0]"
5,183.0,"(175.667, 183.333]"
6,168.0,"(167.977, 175.667]"


In [None]:
# Same binning, but with bin edges chosen by hand instead of equal widths.
df['height_cat'] = pd.cut(df['height'], bins=[160, 175, 180, 195])
df

Unnamed: 0,height,height_cat
0,175.0,"(160, 175]"
1,178.5,"(175, 180]"
2,185.0,"(180, 195]"
3,191.0,"(180, 195]"
4,184.5,"(180, 195]"
5,183.0,"(180, 195]"
6,168.0,"(160, 175]"


In [None]:
# Use a readable text label for each bin instead of the interval notation.
df['height_cat'] = pd.cut(
    df['height'],
    bins=[160, 175, 180, 195],
    labels=['small', 'medium', 'high'],
)
df

Unnamed: 0,height,height_cat
0,175.0,small
1,178.5,medium
2,185.0,high
3,191.0,high
4,184.5,high
5,183.0,high
6,168.0,small


In [None]:
# One-hot encode the height category so it can be fed to a model.
# - columns=['height_cat'] names the encoded column explicitly.
# - drop_first=True drops the first level ('small') as the baseline.
# - dtype=int keeps 0/1 indicator columns on pandas >= 2.0, where the
#   default indicator dtype became bool.
pd.get_dummies(
    df,
    columns=['height_cat'],
    prefix='height',
    drop_first=True,
    dtype=int,
)

Unnamed: 0,height,height_medium,height_high
0,175.0,0,0
1,178.5,1,0
2,185.0,0,1
3,191.0,0,1
4,184.5,0,1
5,183.0,0,1
6,168.0,0,0


In [None]:
# New example frame: each row holds a list of language codes.
df = pd.DataFrame({
    'lang': [
        ['PL', 'ENG'],
        ['PL', 'ENG', 'RUS'],
        ['RUS'],
    ],
})
df

Unnamed: 0,lang
0,"[PL, ENG]"
1,"[PL, ENG, RUS]"
2,[RUS]


In [None]:
# Add a column with the number of entries in each row's language list.
df['lang_number'] = df['lang'].map(len)
df

Unnamed: 0,lang,lang_number
0,"[PL, ENG]",2
1,"[PL, ENG, RUS]",3
2,[RUS],1


In [None]:
# Binary flag: 1 when 'PL' appears in the row's language list, otherwise 0.
df['PL_Lang'] = df['lang'].map(lambda langs: int('PL' in langs))
df

Unnamed: 0,lang,lang_number,PL_Lang
0,"[PL, ENG]",2,1
1,"[PL, ENG, RUS]",3,1
2,[RUS],1,0
