In [9]:
# Report which scikit-learn release is installed, so the results below can be
# tied to a concrete library version.
import sklearn
print(sklearn.__version__)

1.3.0


In [32]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Small example table with six records; np.nan marks the entries that are
# missing and get filled in by the imputation cells further down.
data = dict(
    size=['XL', 'L', 'M', np.nan, 'M', 'M'],
    color=['red', 'green', 'blue', 'green', 'red', 'green'],
    gender=['female', 'male', np.nan, 'female', 'female', 'male'],
    price=[199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    weight=[500, 450, 300, np.nan, 410, np.nan],
    bought=['yes', 'no', 'yes', 'no', 'yes', 'no'],
)
df = pd.DataFrame(data=data)


In [None]:

# Share of missing values per column (null count divided by row count),
# rounded to two decimals.
print((df.isnull().sum() / df.shape[0]).round(2))

In [None]:
# Fill the gaps in 'weight' with the mean of the observed weights.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df['weight'] = imputer.fit_transform(df[['weight']])
# statistics_ is an array with one fill value per fitted column. Only one column
# was fitted, so pick that single entry explicitly: calling float() on the whole
# array relies on the array-to-scalar conversion that NumPy has deprecated.
print(float(imputer.statistics_[0]))

In [None]:
# Fill the gaps in 'price' with a fixed placeholder value (99).
imputer_const = SimpleImputer(
    strategy='constant',
    fill_value=99,
    missing_values=np.nan,
)
df['price'] = imputer_const.fit_transform(df[['price']])
print(df)

In [40]:
# Column means over the rows whose weight was actually observed.
# The frame is rebuilt from the raw `data` dict on purpose: df['weight'] has
# already been mean-imputed by an earlier cell, so filtering `df` on null
# weights would keep every row and the averages would include imputed values.
temp = pd.DataFrame(data)
temp = temp[temp['weight'].notnull()]
print(temp.mean(numeric_only=True))

price     122.333333
weight    415.000000
dtype: float64


In [None]:
# Fill the gaps in the categorical 'size' column with its most frequent value.
imputer_size = SimpleImputer(
    strategy='most_frequent',
    missing_values=np.nan,
)
df[['size']] = imputer_size.fit_transform(df[['size']])
print(df)

In [51]:
# Keep only the object-dtype (text) columns and mark every remaining gap with
# the literal label 'empty'.
# NOTE(review): when the notebook is run top to bottom, 'size' has already been
# filled by the most-frequent imputer, so the result depends on which earlier
# cells were executed -- confirm the intended input frame.
df_object = df.select_dtypes(include='object')
df_object = df_object.fillna('empty')
print(df_object)

Unnamed: 0,size,color,gender,bought
0,XL,red,female,yes
1,L,green,male,no
2,M,blue,empty,yes
3,empty,green,female,no
4,M,red,female,yes
5,M,green,male,no


In [54]:
# Discretise the weights into three intervals of equal width; pandas derives
# the interval edges from the observed minimum and maximum.
df = pd.DataFrame({'weight': [75., 78.5, 85., 91., 84.5, 83., 68.]})
df = df.assign(weight_cut=pd.cut(df['weight'], bins=3))
print(df)   # three equal-width intervals

   weight        weight_cut
0    75.0  (67.977, 75.667]
1    78.5  (75.667, 83.333]
2    85.0    (83.333, 91.0]
3    91.0    (83.333, 91.0]
4    84.5    (83.333, 91.0]
5    83.0  (75.667, 83.333]
6    68.0  (67.977, 75.667]


In [57]:
# Discretise the weights with hand-picked edges and attach a readable label to
# each interval. Intervals are closed on the right, so 75.0 falls into 'light'.
df = pd.DataFrame(data={'weight': [75., 78.5, 85., 91., 84.5, 83., 68.]})
df['weight_cut'] = pd.cut(
    df['weight'],
    bins=[60, 75, 80, 95],
    labels=['light', 'normal', 'heavy'],
)
print(df)   # explicit edges (60, 75], (75, 80], (80, 95] -- the bins are NOT equal-width

   weight weight_cut
0    75.0      light
1    78.5     normal
2    85.0      heavy
3    91.0      heavy
4    84.5      heavy
5    83.0      heavy
6    68.0      light


In [63]:
# One-hot encode the categorical column(s) of df; numeric columns pass through
# unchanged.
df = pd.get_dummies(data=df)
print(df)

   weight  weight_cut_light  weight_cut_normal  weight_cut_heavy
0    75.0              True              False             False
1    78.5             False               True             False
2    85.0             False              False              True
3    91.0             False              False              True
4    84.5             False              False              True
5    83.0             False              False              True
6    68.0              True              False             False


In [74]:
# Each row holds a list of currency codes; the last row is deliberately empty.
data_dict = dict(
    currency=[
        ['PLN', 'USD'],
        ['EUR', 'USD', 'PLN', 'CAD'],
        ['GBP'],
        ['JPY', 'CZK', 'HUF'],
        [],
    ],
)
df = pd.DataFrame(data_dict)



In [None]:
# Count how many currency codes each row's list contains.
df['number'] = [len(codes) for codes in df['currency']]
print(df)

In [81]:
# Flag rows whose currency list contains 'PLN' (1 = present, 0 = absent).
df['PLN_flag'] = df['currency'].apply(lambda codes: int('PLN' in codes))
df

Unnamed: 0,currency,PLN_flag
0,"[PLN, USD]",1
1,"[EUR, USD, PLN, CAD]",1
2,[GBP],0
3,"[JPY, CZK, HUF]",0
4,[],0


In [89]:

# Split each '#a#b#c' string into one column per hashtag.
df = pd.DataFrame(
    data={
        'hashtags': [
            '#good#vibes',
            '#hot#summer#holiday',
            '#street#food',
            '#workout',
        ]
    }
)
# Every string starts with '#', so the first piece of the split (column 0) is
# always an empty string and is dropped.
df = df['hashtags'].str.split('#', expand=True).drop(columns=[0])
# Name the columns after their split position instead of hard-coding three
# names, so rows with more or fewer hashtags do not cause a length mismatch.
df.columns = [f'hashtag{position}' for position in df.columns]
df

Unnamed: 0,hashtag1,hashtag2,hashtag3
0,good,vibes,
1,hot,summer,holiday
2,street,food,
3,workout,,


In [91]:
# Per-row count of empty cells, stored in a new 'missing' column.
df['missing'] = df.isnull().sum(axis='columns')
df

Unnamed: 0,hashtag1,hashtag2,hashtag3,missing
0,good,vibes,,
1,hot,summer,holiday,
2,street,food,,
3,workout,,,


In [95]:


# Amounts arrive as text with '_' as a thousands separator; strip the
# separators and convert the column to integers.
df = pd.DataFrame(
    {
        'investments': [
            '100_000_000',
            '100_000',
            '30_000_000',
            '100_500_000',
        ],
    }
)
df['investments'] = df['investments'].str.replace('_', '').astype(int)
df

Unnamed: 0,investments
0,100000000
1,100000
2,30000000
3,100500000
