# Lesson #6

## Optimization in Pandas

In [85]:
data_list = list(range(10000))
data_list[:5]

[0, 1, 2, 3, 4]

In [86]:
%%time

if 5000 in data_list:
    print('Found')

Found
CPU times: total: 0 ns
Wall time: 0 ns


In [87]:
%%time

for _ in range(10 ** 5):
    if 5000 in data_list:
        pass

CPU times: total: 4.53 s
Wall time: 4.55 s


In [88]:
set([3, 1, 2])

{1, 2, 3}

In [89]:
data_set = set(range(10000))

In [90]:
%%time

for _ in range(10 ** 5):
    if 5000 in data_set:
        pass

CPU times: total: 0 ns
Wall time: 6.98 ms


In [91]:
import pandas as pd

In [92]:
table = pd.read_csv('data/test/links.csv')
table.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [93]:
%%time

for _ in range(10 ** 4):
    table[(table.movieId=='1') & (table.imdbId==113041)]

CPU times: total: 3.34 s
Wall time: 3.32 s


In [94]:
# Another approach with list comprehension
with open('data/test/links.csv') as f:
    next(f)
    table = [x.strip().split(',') for x in f.readlines()]

In [95]:
table[:2]

[['1', '0114709', '862'], ['2', '0113497', '8844']]

In [96]:
%%time

#                movieId,  imdbId,  tmdbId
line_to_search = ['3952','0208874', '6521']

for _ in range(10 ** 4):
    [x for x in table if x[:3] == line_to_search]

CPU times: total: 11.8 s
Wall time: 11.8 s


In [97]:
# Can we do faster ?
tuple_to_search = ('3952', '6521')

In [98]:
dict_ = {}

for line in table:
    key = tuple(line[:1] + line[2:])
    value = line[1]
    
    dict_[key] = value

In [99]:
dict_

{('1', '862'): '0114709',
 ('2', '8844'): '0113497',
 ('3', '15602'): '0113228',
 ('4', '31357'): '0114885',
 ('5', '11862'): '0113041',
 ('6', '949'): '0113277',
 ('7', '11860'): '0114319',
 ('8', '45325'): '0112302',
 ('9', '9091'): '0114576',
 ('10', '710'): '0113189',
 ('11', '9087'): '0112346',
 ('12', '12110'): '0112896',
 ('13', '21032'): '0112453',
 ('14', '10858'): '0113987',
 ('15', '1408'): '0112760',
 ('16', '524'): '0112641',
 ('17', '4584'): '0114388',
 ('18', '5'): '0113101',
 ('19', '9273'): '0112281',
 ('20', '11517'): '0113845',
 ('21', '8012'): '0113161',
 ('22', '1710'): '0112722',
 ('23', '9691'): '0112401',
 ('24', '12665'): '0114168',
 ('25', '451'): '0113627',
 ('26', '16420'): '0114057',
 ('27', '9263'): '0114011',
 ('28', '17015'): '0114117',
 ('29', '902'): '0112682',
 ('30', '37557'): '0115012',
 ('31', '9909'): '0112792',
 ('32', '63'): '0114746',
 ('34', '9598'): '0112431',
 ('35', '47018'): '0112637',
 ('36', '687'): '0112818',
 ('37', '139405'): '0112286

In [100]:
%%time

for _ in range(10 ** 4):
    dict_[tuple_to_search]

CPU times: total: 0 ns
Wall time: 2 ms


# Part 2

- pivot tables
- filters & calculations with `loc` method
- filter empty values via `isnull`
- time in pandas
- row methods
- word forms

In [101]:
import pandas as pd

In [102]:
ratings = pd.read_csv('data/test/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [103]:
# Find users who always put high rating and those who always put low ratings

# Firstly, find all rating put by user #1 
user_1_ratings = ratings[(ratings['userId'] == 1)]
user_1_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [104]:
print(len(user_1_ratings[(user_1_ratings['rating'] == 1)]))
print(len(user_1_ratings[(user_1_ratings['rating'] == 2)]))
print(len(user_1_ratings[(user_1_ratings['rating'] == 3)]))
print(len(user_1_ratings[(user_1_ratings['rating'] == 4)]))
print(len(user_1_ratings[(user_1_ratings['rating'] == 5)]))

2
7
4
3
0


In [105]:
# Or
ratings[(ratings.userId == 1)].rating.value_counts()

2.0    7
3.0    4
2.5    3
4.0    3
1.0    2
3.5    1
Name: rating, dtype: int64

Pivot tables

In [106]:
ratings_pivot = ratings.pivot_table(index='userId', columns='rating', values='timestamp', aggfunc='count', fill_value=0, margins=True, margins_name='Ratings count')
ratings_pivot.head()

rating,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,Ratings count
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,2,0,7,3,4,1,3,0,0,20
2,0,2,0,4,0,36,0,23,0,11,76
3,0,0,0,1,3,18,9,11,4,5,51
4,0,5,0,5,0,23,0,52,0,119,204
5,0,0,1,0,3,3,27,42,19,5,100


In [107]:
ratings_pivot[5.0].sort_values(ascending=False).head()

userId
Ratings count    15095
564                408
232                243
242                219
547                214
Name: 5.0, dtype: int64

In [108]:
ratings_pivot.sort_values(by=5.0, ascending=False).head()


rating,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,Ratings count
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Ratings count,1101,3326,1687,7271,4449,20064,10538,28750,7723,15095,100004
564,0,152,0,187,0,414,0,707,0,408,1868
232,0,35,0,32,0,96,0,276,0,243,682
242,0,0,0,4,0,25,0,151,0,219,399
547,53,79,58,204,177,411,378,591,226,214,2391


Get most visited pages

In [109]:
log = pd.read_csv('data/test/visit_log.csv', sep=';')
log.head()

Unnamed: 0,timestamp,visit_id,url,region,user_id,traffic_source
0,1549980692,e3b0c44298,https://host.ru/3c19b4ef7371864fa3,Russia,b1613cc09f,yandex
1,1549980704,6e340b9cff,https://host.ru/c8d9213a31839f9a3a,Russia,4c3ec14bee,direct
2,1549980715,96a296d224,https://host.ru/b8b58337d272ee7b15,Russia,a8c40697fb,yandex
3,1549980725,709e80c884,https://host.ru/b8b58337d272ee7b15,Russia,521ac1d6a0,yandex
4,1549980736,df3f619804,https://host.ru/b8b58337d272ee7b15,Russia,d7323c571c,yandex


In [110]:
# Count unique pages and get most visited pages
print(log.url.nunique())

log['url_path'] = log.url.apply(lambda url: url.replace('https://host.ru/', ''))
log.url_path.value_counts().head()

38


3c19b4ef7371864fa3    541
108ce4b365afb7b88e    534
23c64743ad842b9311    532
1afc5618562365020a    529
2fcb223266880ccf57    526
Name: url_path, dtype: int64

Filters and calculations with `loc` & `iloc`

In [111]:
# method `loc` allows to select rows & columns with conditions
# `:` means that all values are selected
log.loc[:, ['user_id', 'region']].head()

Unnamed: 0,user_id,region
0,b1613cc09f,Russia
1,4c3ec14bee,Russia
2,a8c40697fb,Russia
3,521ac1d6a0,Russia
4,d7323c571c,Russia


In [112]:
# pretty much the same but using indexes
log.iloc[:, [2, 4]].head()

Unnamed: 0,url,user_id
0,https://host.ru/3c19b4ef7371864fa3,b1613cc09f
1,https://host.ru/c8d9213a31839f9a3a,4c3ec14bee
2,https://host.ru/b8b58337d272ee7b15,a8c40697fb
3,https://host.ru/b8b58337d272ee7b15,521ac1d6a0
4,https://host.ru/b8b58337d272ee7b15,d7323c571c


In [113]:
# example using filters (conditions)
log.loc[log.region == 'Russia'].head()

Unnamed: 0,timestamp,visit_id,url,region,user_id,traffic_source,url_path
0,1549980692,e3b0c44298,https://host.ru/3c19b4ef7371864fa3,Russia,b1613cc09f,yandex,3c19b4ef7371864fa3
1,1549980704,6e340b9cff,https://host.ru/c8d9213a31839f9a3a,Russia,4c3ec14bee,direct,c8d9213a31839f9a3a
2,1549980715,96a296d224,https://host.ru/b8b58337d272ee7b15,Russia,a8c40697fb,yandex,b8b58337d272ee7b15
3,1549980725,709e80c884,https://host.ru/b8b58337d272ee7b15,Russia,521ac1d6a0,yandex,b8b58337d272ee7b15
4,1549980736,df3f619804,https://host.ru/b8b58337d272ee7b15,Russia,d7323c571c,yandex,b8b58337d272ee7b15


In [114]:
# example of setting new row according to given condtion
log.loc[log.region == 'Russia', 'VAT'] = 1.2
log.head(9)

Unnamed: 0,timestamp,visit_id,url,region,user_id,traffic_source,url_path,VAT
0,1549980692,e3b0c44298,https://host.ru/3c19b4ef7371864fa3,Russia,b1613cc09f,yandex,3c19b4ef7371864fa3,1.2
1,1549980704,6e340b9cff,https://host.ru/c8d9213a31839f9a3a,Russia,4c3ec14bee,direct,c8d9213a31839f9a3a,1.2
2,1549980715,96a296d224,https://host.ru/b8b58337d272ee7b15,Russia,a8c40697fb,yandex,b8b58337d272ee7b15,1.2
3,1549980725,709e80c884,https://host.ru/b8b58337d272ee7b15,Russia,521ac1d6a0,yandex,b8b58337d272ee7b15,1.2
4,1549980736,df3f619804,https://host.ru/b8b58337d272ee7b15,Russia,d7323c571c,yandex,b8b58337d272ee7b15,1.2
5,1549980742,8855508aad,https://host.ru/df646c3676cc259fa0,Russia,fc43898e47,yandex,df646c3676cc259fa0,1.2
6,1549980742,b0f66adc83,https://host.ru/b8b58337d272ee7b15,Russia,13fc55e781,paid,b8b58337d272ee7b15,1.2
7,1549980754,837885c8f8,https://host.ru/108ce4b365afb7b88e,Russia,cb5082b6f6,direct,108ce4b365afb7b88e,1.2
8,1549980760,af5570f5a1,https://host.ru/3004a8273caeef2867,China,45664f7af2,direct,3004a8273caeef2867,


In [115]:
# example using lambda functions
log.loc[lambda row: row.region == 'Russia'].head(9)

Unnamed: 0,timestamp,visit_id,url,region,user_id,traffic_source,url_path,VAT
0,1549980692,e3b0c44298,https://host.ru/3c19b4ef7371864fa3,Russia,b1613cc09f,yandex,3c19b4ef7371864fa3,1.2
1,1549980704,6e340b9cff,https://host.ru/c8d9213a31839f9a3a,Russia,4c3ec14bee,direct,c8d9213a31839f9a3a,1.2
2,1549980715,96a296d224,https://host.ru/b8b58337d272ee7b15,Russia,a8c40697fb,yandex,b8b58337d272ee7b15,1.2
3,1549980725,709e80c884,https://host.ru/b8b58337d272ee7b15,Russia,521ac1d6a0,yandex,b8b58337d272ee7b15,1.2
4,1549980736,df3f619804,https://host.ru/b8b58337d272ee7b15,Russia,d7323c571c,yandex,b8b58337d272ee7b15,1.2
5,1549980742,8855508aad,https://host.ru/df646c3676cc259fa0,Russia,fc43898e47,yandex,df646c3676cc259fa0,1.2
6,1549980742,b0f66adc83,https://host.ru/b8b58337d272ee7b15,Russia,13fc55e781,paid,b8b58337d272ee7b15,1.2
7,1549980754,837885c8f8,https://host.ru/108ce4b365afb7b88e,Russia,cb5082b6f6,direct,108ce4b365afb7b88e,1.2
9,1549980765,3e7077fd2f,https://host.ru/df646c3676cc259fa0,Russia,6f9de8c8b6,email,df646c3676cc259fa0,1.2


`loc` method efficiency

In [116]:
movies = pd.read_csv('data/movielens/movies.csv')
ratings = pd.read_csv('data/movielens/ratings.csv')
# joined = movies.merge(ratings, how='left', on='movieId')
joined = ratings.merge(movies, how='left', on='movieId')
joined.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [117]:
# calculate average rating for specific genre ~ Adventure
joined['adventure'] = joined.apply(lambda row: row.rating if 'Adventure' in row.genres else None, axis=1)
joined.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,adventure
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,


In [118]:
joined.adventure.mean()

3.5086089151939075

In [119]:
# the same way but using loc
joined['adventure'] = joined.loc[joined.genres.str.contains('Adventure'), 'rating']
joined.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,adventure
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,


In [120]:
# test both approaches for multiple genres
genres = ['Adventure', 'Animation', 'Children', 'Drama', 'Musical', 'Thriller']

In [121]:
%%time
joined = ratings.merge(movies, how='left', on='movieId')

for genre in genres:
    joined[genre] = joined.apply(lambda row: row.rating if genre in row.genres else None, axis=1)
joined.rating.mean()

CPU times: total: 7.72 s
Wall time: 7.75 s


3.501556983616962

In [122]:
%%time
joined = ratings.merge(movies, how='left', on='movieId')

for genre in genres:
    joined[genre] = joined.loc[joined.genres.str.contains(genre), 'rating']
joined.rating.mean()

CPU times: total: 234 ms
Wall time: 235 ms


3.501556983616962

In [123]:
%%time
joined = ratings.merge(movies, how='left', on='movieId')

filtered = joined[joined.genres.isin(genres)]
filtered.rating.mean()

CPU times: total: 31.2 ms
Wall time: 36.9 ms


3.660826210826211

Exercise

Create a new column 'traffic_type', in which for 'yandex' & 'google' traffic sources will be put the value 'organic', othrewise - NaN.

In [124]:
log = pd.read_csv('data/test/visit_log.csv', sep=';')
log.head()

Unnamed: 0,timestamp,visit_id,url,region,user_id,traffic_source
0,1549980692,e3b0c44298,https://host.ru/3c19b4ef7371864fa3,Russia,b1613cc09f,yandex
1,1549980704,6e340b9cff,https://host.ru/c8d9213a31839f9a3a,Russia,4c3ec14bee,direct
2,1549980715,96a296d224,https://host.ru/b8b58337d272ee7b15,Russia,a8c40697fb,yandex
3,1549980725,709e80c884,https://host.ru/b8b58337d272ee7b15,Russia,521ac1d6a0,yandex
4,1549980736,df3f619804,https://host.ru/b8b58337d272ee7b15,Russia,d7323c571c,yandex


In [125]:
applicable_sources = ('yandex', 'google')

log.loc[log.traffic_source.isin(applicable_sources), 'traffic_type'] = 'organic'
log.loc[pd.isnull(log.traffic_type), 'traffic_type'] = 'other'
log.head()

Unnamed: 0,timestamp,visit_id,url,region,user_id,traffic_source,traffic_type
0,1549980692,e3b0c44298,https://host.ru/3c19b4ef7371864fa3,Russia,b1613cc09f,yandex,organic
1,1549980704,6e340b9cff,https://host.ru/c8d9213a31839f9a3a,Russia,4c3ec14bee,direct,other
2,1549980715,96a296d224,https://host.ru/b8b58337d272ee7b15,Russia,a8c40697fb,yandex,organic
3,1549980725,709e80c884,https://host.ru/b8b58337d272ee7b15,Russia,521ac1d6a0,yandex,organic
4,1549980736,df3f619804,https://host.ru/b8b58337d272ee7b15,Russia,d7323c571c,yandex,organic


Defining empty row with `isnull` & `isna`

In [126]:
import numpy as np

In [127]:
df = pd.DataFrame({'value': [123, None, np.nan, np.NaN, np.NAN, 456]})
df

Unnamed: 0,value
0,123.0
1,
2,
3,
4,
5,456.0


In [128]:
# get emtpy values
df.loc[pd.isnull(df.value), :]

Unnamed: 0,value
1,
2,
3,
4,


In [129]:
df.fillna(0)

Unnamed: 0,value
0,123.0
1,0.0
2,0.0
3,0.0
4,0.0
5,456.0


Date and time in pandas

In [130]:
log['date'] = pd.to_datetime(log['timestamp'], unit='s')
log.head()

Unnamed: 0,timestamp,visit_id,url,region,user_id,traffic_source,traffic_type,date
0,1549980692,e3b0c44298,https://host.ru/3c19b4ef7371864fa3,Russia,b1613cc09f,yandex,organic,2019-02-12 14:11:32
1,1549980704,6e340b9cff,https://host.ru/c8d9213a31839f9a3a,Russia,4c3ec14bee,direct,other,2019-02-12 14:11:44
2,1549980715,96a296d224,https://host.ru/b8b58337d272ee7b15,Russia,a8c40697fb,yandex,organic,2019-02-12 14:11:55
3,1549980725,709e80c884,https://host.ru/b8b58337d272ee7b15,Russia,521ac1d6a0,yandex,organic,2019-02-12 14:12:05
4,1549980736,df3f619804,https://host.ru/b8b58337d272ee7b15,Russia,d7323c571c,yandex,organic,2019-02-12 14:12:16


In [131]:
log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18938 entries, 0 to 18937
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   timestamp       18938 non-null  int64         
 1   visit_id        18938 non-null  object        
 2   url             18938 non-null  object        
 3   region          18938 non-null  object        
 4   user_id         18938 non-null  object        
 5   traffic_source  18938 non-null  object        
 6   traffic_type    18938 non-null  object        
 7   date            18938 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(6)
memory usage: 1.2+ MB


In [137]:
log['hour'] = log.date.dt.hour
log['weekday'] = log.date.dt.weekday
log.head()

Unnamed: 0,timestamp,visit_id,url,region,user_id,traffic_source,traffic_type,date,hour,weekday,strftime
0,1549980692,e3b0c44298,https://host.ru/3c19b4ef7371864fa3,Russia,b1613cc09f,yandex,organic,2019-02-12 14:11:32,14,1,<bound method PandasDelegate._add_delegate_acc...
1,1549980704,6e340b9cff,https://host.ru/c8d9213a31839f9a3a,Russia,4c3ec14bee,direct,other,2019-02-12 14:11:44,14,1,<bound method PandasDelegate._add_delegate_acc...
2,1549980715,96a296d224,https://host.ru/b8b58337d272ee7b15,Russia,a8c40697fb,yandex,organic,2019-02-12 14:11:55,14,1,<bound method PandasDelegate._add_delegate_acc...
3,1549980725,709e80c884,https://host.ru/b8b58337d272ee7b15,Russia,521ac1d6a0,yandex,organic,2019-02-12 14:12:05,14,1,<bound method PandasDelegate._add_delegate_acc...
4,1549980736,df3f619804,https://host.ru/b8b58337d272ee7b15,Russia,d7323c571c,yandex,organic,2019-02-12 14:12:16,14,1,<bound method PandasDelegate._add_delegate_acc...


String methods

In [139]:
stats = pd.read_csv('data/test/keywords.csv')
stats.head()

Unnamed: 0,keyword,shows
0,вк,64292779
1,одноклассники,63810309
2,порно,41747114
3,ютуб,39995567
4,вконтакте,21014195


In [145]:
import re
stats[stats.keyword.str.contains('VK', flags=re.IGNORECASE)].head()

Unnamed: 0,keyword,shows
22,vk,4330448
45,vk com,2469370
1198,vkontakte ru вход,159454
2363,vk моя страница,89944
2411,m vk com,88672


[Docs](https://www.geeksforgeeks.org/python-pandas-series-str-contains/)

Syntax: Series.str.contains(pat, case=True, flags=0, na=nan, regex=True)

Parameter :
- pat : Character sequence or regular expression.
- case : If True, case sensitive.
- flags : Flags to pass through to the re module, e.g. re.IGNORECASE.
- na : Fill value for missing values.
- regex : If True, assumes the pat is a regular expression.

In [153]:
serial = stats[stats.keyword.str.contains('сериалы')]
serial.keyword.str.replace('сереал', 'сериал').head()

246                              сериалы
304                          сериалы тут
555    турецкие сериалы на русском языке
881                      русские сериалы
890                       сериалы онлайн
Name: keyword, dtype: object

In [154]:
serial.keyword.str.upper().head()

246                              СЕРИАЛЫ
304                          СЕРИАЛЫ ТУТ
555    ТУРЕЦКИЕ СЕРИАЛЫ НА РУССКОМ ЯЗЫКЕ
881                      РУССКИЕ СЕРИАЛЫ
890                       СЕРИАЛЫ ОНЛАЙН
Name: keyword, dtype: object

In [155]:
serial.keyword.str.lower().head()

246                              сериалы
304                          сериалы тут
555    турецкие сериалы на русском языке
881                      русские сериалы
890                       сериалы онлайн
Name: keyword, dtype: object

In [156]:
# We cannot use this keyword because we can accidentally get 'рубленая котлетка
stats[stats.keyword.str.contains('рубл')].head()

Unnamed: 0,keyword,shows
1202,алиэкспресс на русском в рублях официальный сайт,161553
1602,курс гривны к рублю,125076
1736,али экспресс русская версия на русском в рубля...,117260
2132,доллары в рубли,114173
2172,100 долларов в рублях,97534


In [157]:
from pymystem3 import Mystem

In [168]:
search = 'курс гривны к рублю рубли рублях однушка однушки'

In [169]:
m = Mystem()
lemmas = m.lemmatize(search)
lemmas

['курс',
 ' ',
 'гривна',
 ' ',
 'к',
 ' ',
 'рубль',
 ' ',
 'рубль',
 ' ',
 'рубль',
 ' ',
 'однушка',
 ' ',
 'однушка',
 '\n']

In [170]:
' '.join(lemmas).strip()

'курс   гривна   к   рубль   рубль   рубль   однушка   однушка'

For other languages use `pymorphy` or `nltk`