In [1]:
#В заданном файле меняет каждое значение столбце с датой на
# количество дней между start_data и датой в столбце
# для первых nrows строчек
# дата задаётся в формате data_form
#Может сохранить nrows строчек начального файла в файл 
# с названием old_nrows_file. 
#total_rows используется только для расчёта времения работы функции

def replace_col_with_data_to_indices(
        oldfname, newfname,  
        col_names, goal_col_name, 
        data_form = '%d.%m.%Y',
        start_data = '01.01.2013', 
        chunksize_indexing = 10000, 
        encoding = 'utf_8', 
        skiprows = 1, header = None, 
        nrows = None, sep = ',', 
        print_info = True,
        total_rows = 2935849,
        old_nrows_file = None, 
        save_old_nrows_file = False):
    
    #Для подсчёта времени и количества дней между датами
    import datetime
    
    import pandas as pd
    import numpy as np
    
    
    #Добавление нового столбца с индексами
    new_col_names = list(col_names)
    ind_goal = new_col_names.index(goal_col_name)
    new_col_name = goal_col_name + '_ind'
    new_col_names.insert(ind_goal+1, new_col_name)
    
    if (print_info):
        print('\t\t\tStart creating newfile')
        
    start_time = datetime.datetime.now()    
        
    chunk_num = 0
    row_num = 0
    seconds_chunk = []
    
    st_data = datetime.datetime.strptime(start_data, data_form)
    
    #Второе чтение файла, изменение целевого столбца
    for chunk in pd.read_csv(oldfname, chunksize = chunksize_indexing,
                             encoding = encoding, engine='c',
                             skiprows = skiprows, 
                             header = header, nrows = nrows, 
                             sep = sep, names = col_names):
        #Отброс всех строк, которые = nan
        #chunk = (chunk.dropna(subset=[goal_col_name])).reset_index(drop=True)
        
        start_time_chunk = datetime.datetime.now()
        
        #Замена всех строк, которые = nan
        chunk[goal_col_name].replace(np.nan, 'empty', regex=True, inplace=True) 
        #chunk[goal_col_name].replace('0', '2020-01-01 00:00:00', regex=True, inplace=True) 
        #chunk[goal_col_name].replace('2', 'empty', regex=True, inplace=True) 
        
        #Сохранение nrows строчек старого файла
        if (save_old_nrows_file):
            if (chunk_num == 0):
                chunk.to_csv(old_nrows_file, encoding = encoding,
                  mode = 'w+', sep=sep)
            else:
                chunk.to_csv(old_nrows_file, encoding = encoding,
                    mode = 'a+', header = False, sep=sep) 
            
        #Изменение порядка столбцов
        chunk = chunk.loc[:,new_col_names]    
        
        if (print_info):
            print('Chunk number ', chunk_num)
            print('\tStart indexing')
            
        start_time_ind = datetime.datetime.now()
        
        #Сопоставление значения количеству дней
        indices = []
        for x in range(chunk.shape[0]):
            
            #print(row_num, '\n', chunk.iloc[row_num])
            try:
                cur_data = datetime.datetime.strptime(chunk[goal_col_name][row_num], data_form)
            except ValueError:
                cur_data = st_data - datetime.timedelta(days=1000)
            
            ind = (cur_data - st_data).days
            indices.append(int(ind))
            
            row_num += 1
            
        end_time_ind = datetime.datetime.now()    
            
        if (print_info):
            print('\tTime for indexing: ', str(end_time_ind - start_time_ind))
            print('\tStart deleting old_col and writing to newfile')
            
        #Запись вычисленных индексов в новый целевой столбец
        chunk[new_col_name] = indices
        #Удаление устаревшего целевого столбца
        chunk.drop(goal_col_name, axis=1, inplace=True)
        
        #Запись обновленного чанка в новый файл
        if (chunk_num == 0):
            chunk.to_csv(newfname, encoding = encoding,
                mode = 'w+', index = False, sep=sep)
        else:
            chunk.to_csv(newfname, encoding = encoding,
                mode = 'a+', index = False, header = False, sep=sep)  
        
        end_time_chunk = datetime.datetime.now()
        
        if (print_info):
            tot_seconds = (end_time_chunk - start_time_chunk).total_seconds()
            seconds_chunk.append(tot_seconds)
            tot_seconds = sum(seconds_chunk) / len(seconds_chunk)
            tot_seconds_to_end = int(tot_seconds*(total_rows-row_num)/chunksize_indexing)
            seconds_to_end = tot_seconds_to_end % 60
            minutes_to_end = tot_seconds_to_end // 60
            
            print('\t\tEstimated minutes to end: ', minutes_to_end)
            print('\t\t\tseconds to end: ', seconds_to_end)
            
            print('\trows written = ', row_num, ';')
        
        chunk_num += 1
     
    end_time = datetime.datetime.now()
    if (print_info):
        print('Time: ', str(end_time - start_time))
        print('END OF WRITE')

In [2]:
sales_fname = 'sales_train_v2.csv'
sales_fname_data = 'sales_train_data.csv'
sales_column_names = ['Date', 'DateBlockNum', 'ShopId', 'ItemId', 'ItemPrice', 'ItemCntDay']

In [3]:
replace_col_with_data_to_indices(sales_fname, sales_fname_data, sales_column_names, 'Date', chunksize_indexing = 200000)

			Start creating newfile


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Chunk number  0
	Start indexing
	Time for indexing:  0:00:08.845510
	Start deleting old_col and writing to newfile
		Estimated minutes to end:  2
			seconds to end:  27
	rows written =  200000 ;
Chunk number  1
	Start indexing
	Time for indexing:  0:00:10.131713
	Start deleting old_col and writing to newfile
		Estimated minutes to end:  2
			seconds to end:  25
	rows written =  400000 ;
Chunk number  2
	Start indexing
	Time for indexing:  0:00:09.877872
	Start deleting old_col and writing to newfile
		Estimated minutes to end:  2
			seconds to end:  16
	rows written =  600000 ;
Chunk number  3
	Start indexing
	Time for indexing:  0:00:08.337823
	Start deleting old_col and writing to newfile
		Estimated minutes to end:  2
			seconds to end:  0
	rows written =  800000 ;
Chunk number  4
	Start indexing
	Time for indexing:  0:00:09.702979
	Start deleting old_col and writing to newfile
		Estimated minutes to end:  1
			seconds to end:  50
	rows written =  1000000 ;
Chunk number  5
	Start in