In [2]:
import pandas as pd
#readme 
def check_columns(data, controlcolumns, ignore_extra = True):
    '''
    Signature:
        check_columns(
        data: 'DataFrame',
        controlcolumns: 'list-like'
        ignore_extra = True
        ) -> 'bool'
    Docstring:
        Проверяет соответствие столбцов в обрабатываемом файле, 
        по умолчанию игнорирует избыточные столбцы
    Parametrs
    ---------
        data: DataFrame
        controlcolumns: 'list-like'
            должны содержаться эти столбцы
        ignore_extra: bool, default True
            игнорирует избыточные столбцы
    Returns
    -------
        True|False
    '''
    set_data_columns = set(data.columns)
    set_columns = set (controlcolumns)
    if ignore_extra:
        return set_columns.issubset(set_data_columns)
    else:
        return set_columns == set_data_columns
    

In [19]:
def firstclear(df):
    '''
    Signature:
        first_clear(
        df: 'DataFrame',
        ) -> 'DataFrame'
    Docstring:
        Удаляет неинформативные данные с пустой ценой с df, возвращает удаленные в виде 
        DataFrame
    Parametrs
    ---------
        df: DataFrame
    
    Returns
    -------
        DataFrame
    '''
    deleted = df[df['price'].isna()]
    df.dropna(subset=['price'], inplace=True)
    return deleted

In [15]:
data = pd.read_excel('data/_joined-2022-09-09-10-18-Copy1.xlsx')

In [18]:
len(data)

35647

In [17]:
len(firstclear(data))

8

In [20]:
import unittest
import pandas as pd

class Test_checkcolumns(unittest.TestCase):
    def test_firstclear(self):
        data = pd.read_excel('data/_joined-2022-09-09-10-18-Copy1.xlsx')
        deleted = firstclear(data)
        self.assertEqual((len(data),len(deleted)), (35647,8))
    
    
    def test_len(self):
        data = pd.read_excel('data/_joined-2022-09-09-10-18-Copy1.xlsx')
        self.assertEqual(len(data), 35655)
    
    def test_checkcolumns_v2(self):
        data = pd.read_excel('data/_joined-2022-09-09-10-18-Copy1.xlsx').head()
        controlcolumns = {'art', 'd0', 'd1', 'filname', 'gems', 'gems2', 'gold',
 'gold2', 'h1', 'price', 'price2', 'source', 'url', 'weight'}
        self.assertEqual(check_columns(data, controlcolumns), True)
        
    def test_checkcolumns_v1(self):
        data = pd.read_excel('data/_joined-2022-09-09-10-18-Copy1.xlsx').head()
        controlcolumns = {'art', 'extracol', 'd0', 'd1', 'filname', 'gems', 'gems2', 'gold',
 'gold2', 'h1', 'price', 'price2', 'source', 'url', 'weight'}
        self.assertEqual(check_columns(data, controlcolumns), False)
        
    def test_checkcolumns_v3(self):
        data = pd.read_excel('data/_joined-2022-09-09-10-18-Copy1.xlsx').head()
        controlcolumns = {'art', 'd0', 'd1', 'filname', 'gems', 'gems2', 'gold',
 'gold2', 'h1', 'price', 'price2', 'source', 'url', 'weight'}
        self.assertEqual(check_columns(data, controlcolumns, ignore_extra=False), False)




res = unittest.main(argv=[''], verbosity=3, exit=False)

# if we want our notebook to stop processing due to failures, we need a cell itself to fail
assert len(res.result.failures) == 0


test_checkcolumns_v1 (__main__.Test_checkcolumns) ... ok
test_checkcolumns_v2 (__main__.Test_checkcolumns) ... ok
test_checkcolumns_v3 (__main__.Test_checkcolumns) ... ok
test_firstclear (__main__.Test_checkcolumns) ... ok
test_len (__main__.Test_checkcolumns) ... ok

----------------------------------------------------------------------
Ran 5 tests in 30.874s

OK


In [21]:
check_columns?