In [1]:
import pandas as pd
import numpy as np

## Series

Series — это объект, похожий на одномерный массив, отличительной чертой которого является наличие индексов. При выводе индекс находится слева, а сам элемент — справа.

Синтаксис создания:

pandas.Series(input_data, index, data_type)

input_data: входные данные в виде списка, константы, массива NumPy, словаря (dict) и т. д.

index (опционально): значения индексов; по умолчанию 0, 1, 2, ...

data_type (опционально): тип данных.

In [5]:
# Build a Series with explicit string labels; the number of labels must match
# the number of values. If `index` is omitted, pandas falls back to the
# default integer index 0, 1, 2, ...
a = pd.Series([4, 7, 6, 3, 9],
              index=['one', 'two', 'three', 'four', 'five'])
a

one      4
two      7
three    6
four     3
five     9
dtype: int64

In [3]:
# Without an explicit index pandas assigns the default integer index 0..n-1.
# A separate name is used on purpose: rebinding `a` here would replace the
# labelled Series created above, and the later cells (`a.index`, `a['one']`)
# would then fail or show different output when the notebook is run top to bottom.
a_default = pd.Series([4, 7, 6, 3, 9])
a_default

0    4
1    7
2    6
3    3
4    9
dtype: int64

In [6]:
a.index  # the index labels of the Series

Index(['one', 'two', 'three', 'four', 'five'], dtype='object')

In [7]:
a.values  # the stored values, returned as a NumPy array

array([4, 7, 6, 3, 9])

In [9]:
# Element access: .iloc selects by position, [] with a label selects by index label.
# Plain a[0] on a string-labelled Series relies on a deprecated positional
# fallback of Series.__getitem__, so the positional lookups use .iloc explicitly.
a.iloc[0], a.iloc[1], a['one']

(4, 7, 4)

## DataFrame

Объект DataFrame является табличной структурой данных. В любой таблице всегда присутствуют строки и столбцы. При этом в столбцах можно хранить данные разных типов данных. Столбцами в объекте DataFrame выступают объекты Series, строки которых являются их элементами.

Синтаксис создания:

pandas.DataFrame(input_data, index)

input_data: входные данные в виде словаря (dict), двумерного массива NumPy, Series и т. д.

index: значения индексов.


In [10]:
# Build a DataFrame from a dict: each key becomes a column name and each list
# supplies that column's values. No index is given, so rows are labelled 0..4.
df = pd.DataFrame({
    'Age': [46, 37, 44, 42, 42],
    'Country': ['Spain', 'Spain', 'Germany', 'Germany', 'France'],
    'Gender': ['Female', 'Female', 'Male', 'Male', 'Male']
})

df

Unnamed: 0,Age,Country,Gender
0,46,Spain,Female
1,37,Spain,Female
2,44,Germany,Male
3,42,Germany,Male
4,42,France,Male


In [11]:
df['Age']  # selecting a single column by name returns a Series; a DataFrame is a collection of Series

0    46
1    37
2    44
3    42
4    42
Name: Age, dtype: int64

In [12]:
df.Country  # a column can also be read with attribute-style access

0      Spain
1      Spain
2    Germany
3    Germany
4     France
Name: Country, dtype: object

In [13]:
df[['Country', 'Age']]  # a list of column names returns a DataFrame (columns in the requested order)


Unnamed: 0,Country,Age
0,Spain,46
1,Spain,37
2,Germany,44
3,Germany,42
4,France,42


In [14]:
df.columns  # the column labels

Index(['Age', 'Country', 'Gender'], dtype='object')

In [15]:
df.index  # the row labels (the default RangeIndex here)

RangeIndex(start=0, stop=5, step=1)

In [16]:
# Same data as before, but with custom row labels passed through `index`.
df = pd.DataFrame({
    'Age': [46, 37, 44, 42, 42],
    'Country': ['Spain', 'Spain', 'Germany', 'Germany', 'France'],
    'Gender': ['Female', 'Female', 'Male', 'Male', 'Male']
}, index=[5, 4, 6, 3, 2])

df

Unnamed: 0,Age,Country,Gender
5,46,Spain,Female
4,37,Spain,Female
6,44,Germany,Male
3,42,Germany,Male
2,42,France,Male


In [17]:
# The row labels of an existing frame can be replaced by assigning a new
# sequence to .index (one label per row).
df.index = [101, 102, 103, 104, 105]
df

Unnamed: 0,Age,Country,Gender
101,46,Spain,Female
102,37,Spain,Female
103,44,Germany,Male
104,42,Germany,Male
105,42,France,Male


## Считывание данных

В целом, pandas поддерживает все самые популярные форматы хранения данных: csv, excel, sql, html и многое другое, но чаще всего приходится работать именно с csv файлами (comma separated values).

Будем работать с датасетом по оттоку клиентов из банка https://www.kaggle.com/datasets/shubh0799/churn-modelling.

Характеристики каждого клиента:

RowNumber - Номер строки
CustomerId - Уникальный идентификатор клиента
Surname - Фамилия клиента
CreditScore - Кредитная оценка клиента
Geography - Из какой страны клиент
Gender - Пол клиента
Age - Возраст клиента
Tenure - Сколько лет человек является клиентом банка
Balance - Баланс счета
NumOfProducts - Количество открытых продуктов
HasCrCard - Есть ли у клиента кредитная карта
IsActiveMember - Является ли клиент активным участником
EstimatedSalary - Предположительная зарплата клиента
Exited - Уйдет ли человек в отток

In [20]:
# Load the churn dataset from CSV; from here on `df` refers to this table.
# read_csv handles delimited text only; pandas provides separate readers
# (e.g. read_excel, read_sql) for the other formats mentioned above.
df = pd.read_csv('./Churn_Modelling.csv')
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [19]:
# header=1 tells pandas the column names are on row 1 (0-based) rather than row 0.
# This file's header is on row 0, so the first data record is wrongly used as
# the header, as the output shows.
pd.read_csv('./Churn_Modelling.csv', header=1)

Unnamed: 0,1,15634602,Hargrave,619,France,Female,42,2,0,1.1,1.2,1.3,101348.88,1.4
0,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
1,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
2,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
3,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
4,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9994,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9995,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9996,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9997,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [21]:
# sep sets the field delimiter. This file is comma-separated, so with sep=';'
# nothing is split and every row ends up in a single column.
pd.read_csv('./Churn_Modelling.csv', sep=';')

Unnamed: 0,"RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited"
0,"1,15634602,Hargrave,619,France,Female,42,2,0,1..."
1,"2,15647311,Hill,608,Spain,Female,41,1,83807.86..."
2,"3,15619304,Onio,502,France,Female,42,8,159660...."
3,"4,15701354,Boni,699,France,Female,39,1,0,2,0,0..."
4,"5,15737888,Mitchell,850,Spain,Female,43,2,1255..."
...,...
9995,"9996,15606229,Obijiaku,771,France,Male,39,5,0,..."
9996,"9997,15569892,Johnstone,516,France,Male,35,10,..."
9997,"9998,15584532,Liu,709,France,Female,36,7,0,1,0..."
9998,"9999,15682355,Sabbatini,772,Germany,Male,42,3,..."


In [22]:
# With the correct delimiter the file is split into its 14 columns again
# (sep=',' is also what read_csv uses when sep is not given).
pd.read_csv('./Churn_Modelling.csv', sep=',')

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [23]:
df  # the frame loaded earlier is unaffected by the read_csv experiments above

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [24]:
df.head()  # first five rows

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [25]:
df.head(2)  # first two rows

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0


In [26]:
df.tail()  # same idea as head(), but counted from the end: the last five rows

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.0,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.0,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1
9999,10000,15628319,Walker,792,France,Female,28,4,130142.79,1,1,0,38190.78,0


In [27]:
# One randomly chosen row. random_state pins the draw so that re-running the
# notebook reproduces the same output; change or drop it to get a different row.
df.sample(random_state=42)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
6735,6736,15717328,Hsueh,842,France,Female,37,4,132446.08,2,1,0,87071.18,1


In [29]:
# frac=1 samples 100% of the rows, i.e. returns the whole table in shuffled order.
# random_state pins the shuffle so that re-running the notebook reproduces it.
df.sample(frac=1, random_state=42)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
3633,3634,15629846,Sheehan,827,Germany,Female,47,8,143001.50,2,1,0,108977.50,0
4959,4960,15784361,Williamson,543,Spain,Female,46,5,140355.60,1,1,1,85086.78,0
9673,9674,15784148,Beneventi,643,France,Male,62,9,0.00,2,0,0,155870.82,0
5580,5581,15786249,Whitfield,616,Spain,Male,30,2,0.00,2,1,0,199099.51,0
6416,6417,15789379,Zetticci,762,France,Male,26,6,130428.78,1,1,0,173365.89,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2229,2230,15756125,Booth,757,Spain,Male,44,5,140856.16,2,1,0,158735.10,0
8763,8764,15665159,Brooks,727,France,Male,61,0,128213.96,2,1,1,188729.08,1
4216,4217,15718852,Uren,794,France,Male,56,9,96951.21,1,1,1,71776.76,0
501,502,15715941,Lueck,692,France,Male,54,5,0.00,2,1,1,88721.84,0


In [30]:
# frac=0.5 returns a random half of the table (5000 of the 10000 rows) in
# shuffled order. random_state pins the draw so that re-running reproduces it.
df.sample(frac=0.5, random_state=42)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
4338,4339,15798439,Davidson,714,Spain,Male,25,2,0.00,1,1,1,132979.43,0
324,325,15682757,Pardey,734,France,Male,30,3,0.00,2,1,0,107640.25,0
3040,3041,15617134,Iqbal,716,France,Male,38,4,0.00,2,1,0,189678.70,0
9071,9072,15610643,De Luca,435,Germany,Male,44,3,151739.65,1,1,0,167461.50,0
78,79,15575185,Bushell,757,Spain,Male,33,5,77253.22,1,0,1,194239.63,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2027,2028,15606613,Samson,655,France,Female,59,7,0.00,1,1,0,88958.49,1
162,163,15630910,Treacy,800,France,Female,49,7,108007.36,1,0,0,47125.11,0
9034,9035,15772337,Lawrence,723,Germany,Female,49,0,153855.52,1,1,1,180862.26,1
3183,3184,15734161,Nnonso,636,France,Male,43,6,0.00,2,1,0,43128.95,0


In [31]:
df.shape  # tuple: (number of rows, number of columns)

(10000, 14)

## Первичный анализ данных

Типы данных:

int: целочисленные значения. Пример: 9, 56, 30

float: вещественные значения (с плавающей точкой). Пример: 7.3, 9.0, 45.334

object/str: строковые значения. Пример: ‘hello, world’, ‘50 000’

In [32]:
df.info()  # every column reports 10000 non-null values, i.e. there are no missing values


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


### Метод `describe()` выводит для числовых признаков:

count - количество непропущенных объектов (там, где нет NaN-значений)

mean - арифметическое среднее

std - стандартное отклонение

min - минимальное значение

25% - квантиль 25 процентов

50% - квантиль 50 процентов или же медиана

75% - квантиль 75 процентов

max - максимальное значение

In [33]:
df.describe()  # summary statistics for the numeric columns

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [34]:
df['Age'].min()  # minimum age

18

In [35]:
df['Balance'].max()  # largest account balance

250898.09

In [36]:
df[['CreditScore', 'Age', 'Tenure']].mean()  # column-wise mean of several features at once


CreditScore    650.5288
Age             38.9218
Tenure           5.0128
dtype: float64

Для строковых признаков `describe(include=['object'])` возвращает 4 значения:

count - количество непропущенных объектов

unique - количество уникальных значений

top - самое частотное значение (мода)

freq - частота появления самого частотного значения

In [37]:
df.describe(include=['object'])  # include=['object'] summarises the string (object-dtype) columns instead

Unnamed: 0,Surname,Geography,Gender
count,10000,10000,10000
unique,2932,3,2
top,Smith,France,Male
freq,32,5014,5457


In [38]:
df.dtypes  # dtype of every column

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [39]:
df['Age'].dtype  # dtype of a single column

dtype('int64')

In [40]:
# astype returns a converted Series (int -> bool here); the column stored in
# df is left unchanged unless the result is assigned back.
df['HasCrCard'].astype('bool')

0        True
1       False
2        True
3       False
4        True
        ...  
9995     True
9996     True
9997    False
9998     True
9999     True
Name: HasCrCard, Length: 10000, dtype: bool

In [41]:
df['HasCrCard'].dtype  # still int64: the astype result above was not stored

dtype('int64')

In [42]:
# Assigning the converted Series back to the column persists the int -> bool conversion in df.
df['HasCrCard'] = df['HasCrCard'].astype('bool')

In [43]:
df['HasCrCard'].dtype  # now bool, after the assignment above

dtype('bool')

In [44]:
df['Geography'].unique()  # the distinct values in the column

array(['France', 'Spain', 'Germany'], dtype=object)

In [45]:
df['Geography'].nunique()  # how many distinct values the column has

3

In [46]:
df['Geography'].value_counts()  # how many times each distinct value occurs

France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64

In [None]:
df['Geography'].value_counts(normalize=True)  # normalize=True returns each value's share of the rows instead of a count

### Фильтрация

Фильтрация в pandas основывается на булевых масках.

### Булева маска

Булева маска — бинарные данные (True/False), которые используются для выбора определенных объектов из структуры данных.

In [47]:
df['Gender'] == 'Male'  # element-wise comparison gives a boolean mask: True where the condition holds

0       False
1       False
2       False
3       False
4       False
        ...  
9995     True
9996     True
9997    False
9998     True
9999    False
Name: Gender, Length: 10000, dtype: bool

In [48]:
# Indexing the frame with a boolean mask keeps only the rows where the mask is True.
male = df[df['Gender'] == 'Male']
male

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,True,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.00,2,True,1,10062.80,0
8,9,15792365,He,501,France,Male,44,4,142051.07,2,False,1,74940.50,0
9,10,15592389,H?,684,France,Male,27,2,134603.88,1,True,1,71725.73,0
10,11,15767821,Bearce,528,France,Male,31,6,102016.72,2,False,0,80181.12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9992,9993,15657105,Chukwualuka,726,Spain,Male,36,2,0.00,1,True,0,195192.40,0
9993,9994,15569266,Rahman,644,France,Male,28,7,155060.41,1,True,0,29179.52,0
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,True,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,True,1,101699.77,0


### Логическое И

При операторе & нужно, чтобы оба условия выполнялись одновременно:

In [50]:

# Each condition is parenthesised because & binds more tightly than == and >=.
df[(df['Gender'] == 'Female') & (df['NumOfProducts'] >= 3)]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,True,0,113931.57,1
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,True,0,119346.88,1
30,31,15589475,Azikiwe,591,Spain,Female,39,3,0.00,3,True,0,140469.38,1
88,89,15622897,Sharpe,646,France,Female,46,4,0.00,3,True,0,93251.42,1
90,91,15757535,Heap,647,Spain,Female,44,5,0.00,3,True,1,174205.22,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9565,9566,15752294,Long,582,France,Female,38,9,135979.01,4,True,1,76582.95,1
9747,9748,15775761,Iweobiegbunam,610,Germany,Female,69,5,86038.21,3,False,0,192743.06,1
9800,9801,15640507,Li,762,Spain,Female,35,3,119349.69,3,True,1,47114.18,1
9877,9878,15572182,Onwuamaeze,505,Germany,Female,33,3,106506.77,3,True,0,45445.78,1


### Логическое ИЛИ

При операторе | нужно, чтобы выполнялось хотя бы одно условие:

In [51]:
# HasCrCard was converted to bool earlier in the notebook, so the column can be used directly as a mask.
df[(df['HasCrCard']) | (df['NumOfProducts'] >= 3)]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,True,1,101348.88,1
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,True,0,113931.57,1
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,True,1,79084.10,0
5,6,15574012,Chu,645,Spain,Male,44,8,113755.78,2,True,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7,0.00,2,True,1,10062.80,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9993,9994,15569266,Rahman,644,France,Male,28,7,155060.41,1,True,0,29179.52,0
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,True,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,True,1,101699.77,0
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,True,0,92888.52,1


### Логическое НЕ
При операторе ~ булева маска инвертируется: True меняется на False и наоборот:

In [52]:
df[~(df['Geography'] == 'Spain')]  # ~ inverts the mask: every row except those where Geography is 'Spain'

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,True,1,101348.88,1
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,True,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,False,0,93826.63,0
6,7,15592531,Bartlett,822,France,Male,50,7,0.00,2,True,1,10062.80,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,True,0,119346.88,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,True,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,True,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,False,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,True,0,92888.52,1


In [53]:
# isin keeps the rows whose value is in the given list. Geography only takes
# the values France, Spain and Germany, so this selects the same rows as the
# "not Spain" filter above.
df[df['Geography'].isin(['France', 'Germany'])]

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,True,1,101348.88,1
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,True,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,False,0,93826.63,0
6,7,15592531,Bartlett,822,France,Male,50,7,0.00,2,True,1,10062.80,0
7,8,15656148,Obinna,376,Germany,Female,29,4,115046.74,4,True,0,119346.88,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,True,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,True,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,False,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,True,0,92888.52,1


## Индексация

In [55]:
# Rows where Geography is 'Spain', restricted to three columns; the row mask
# and the column list are applied together in a single .loc selection.
spain_mask = df['Geography'] == 'Spain'
df_small = df.loc[spain_mask, ['Geography', 'Gender', 'Age']]
df_small.head()  # first five rows

Unnamed: 0,Geography,Gender,Age
1,Spain,Female,41
4,Spain,Female,43
5,Spain,Male,44
11,Spain,Male,24
14,Spain,Female,35


In [56]:
df_small.loc[1]  # .loc selects by index label: the row labelled 1

Geography     Spain
Gender       Female
Age              41
Name: 1, dtype: object

In [57]:
# .loc looks rows up by index label, and df_small has no row labelled 3
# (its labels start 1, 4, 5, ...). The KeyError is the point of this cell, so
# it is caught and printed instead of being left to abort a top-to-bottom run.
try:
    df_small.loc[3]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 3

In [59]:
df_small.loc[[1, 4, 5], ['Gender', 'Age']]  # .loc with a list of row labels and a list of column names

Unnamed: 0,Gender,Age
1,Female,41
4,Female,43
5,Male,44


### iloc: доступ к элементам по порядковому номеру (позиции)

In [60]:
df_small.head()  # note the index labels (1, 4, 5, ...): they are labels carried over from df, not positions

Unnamed: 0,Geography,Gender,Age
1,Spain,Female,41
4,Spain,Female,43
5,Spain,Male,44
11,Spain,Male,24
14,Spain,Female,35


In [61]:
df_small.iloc[[0, 1, 2]]  # .iloc selects by position: the first three rows, whatever their labels

Unnamed: 0,Geography,Gender,Age
1,Spain,Female,41
4,Spain,Female,43
5,Spain,Male,44


In [62]:
# .iloc selects by position, and position 2500 is past the end of df_small.
# The IndexError is the point of this cell, so it is caught and printed
# instead of being left to abort a top-to-bottom run.
try:
    df_small.iloc[2500]
except IndexError as err:
    print(f"IndexError: {err}")

IndexError: single positional indexer is out-of-bounds

In [63]:
df_small.iloc[0, [0, 2]]  # row at position 0 (its label is 1), columns at positions 0 and 2

Geography    Spain
Age             41
Name: 1, dtype: object

In [64]:
df_small.head()  # df_small itself is unchanged by the selections above

Unnamed: 0,Geography,Gender,Age
1,Spain,Female,41
4,Spain,Female,43
5,Spain,Male,44
11,Spain,Male,24
14,Spain,Female,35


## Сортировки

In [65]:
df.sort_values('Age')  # sort rows by Age in ascending order (the default)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
3512,3513,15657779,Boylan,806,Spain,Male,18,3,0.00,2,True,1,86994.54,0
1678,1679,15569178,Kharlamov,570,France,Female,18,4,82767.42,1,True,0,71811.90,0
3517,3518,15757821,Burgess,771,Spain,Male,18,1,0.00,2,False,0,41542.95,0
9520,9521,15673180,Onyekaozulu,727,Germany,Female,18,2,93816.70,2,True,0,126172.11,0
2021,2022,15795519,Vasiliev,716,Germany,Female,18,3,128743.80,1,False,0,197322.13,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3387,3388,15798024,Lori,537,Germany,Male,84,8,92242.34,1,True,1,186235.98,0
3033,3034,15578006,Yao,787,France,Female,85,10,0.00,2,True,1,116537.96,0
2458,2459,15813303,Rearick,513,Spain,Male,88,10,0.00,2,True,1,52952.24,0
6759,6760,15660878,T'ien,705,France,Male,92,1,126076.24,2,True,1,34436.83,0


In [66]:
df.sort_values('Age', ascending=False)  # sort rows by Age in descending order

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
6443,6444,15764927,Rogova,753,France,Male,92,3,121513.31,1,False,1,195563.99,0
6759,6760,15660878,T'ien,705,France,Male,92,1,126076.24,2,True,1,34436.83,0
2458,2459,15813303,Rearick,513,Spain,Male,88,10,0.00,2,True,1,52952.24,0
3033,3034,15578006,Yao,787,France,Female,85,10,0.00,2,True,1,116537.96,0
3387,3388,15798024,Lori,537,Germany,Male,84,8,92242.34,1,True,1,186235.98,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9782,9783,15728829,Weigel,509,France,Male,18,7,102983.91,1,True,0,171770.58,0
2141,2142,15758372,Wallace,674,France,Male,18,7,0.00,2,True,1,55753.12,1
9501,9502,15634146,Hou,835,Germany,Male,18,2,142872.36,1,True,1,117632.63,0
9520,9521,15673180,Onyekaozulu,727,Germany,Female,18,2,93816.70,2,True,0,126172.11,0


In [67]:
df.sort_values(['Age', 'CreditScore'])  # sort by Age first, then by CreditScore among rows with equal Age

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9782,9783,15728829,Weigel,509,France,Male,18,7,102983.91,1,True,0,171770.58,0
1678,1679,15569178,Kharlamov,570,France,Female,18,4,82767.42,1,True,0,71811.90,0
9029,9030,15722701,Bruno,594,Germany,Male,18,1,132694.73,1,True,0,167689.56,0
7334,7335,15759133,Vaguine,616,France,Male,18,6,0.00,2,True,1,27308.58,0
9526,9527,15665521,Chiazagomekpele,642,Germany,Male,18,5,111183.53,2,False,1,10063.75,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3387,3388,15798024,Lori,537,Germany,Male,84,8,92242.34,1,True,1,186235.98,0
3033,3034,15578006,Yao,787,France,Female,85,10,0.00,2,True,1,116537.96,0
2458,2459,15813303,Rearick,513,Spain,Male,88,10,0.00,2,True,1,52952.24,0
6759,6760,15660878,T'ien,705,France,Male,92,1,126076.24,2,True,1,34436.83,0
