# Reading data

In [2]:
import pandas as pd

In [3]:
df_test = pd.read_csv('test.csv', index_col=0)
df_train = pd.read_csv('train.csv', index_col=0)
df = pd.concat([df_train, df_test])

### Сделаем столбцы 'Survived', 'Pclass', 'Sex' категориями

In [4]:
columns_to_category = ['Survived', 'Pclass', 'Sex' ]
for col in columns_to_category:
    df[col] = df[col].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Survived  891 non-null    category
 1   Pclass    1309 non-null   category
 2   Name      1309 non-null   object  
 3   Sex       1309 non-null   category
 4   Age       1046 non-null   float64 
 5   SibSp     1309 non-null   int64   
 6   Parch     1309 non-null   int64   
 7   Ticket    1309 non-null   object  
 8   Fare      1308 non-null   float64 
 9   Cabin     295 non-null    object  
 10  Embarked  1307 non-null   object  
dtypes: category(3), float64(2), int64(2), object(4)
memory usage: 96.2+ KB


# 1. Анализ таблицы

a) Визуализировать базовую статистику таблицы. В каком классе было больше всего пассажиров?

In [5]:
df.describe()
df_name_groupby_class = df.groupby('Pclass').count()['Name']
print(df_name_groupby_class[df_name_groupby_class == df_name_groupby_class.max()])

Pclass
3    709
Name: Name, dtype: int64


b) Группировать таблицу в два уровня: класс и пол, по среднему значению возраста. Кто из возможных комбинаций самый юный, кто самый взрослый? Насколько отличаются эти значения?

In [6]:
age_group_mean = df.groupby(['Pclass', 'Sex'])['Age'].mean()
min_category_age = age_group_mean.min()
max_category_age = age_group_mean.max()
print("Min age with category")
print(age_group_mean[age_group_mean == min_category_age])


print("\n\nMax age with category")
print(age_group_mean[age_group_mean == max_category_age])

print(f"\n\nBetween category difference: {max_category_age-min_category_age}")

Min age with category
Pclass  Sex   
3       female    22.185329
Name: Age, dtype: float64


Max age with category
Pclass  Sex 
1       male    41.029272
Name: Age, dtype: float64


Between category difference: 18.843942575810384


c) Отобрать только выживших пассажиров с фамилией, начинающейся на “K”. Отсортировать их по убыванию стоимости билета. Кто заплатил больше всех? Кто меньше всех?

In [7]:
df['Name'] = df['Name'].astype(str)
df_k = df[df['Name'].str.startswith('K')]
df_res = df_k[df_k['Survived'] == 1]
df_sorted = df_res.sort_values(by=['Fare'], ascending=False)
print(df_sorted)
print(f"More paid:\n{df_sorted.head(1)}")
print(f"The least paid:\n{df_sorted.tail(1)}")

            Survived Pclass                                      Name     Sex  \
PassengerId                                                                     
622              1.0      1              Kimball, Mr. Edwin Nelson Jr    male   
458              1.0      1         Kenyon, Mrs. Frederick R (Marion)  female   
317              1.0      2       Kantor, Mrs. Sinai (Miriam Sternin)  female   
185              1.0      3       Kink-Heilmann, Miss. Luise Gretchen  female   
707              1.0      2             Kelly, Mrs. Florence "Fannie"  female   
692              1.0      3                        Karun, Miss. Manca  female   
304              1.0      2                       Keane, Miss. Nora A  female   
301              1.0      3  Kelly, Miss. Anna Katherine "Annie Kate"  female   
574              1.0      3                         Kelly, Miss. Mary  female   

              Age  SibSp  Parch  Ticket     Fare Cabin Embarked  
PassengerId                               

d) Какое максимальное количество родных было с выжившим пассажиром?

In [8]:
pt = df.copy(deep=True)
pt['Relatives'] = pt[['Parch', 'SibSp']].sum(axis=1)
pt.drop(columns=['Parch', 'SibSp'], inplace=True)
pk = pt.groupby(['Survived', 'Relatives'])['Relatives'].max()
res = pk.groupby('Survived').max()
res[1] # У выжившего значение - 1 ### P.S. Может по-дурацки, но нужно считать всех родственников

6.0

e) Посчитайте среднюю стоимость билета пассажиров, для которых указана каюта (Cabin) и для тех, у кого она не указана, во сколько раз они отличаются?

In [9]:
def_fare = df[df['Cabin'].notnull()]['Fare'].mean()
undef_fare = df[df['Cabin'].isnull()]['Fare'].mean()
print(f'Defined cabin mean fare is {def_fare},\nwhile for Undefined cabin mean fare is {undef_fare}.')
print(f'For defined cabin fare was {def_fare/undef_fare} times higher')

Defined cabin mean fare is 81.92899830508475,
while for Undefined cabin mean fare is 19.132707206317864.
For defined cabin fare was 4.282143526350037 times higher


# 2. Visualization

Импортируем библиотеку для визуализации https://plotly.com/python/

In [10]:
import plotly.express as px

a) Scatter plot

In [11]:
fig = px.scatter(df, x='Age', y='Fare')
fig.show()

Unsupported

b) Linear plot (with several lines)

In [12]:
df_male = df.copy(deep=True)[df.Sex == "male"]
df_female = df.copy(deep=True)[df.Sex == "female"]
df_male = df_male.groupby("Age").mean().assign(Sex="male")
df_female = df_female.groupby("Age").mean().assign(Sex="female")
df_plot = pd.concat([df_male, df_female])
fig = px.line(df_plot, y="Fare",color = "Sex")
fig.show()

Unsupported

c) Histogram

In [13]:
fig = px.histogram(df, x='Age')
fig.show()

Unsupported

d) Bar chart

In [14]:
fig = px.bar(df, x='Embarked', color="Sex", title="Distribution between man and woman boarded in different ports", barmode="group")
fig.show()

Unsupported

e) Horizontal bar chart

In [15]:
fig = px.bar(df, x='Fare', y='Pclass', orientation='h')
fig.show()

Unsupported

f) Pie chart

In [71]:
fig = px.pie(df, values='Age', names='Sex')
fig.show()

Unsupported

g) Box chart

In [72]:
fig = px.box(df, x='Pclass', y='Age', points="all")
fig.show()

Unsupported

h) Sunburst chart

In [73]:
fig = px.sunburst(df, path=['Sex', 'Pclass', 'SibSp'], values='Fare')
fig.show()

Unsupported

I) Violin

In [74]:
fig = px.violin(df, y='Fare')
fig.show()

Unsupported

J) 3D Scatter Plot

In [75]:
fig = px.scatter_3d(df, x='Age', y='Pclass', z='Parch',
              color='Fare')
fig.show()

Unsupported